* Nick Piggin <[email protected]> wrote:
> I said exactly the same thing last time this came up. I would love to
> remove code if its functionality can be adequately replaced by
> existing code, but I think your reasons for removing SLOB aren't that
> good, and just handwaving away the significant memory savings doesn't
> work.
Yeah. Also, the decision here is pretty easy: the behavior of the
allocator is not really visible to applications. So this isn't like
having a parallel IO scheduler or a parallel process scheduler (which
cause problems for us by fragmenting the application space) - so the
long-term cost to us kernel maintainers should be relatively low.
> > A year ago the -rt kernel defaulted to SLOB for a few releases,
> > and barring some initial scalability issues (which were solved in
> > -rt) it worked pretty well on generic PCs, so I don't buy the 'it
> > doesn't work' argument either.
>
> It's actually recently been made to work on SMP, it is much more
> scalable to large memories, and some initial NUMA work is happening
> that some embedded guys are interested in, all without increasing
> static footprint too much, and it has actually decreased dynamic
> footprint too.
Cool. I was referring to something else: people were running -rt on
their beefy desktop boxes with several gigs of RAM, and they complained
about the slowdown caused by SLOB's linear list walking.
Steve Rostedt made two nice changes to fix those scalability problems;
I've attached his two patches below. With these in place SLOB was quite
usable on large systems as well, with no measurable overhead. (Obviously
the lack of per-CPU caching can still be measured, but it's much less of
a problem in practice than the linear list walking was.)
Ingo
This patch uses the mem_map pages (struct page) to find the bigblock
descriptor for large allocations, instead of walking a global list.
-- Steve
Signed-off-by: Steven Rostedt <[email protected]>
mm/slob.c | 74 ++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 41 insertions(+), 33 deletions(-)
Index: linux/mm/slob.c
===================================================================
--- linux.orig/mm/slob.c
+++ linux/mm/slob.c
@@ -49,15 +49,42 @@ typedef struct slob_block slob_t;
struct bigblock {
int order;
void *pages;
- struct bigblock *next;
};
typedef struct bigblock bigblock_t;
static slob_t arena = { .next = &arena, .units = 1 };
static slob_t *slobfree = &arena;
-static bigblock_t *bigblocks;
static DEFINE_SPINLOCK(slob_lock);
-static DEFINE_SPINLOCK(block_lock);
+
+#define __get_slob_block(b) ((unsigned long)(b) & ~(PAGE_SIZE-1))
+
+static inline struct page *get_slob_page(const void *mem)
+{
+ void *virt = (void*)__get_slob_block(mem);
+
+ return virt_to_page(virt);
+}
+
+static inline void zero_slob_block(const void *b)
+{
+ struct page *page;
+ page = get_slob_page(b);
+ memset(&page->lru, 0, sizeof(page->lru));
+}
+
+static inline void *get_slob_block(const void *b)
+{
+ struct page *page;
+ page = get_slob_page(b);
+ return page->lru.next;
+}
+
+static inline void set_slob_block(const void *b, void *data)
+{
+ struct page *page;
+ page = get_slob_page(b);
+ page->lru.next = data;
+}
static void slob_free(void *b, int size);
static void slob_timer_cbk(void);
@@ -109,6 +136,7 @@ static void *slob_alloc(size_t size, gfp
if (!cur)
return 0;
+ zero_slob_block(cur);
slob_free(cur, PAGE_SIZE);
spin_lock_irqsave(&slob_lock, flags);
cur = slobfree;
@@ -163,7 +191,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
{
slob_t *m;
bigblock_t *bb;
- unsigned long flags;
if (size < PAGE_SIZE - SLOB_UNIT) {
m = slob_alloc(size + SLOB_UNIT, gfp, 0);
@@ -178,10 +205,7 @@ void *__kmalloc(size_t size, gfp_t gfp)
bb->pages = (void *)__get_free_pages(gfp, bb->order);
if (bb->pages) {
- spin_lock_irqsave(&block_lock, flags);
- bb->next = bigblocks;
- bigblocks = bb;
- spin_unlock_irqrestore(&block_lock, flags);
+ set_slob_block(bb->pages, bb);
return bb->pages;
}
@@ -192,25 +216,16 @@ EXPORT_SYMBOL(__kmalloc);
void kfree(const void *block)
{
- bigblock_t *bb, **last = &bigblocks;
- unsigned long flags;
+ bigblock_t *bb;
if (!block)
return;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- /* might be on the big block list */
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
- if (bb->pages == block) {
- *last = bb->next;
- spin_unlock_irqrestore(&block_lock, flags);
- free_pages((unsigned long)block, bb->order);
- slob_free(bb, sizeof(bigblock_t));
- return;
- }
- }
- spin_unlock_irqrestore(&block_lock, flags);
+ bb = get_slob_block(block);
+ if (bb) {
+ free_pages((unsigned long)block, bb->order);
+ slob_free(bb, sizeof(bigblock_t));
+ return;
}
slob_free((slob_t *)block - 1, 0);
@@ -222,20 +237,13 @@ EXPORT_SYMBOL(kfree);
unsigned int ksize(const void *block)
{
bigblock_t *bb;
- unsigned long flags;
if (!block)
return 0;
- if (!((unsigned long)block & (PAGE_SIZE-1))) {
- spin_lock_irqsave(&block_lock, flags);
- for (bb = bigblocks; bb; bb = bb->next)
- if (bb->pages == block) {
- spin_unlock_irqrestore(&slob_lock, flags);
- return PAGE_SIZE << bb->order;
- }
- spin_unlock_irqrestore(&block_lock, flags);
- }
+ bb = get_slob_block(block);
+ if (bb)
+ return PAGE_SIZE << bb->order;
return ((slob_t *)block - 1)->units * SLOB_UNIT;
}
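To spell out the idea behind the patch above: instead of searching a
global 'bigblocks' list under block_lock, the descriptor is stashed in
the otherwise-unused page->lru field of the allocation's first page, so
kfree()/ksize() can recover it in O(1) via virt_to_page(). A hedged
sketch of that pattern, assuming a kernel build environment
(stash_desc()/lookup_desc() are illustration names, not the patch code):

#include <linux/mm.h>

/*
 * Sketch only: remember a per-allocation descriptor in the struct page
 * of the allocation's first page, the way the patch uses page->lru.next.
 */
static inline void stash_desc(void *pages, void *desc)
{
	struct page *page = virt_to_page(pages);

	page->lru.next = desc;		/* lru is unused for these pages */
}

static inline void *lookup_desc(const void *pages)
{
	struct page *page = virt_to_page(pages);

	return page->lru.next;		/* NULL means "not a big block" */
}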
---
mm/slob.c | 290 ++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 235 insertions(+), 55 deletions(-)
Index: linux/mm/slob.c
===================================================================
--- linux.orig/mm/slob.c
+++ linux/mm/slob.c
@@ -27,6 +27,20 @@
* are allocated by calling __get_free_pages. As SLAB objects know
* their size, no separate size bookkeeping is necessary and there is
* essentially no allocation space overhead.
+ *
+ * Modified by: Steven Rostedt <[email protected]> 12/20/05
+ *
+ * Now we take advantage of the kmem_cache usage. I've removed
+ * the global slobfree, and created one for every cache.
+ *
+ * For kmalloc/kfree I've reintroduced the usage of cache_sizes,
+ * but only for sizes 32 through PAGE_SIZE >> 1 by order of 2.
+ *
+ * Having the SLOB alloc per size of the cache should speed things up
+ * greatly, not only by making the search paths smaller, but also by
+ * keeping all the caches of similar units. This way the fragmentation
+ * should not be as big of a problem.
+ *
*/
#include <linux/slab.h>
@@ -36,6 +50,8 @@
#include <linux/module.h>
#include <linux/timer.h>
+#undef DEBUG_CACHE
+
struct slob_block {
int units;
struct slob_block *next;
@@ -52,17 +68,66 @@ struct bigblock {
};
typedef struct bigblock bigblock_t;
-static slob_t arena = { .next = &arena, .units = 1 };
-static slob_t *slobfree = &arena;
-static DEFINE_SPINLOCK(slob_lock);
+struct kmem_cache {
+ unsigned int size, align;
+ const char *name;
+ slob_t *slobfree;
+ slob_t arena;
+ spinlock_t lock;
+ void (*ctor)(void *, struct kmem_cache *, unsigned long);
+ void (*dtor)(void *, struct kmem_cache *, unsigned long);
+ atomic_t items;
+ unsigned int free;
+ struct list_head list;
+};
+
+#define NR_SLOB_CACHES ((PAGE_SHIFT) - 5) /* 32 to PAGE_SIZE-1 by order of 2 */
+#define MAX_SLOB_CACHE_SIZE (PAGE_SIZE >> 1)
+
+static struct kmem_cache *cache_sizes[NR_SLOB_CACHES];
+static struct kmem_cache *bb_cache;
-#define __get_slob_block(b) ((unsigned long)(b) & ~(PAGE_SIZE-1))
+static struct semaphore cache_chain_sem;
+static struct list_head cache_chain;
+#ifdef DEBUG_CACHE
+static void test_cache(kmem_cache_t *c)
+{
+ slob_t *cur = c->slobfree;
+ unsigned int x = -1 >> 2;
+
+ do {
+ BUG_ON(!cur->next);
+ cur = cur->next;
+ } while (cur != c->slobfree && --x);
+ BUG_ON(!x);
+}
+#else
+#define test_cache(x) do {} while(0)
+#endif
+
+/*
+ * Here we take advantage of the lru field of the pages that
+ * map to the pages we use in the SLOB. This is done similar
+ * to what is done with SLAB.
+ *
+ * The lru.next field is used to get the bigblock descriptor
+ * for large blocks larger than PAGE_SIZE >> 1.
+ *
+ * Set and retrieved by set_slob_block and get_slob_block
+ * respectively.
+ *
+ * The lru.prev field is used to find the cache descriptor
+ * for small blocks smaller than or equal to PAGE_SIZE >> 1.
+ *
+ * Set and retrieved by set_slob_ptr and get_slob_ptr
+ * respectively.
+ *
+ * The use of lru.next tells us in kmalloc that the page is large.
+ */
static inline struct page *get_slob_page(const void *mem)
{
- void *virt = (void*)__get_slob_block(mem);
-
- return virt_to_page(virt);
+ return virt_to_page(mem);
}
static inline void zero_slob_block(const void *b)
@@ -86,20 +151,39 @@ static inline void set_slob_block(const
page->lru.next = data;
}
-static void slob_free(void *b, int size);
-static void slob_timer_cbk(void);
+static inline void *get_slob_ptr(const void *b)
+{
+ struct page *page;
+ page = get_slob_page(b);
+ return page->lru.prev;
+}
+
+static inline void set_slob_ptr(const void *b, void *data)
+{
+ struct page *page;
+ page = get_slob_page(b);
+ page->lru.prev = data;
+}
+static void slob_free(kmem_cache_t *cachep, void *b, int size);
-static void *slob_alloc(size_t size, gfp_t gfp, int align)
+static void *slob_alloc(kmem_cache_t *cachep, gfp_t gfp, int align)
{
+ size_t size;
slob_t *prev, *cur, *aligned = 0;
- int delta = 0, units = SLOB_UNITS(size);
+ int delta = 0, units;
unsigned long flags;
- spin_lock_irqsave(&slob_lock, flags);
- prev = slobfree;
+ size = cachep->size;
+ units = SLOB_UNITS(size);
+ BUG_ON(!units);
+
+ spin_lock_irqsave(&cachep->lock, flags);
+ prev = cachep->slobfree;
for (cur = prev->next; ; prev = cur, cur = cur->next) {
if (align) {
+ while (align < SLOB_UNIT)
+ align <<= 1;
aligned = (slob_t *)ALIGN((unsigned long)cur, align);
delta = aligned - cur;
}
@@ -122,12 +206,16 @@ static void *slob_alloc(size_t size, gfp
cur->units = units;
}
- slobfree = prev;
- spin_unlock_irqrestore(&slob_lock, flags);
+ cachep->slobfree = prev;
+ test_cache(cachep);
+ if (prev < prev->next)
+ BUG_ON(cur + cur->units > prev->next);
+ spin_unlock_irqrestore(&cachep->lock, flags);
return cur;
}
- if (cur == slobfree) {
- spin_unlock_irqrestore(&slob_lock, flags);
+ if (cur == cachep->slobfree) {
+ test_cache(cachep);
+ spin_unlock_irqrestore(&cachep->lock, flags);
if (size == PAGE_SIZE) /* trying to shrink arena? */
return 0;
@@ -137,14 +225,15 @@ static void *slob_alloc(size_t size, gfp
return 0;
zero_slob_block(cur);
- slob_free(cur, PAGE_SIZE);
- spin_lock_irqsave(&slob_lock, flags);
- cur = slobfree;
+ set_slob_ptr(cur, cachep);
+ slob_free(cachep, cur, PAGE_SIZE);
+ spin_lock_irqsave(&cachep->lock, flags);
+ cur = cachep->slobfree;
}
}
}
-static void slob_free(void *block, int size)
+static void slob_free(kmem_cache_t *cachep, void *block, int size)
{
slob_t *cur, *b = (slob_t *)block;
unsigned long flags;
@@ -156,26 +245,29 @@ static void slob_free(void *block, int s
b->units = SLOB_UNITS(size);
/* Find reinsertion point */
- spin_lock_irqsave(&slob_lock, flags);
- for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
+ spin_lock_irqsave(&cachep->lock, flags);
+ for (cur = cachep->slobfree; !(b > cur && b < cur->next); cur = cur->next)
if (cur >= cur->next && (b > cur || b < cur->next))
break;
if (b + b->units == cur->next) {
b->units += cur->next->units;
b->next = cur->next->next;
+ BUG_ON(cur->next == &cachep->arena);
} else
b->next = cur->next;
if (cur + cur->units == b) {
cur->units += b->units;
cur->next = b->next;
+ BUG_ON(b == &cachep->arena);
} else
cur->next = b;
- slobfree = cur;
+ cachep->slobfree = cur;
- spin_unlock_irqrestore(&slob_lock, flags);
+ test_cache(cachep);
+ spin_unlock_irqrestore(&cachep->lock, flags);
}
static int FASTCALL(find_order(int size));
@@ -189,15 +281,24 @@ static int fastcall find_order(int size)
void *__kmalloc(size_t size, gfp_t gfp)
{
- slob_t *m;
bigblock_t *bb;
- if (size < PAGE_SIZE - SLOB_UNIT) {
- m = slob_alloc(size + SLOB_UNIT, gfp, 0);
- return m ? (void *)(m + 1) : 0;
+ /*
+ * If the size is less than PAGE_SIZE >> 1 then
+ * we use the generic caches. Otherwise, we
+ * just allocate the necessary pages.
+ */
+ if (size <= MAX_SLOB_CACHE_SIZE) {
+ int i;
+ int order;
+ for (i=0, order=32; i < NR_SLOB_CACHES; i++, order <<= 1)
+ if (size <= order)
+ break;
+ BUG_ON(i == NR_SLOB_CACHES);
+ return kmem_cache_alloc(cache_sizes[i], gfp);
}
- bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
+ bb = slob_alloc(bb_cache, gfp, 0);
if (!bb)
return 0;
@@ -209,26 +310,33 @@ void *__kmalloc(size_t size, gfp_t gfp)
return bb->pages;
}
- slob_free(bb, sizeof(bigblock_t));
+ slob_free(bb_cache, bb, sizeof(bigblock_t));
return 0;
}
EXPORT_SYMBOL(__kmalloc);
void kfree(const void *block)
{
+ kmem_cache_t *c;
bigblock_t *bb;
if (!block)
return;
+ /*
+ * look into the page of the allocated block to
+ * see if this is a big allocation or not.
+ */
bb = get_slob_block(block);
if (bb) {
free_pages((unsigned long)block, bb->order);
- slob_free(bb, sizeof(bigblock_t));
+ slob_free(bb_cache, bb, sizeof(bigblock_t));
return;
}
- slob_free((slob_t *)block - 1, 0);
+ c = get_slob_ptr(block);
+ kmem_cache_free(c, (void *)block);
+
return;
}
@@ -237,6 +345,7 @@ EXPORT_SYMBOL(kfree);
unsigned int ksize(const void *block)
{
bigblock_t *bb;
+ kmem_cache_t *c;
if (!block)
return 0;
@@ -245,14 +354,16 @@ unsigned int ksize(const void *block)
if (bb)
return PAGE_SIZE << bb->order;
- return ((slob_t *)block - 1)->units * SLOB_UNIT;
+ c = get_slob_ptr(block);
+ return c->size;
}
-struct kmem_cache {
- unsigned int size, align;
- const char *name;
- void (*ctor)(void *, struct kmem_cache *, unsigned long);
- void (*dtor)(void *, struct kmem_cache *, unsigned long);
+static slob_t cache_arena = { .next = &cache_arena, .units = 0 };
+struct kmem_cache cache_cache = {
+ .name = "cache",
+ .slobfree = &cache_cache.arena,
+ .arena = { .next = &cache_cache.arena, .units = 0 },
+ .lock = SPIN_LOCK_UNLOCKED
};
struct kmem_cache *kmem_cache_create(const char *name, size_t size,
@@ -261,8 +372,22 @@ struct kmem_cache *kmem_cache_create(con
void (*dtor)(void*, struct kmem_cache *, unsigned long))
{
struct kmem_cache *c;
+ void *p;
+
+ c = slob_alloc(&cache_cache, flags, 0);
+
+ memset(c, 0, sizeof(*c));
- c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
+ c->size = PAGE_SIZE;
+ c->arena.next = &c->arena;
+ c->arena.units = 0;
+ c->slobfree = &c->arena;
+ atomic_set(&c->items, 0);
+ spin_lock_init(&c->lock);
+
+ p = slob_alloc(c, 0, PAGE_SIZE-1);
+ if (p)
+ free_page((unsigned long)p);
if (c) {
c->name = name;
@@ -274,6 +399,9 @@ struct kmem_cache *kmem_cache_create(con
if (c->align < align)
c->align = align;
}
+ down(&cache_chain_sem);
+ list_add_tail(&c->list, &cache_chain);
+ up(&cache_chain_sem);
return c;
}
@@ -281,7 +409,17 @@ EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *c)
{
- slob_free(c, sizeof(struct kmem_cache));
+ down(&cache_chain_sem);
+ list_del(&c->list);
+ up(&cache_chain_sem);
+
+ BUG_ON(atomic_read(&c->items));
+
+ /*
+ * WARNING!!! Memory leak!
+ */
+ printk("FIX ME: need to free memory\n");
+ slob_free(&cache_cache, c, sizeof(struct kmem_cache));
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -289,11 +427,16 @@ void *kmem_cache_alloc(struct kmem_cache
{
void *b;
- if (c->size < PAGE_SIZE)
- b = slob_alloc(c->size, flags, c->align);
+ atomic_inc(&c->items);
+
+ if (c->size <= MAX_SLOB_CACHE_SIZE)
+ b = slob_alloc(c, flags, c->align);
else
b = (void *)__get_free_pages(flags, find_order(c->size));
+ if (!b)
+ return 0;
+
if (c->ctor)
c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
@@ -313,11 +456,13 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
void kmem_cache_free(struct kmem_cache *c, void *b)
{
+ atomic_dec(&c->items);
+
if (c->dtor)
c->dtor(b, c, 0);
- if (c->size < PAGE_SIZE)
- slob_free(b, c->size);
+ if (c->size <= MAX_SLOB_CACHE_SIZE)
+ slob_free(c, b, c->size);
else
free_pages((unsigned long)b, find_order(c->size));
}
@@ -335,9 +480,6 @@ const char *kmem_cache_name(struct kmem_
}
EXPORT_SYMBOL(kmem_cache_name);
-static struct timer_list slob_timer = TIMER_INITIALIZER(
- (void (*)(unsigned long))slob_timer_cbk, 0, 0);
-
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
@@ -349,17 +491,55 @@ int kmem_ptr_validate(struct kmem_cache
return 0;
}
-void __init kmem_cache_init(void)
+static char cache_names[NR_SLOB_CACHES][15];
+
+void kmem_cache_init(void)
{
- slob_timer_cbk();
+ static int done;
+ void *p;
+
+ if (!done) {
+ int i;
+ int size = 32;
+ done = 1;
+
+ init_MUTEX(&cache_chain_sem);
+ INIT_LIST_HEAD(&cache_chain);
+
+ cache_cache.size = PAGE_SIZE;
+ p = slob_alloc(&cache_cache, 0, PAGE_SIZE-1);
+ if (p)
+ free_page((unsigned long)p);
+ cache_cache.size = sizeof(struct kmem_cache);
+
+ bb_cache = kmem_cache_create("bb_cache",sizeof(bigblock_t), 0,
+ GFP_KERNEL, NULL, NULL);
+ for (i=0; i < NR_SLOB_CACHES; i++, size <<= 1)
+ cache_sizes[i] = kmem_cache_create(cache_names[i], size, 0,
+ GFP_KERNEL, NULL, NULL);
+ }
}
-static void slob_timer_cbk(void)
+static void test_slob(slob_t *s)
{
- void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
+ slob_t *p;
+ long x = 0;
- if (p)
- free_page((unsigned long)p);
+ for (p=s->next; p != s && x < 10000; p = p->next, x++)
+ printk(".");
+}
+
+void print_slobs(void)
+{
+ struct list_head *curr;
+
+ list_for_each(curr, &cache_chain) {
+ kmem_cache_t *c = list_entry(curr, struct kmem_cache, list);
- mod_timer(&slob_timer, jiffies + HZ);
+ printk("%s items:%d",
+ c->name?:"<none>",
+ atomic_read(&c->items));
+ test_slob(&c->arena);
+ printk("\n");
+ }
}
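For reference, the size-class selection that the second patch adds to
__kmalloc() (one kmem_cache per power-of-two size from 32 bytes up to
PAGE_SIZE/2, larger requests falling back to __get_free_pages()) boils
down to a mapping like this minimal user-space sketch (class_for_size()
and the TOY_* constants are made-up illustration names):

#include <stdio.h>
#include <stddef.h>

#define TOY_PAGE_SHIFT	12
#define TOY_PAGE_SIZE	(1UL << TOY_PAGE_SHIFT)
#define NR_CLASSES	(TOY_PAGE_SHIFT - 5)	/* 32, 64, ..., PAGE_SIZE/2 */

/*
 * Map an allocation size to the smallest power-of-two size class that
 * fits it; -1 means "too big for the caches, allocate whole pages".
 */
static int class_for_size(size_t size)
{
	size_t csize = 32;
	int i;

	for (i = 0; i < NR_CLASSES; i++, csize <<= 1)
		if (size <= csize)
			return i;
	return -1;
}

int main(void)
{
	printf("%d %d %d\n",
	       class_for_size(20),	/* 0  -> the 32-byte cache  */
	       class_for_size(100),	/* 2  -> the 128-byte cache */
	       class_for_size(3000));	/* -1 -> __get_free_pages() */
	return 0;
}

This is also why the patch's comment talks about keeping caches of
similar units together: each class has its own free list and lock, so
both the walk length and the lock contention shrink.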