On Wed, 16 May 2007, Peter Zijlstra wrote:
> If this 4k cpu system ever gets to touch the new lock it is in way
> deeper problems than a bouncing cache-line.
So its no use on NUMA?
> Please look at it more carefully.
>
> We differentiate pages allocated at the level where GFP_ATOMIC starts to
> fail. By not updating the percpu slabs those are retried every time,
> except for ALLOC_NO_WATERMARKS allocations; those are served from the
> ->reserve_slab.
>
> Once a regular slab allocation succeeds again, the ->reserve_slab is
> cleaned up and never again looked at it until we're in distress again.
A single slab? This may only give you a a single object in an extreme
case. Are you sure that this solution is generic enough?
The problem here is that you may spinlock and take out the slab for one
cpu but then (AFAICT) other cpus can still not get their high priority
allocs satisfied. Some comments follow.
> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> include/linux/slub_def.h | 2 +
> mm/slub.c | 85 ++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 78 insertions(+), 9 deletions(-)
>
> Index: linux-2.6-git/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6-git.orig/include/linux/slub_def.h
> +++ linux-2.6-git/include/linux/slub_def.h
> @@ -46,6 +46,8 @@ struct kmem_cache {
> struct list_head list; /* List of slab caches */
> struct kobject kobj; /* For sysfs */
>
> + struct page *reserve_slab;
> +
> #ifdef CONFIG_NUMA
> int defrag_ratio;
> struct kmem_cache_node *node[MAX_NUMNODES];
> Index: linux-2.6-git/mm/slub.c
> ===================================================================
> --- linux-2.6-git.orig/mm/slub.c
> +++ linux-2.6-git/mm/slub.c
> @@ -20,11 +20,13 @@
> #include <linux/mempolicy.h>
> #include <linux/ctype.h>
> #include <linux/kallsyms.h>
> +#include "internal.h"
>
> /*
> * Lock order:
> - * 1. slab_lock(page)
> - * 2. slab->list_lock
> + * 1. reserve_lock
> + * 2. slab_lock(page)
> + * 3. node->list_lock
> *
> * The slab_lock protects operations on the object of a particular
> * slab and its metadata in the page struct. If the slab lock
> @@ -259,6 +261,8 @@ static int sysfs_slab_alias(struct kmem_
> static void sysfs_slab_remove(struct kmem_cache *s) {}
> #endif
>
> +static DEFINE_SPINLOCK(reserve_lock);
> +
> /********************************************************************
> * Core slab cache functions
> *******************************************************************/
> @@ -1007,7 +1011,7 @@ static void setup_object(struct kmem_cac
> s->ctor(object, s, 0);
> }
>
> -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
> +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *rank)
> {
> struct page *page;
> struct kmem_cache_node *n;
> @@ -1025,6 +1029,7 @@ static struct page *new_slab(struct kmem
> if (!page)
> goto out;
>
> + *rank = page->rank;
> n = get_node(s, page_to_nid(page));
> if (n)
> atomic_long_inc(&n->nr_slabs);
> @@ -1311,7 +1316,7 @@ static void unfreeze_slab(struct kmem_ca
> /*
> * Remove the cpu slab
> */
> -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
> +static void __deactivate_slab(struct kmem_cache *s, struct page *page)
> {
> /*
> * Merge cpu freelist into freelist. Typically we get here
> @@ -1330,10 +1335,15 @@ static void deactivate_slab(struct kmem_
> page->freelist = object;
> page->inuse--;
> }
> - s->cpu_slab[cpu] = NULL;
> unfreeze_slab(s, page);
> }
So you want to spill back the lockless_freelist without deactivating the
slab? Why are you using the lockless_freelist at all? If you do not use it
then you can call unfreeze_slab. No need for this split.
> @@ -1395,6 +1405,7 @@ static void *__slab_alloc(struct kmem_ca
> {
> void **object;
> int cpu = smp_processor_id();
> + int rank = 0;
>
> if (!page)
> goto new_slab;
> @@ -1424,10 +1435,26 @@ new_slab:
> if (page) {
> s->cpu_slab[cpu] = page;
> goto load_freelist;
> - }
> + } else if (unlikely(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
> + goto try_reserve;
Ok so we are trying to allocate a slab and do not get one thus ->
try_reserve. But this is only working if we are using the slab after
explicitly flushing the cpuslabs. Otherwise the slab may be full and we
get to alloc_slab.
>
> - page = new_slab(s, gfpflags, node);
> - if (page) {
> +alloc_slab:
> + page = new_slab(s, gfpflags, node, &rank);
> + if (page && rank) {
Huh? You mean !page?
> + if (unlikely(s->reserve_slab)) {
> + struct page *reserve;
> +
> + spin_lock(&reserve_lock);
> + reserve = s->reserve_slab;
> + s->reserve_slab = NULL;
> + spin_unlock(&reserve_lock);
> +
> + if (reserve) {
> + slab_lock(reserve);
> + __deactivate_slab(s, reserve);
> + putback_slab(s, reserve);
Remove the above two lines (they are wrong regardless) and simply make
this the cpu slab.
> + }
> + }
> cpu = smp_processor_id();
> if (s->cpu_slab[cpu]) {
> /*
> @@ -1455,6 +1482,18 @@ new_slab:
> SetSlabFrozen(page);
> s->cpu_slab[cpu] = page;
> goto load_freelist;
> + } else if (page) {
> + spin_lock(&reserve_lock);
> + if (s->reserve_slab) {
> + discard_slab(s, page);
> + page = s->reserve_slab;
> + }
> + slab_lock(page);
> + SetPageActive(page);
> + s->reserve_slab = page;
> + spin_unlock(&reserve_lock);
> +
> + goto got_reserve;
> }
> return NULL;
> debug:
> @@ -1470,6 +1509,31 @@ debug:
> page->freelist = object[page->offset];
> slab_unlock(page);
> return object;
> +
> +try_reserve:
> + spin_lock(&reserve_lock);
> + page = s->reserve_slab;
> + if (!page) {
> + spin_unlock(&reserve_lock);
> + goto alloc_slab;
> + }
> +
> + slab_lock(page);
> + if (!page->freelist) {
> + s->reserve_slab = NULL;
> + spin_unlock(&reserve_lock);
> + __deactivate_slab(s, page);
replace with unfreeze slab.
> + putback_slab(s, page);
Putting back the slab twice.
> + goto alloc_slab;
> + }
> + spin_unlock(&reserve_lock);
> +
> +got_reserve:
> + object = page->freelist;
> + page->inuse++;
> + page->freelist = object[page->offset];
> + slab_unlock(page);
> + return object;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]