Re: [rfc 04/45] cpu alloc: Use in SLUB

* [email protected] ([email protected]) wrote:
> Using cpu alloc removes the needs for the per cpu arrays in the kmem_cache struct.
> These could get quite big if we have to support system of up to thousands of cpus.
> The use of alloc_percpu means that:
> 
> 1. The size of kmem_cache for SMP configuration shrinks since we will only
>    need 1 pointer instead of NR_CPUS. The same pointer can be used by all
>    processors. Reduces cache footprint of the allocator.
> 
> 2. We can dynamically size kmem_cache according to the actual nodes in the
>    system meaning less memory overhead for configurations that may potentially
>    support up to 1k NUMA nodes.
> 
> 3. We can remove the diddle widdle with allocating and releasing kmem_cache_cpu
>    structures when bringing up and shuttting down cpus. The allocpercpu
>    logic will do it all for us. Removes some portions of the cpu hotplug
>    functionality.
> 
> 4. Fastpath performance increases by another 20% vs. the earlier improvements.
>    Instead of having fastpath with 45-50 cycles it is now possible to get
>    below 40.
> 
> Remove the CONFIG_FAST_CMPXCHG version since this patch makes SLUB use CPU ops
> instead.
> 
> Signed-off-by: Christoph Lameter <[email protected]>
> ---
>  arch/x86/Kconfig         |    4 
>  include/linux/slub_def.h |    6 -
>  mm/slub.c                |  229 ++++++++++-------------------------------------
>  3 files changed, 52 insertions(+), 187 deletions(-)
> 
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2007-11-19 15:45:08.270140279 -0800
> +++ linux-2.6/include/linux/slub_def.h	2007-11-19 15:53:25.869890760 -0800
> @@ -34,6 +34,7 @@ struct kmem_cache_node {
>   * Slab cache management.
>   */
>  struct kmem_cache {
> +	struct kmem_cache_cpu *cpu_slab;
>  	/* Used for retriving partial slabs etc */
>  	unsigned long flags;
>  	int size;		/* The size of an object including meta data */
> @@ -63,11 +64,6 @@ struct kmem_cache {
>  	int defrag_ratio;
>  	struct kmem_cache_node *node[MAX_NUMNODES];
>  #endif
> -#ifdef CONFIG_SMP
> -	struct kmem_cache_cpu *cpu_slab[NR_CPUS];
> -#else
> -	struct kmem_cache_cpu cpu_slab;
> -#endif
>  };
>  
>  /*
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c	2007-11-19 15:45:08.278140252 -0800
> +++ linux-2.6/mm/slub.c	2007-11-19 15:54:10.513640214 -0800
> @@ -239,15 +239,6 @@ static inline struct kmem_cache_node *ge
>  #endif
>  }
>  
> -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
> -{
> -#ifdef CONFIG_SMP
> -	return s->cpu_slab[cpu];
> -#else
> -	return &s->cpu_slab;
> -#endif
> -}
> -
>  /*
>   * The end pointer in a slab is special. It points to the first object in the
>   * slab but has bit 0 set to mark it.
> @@ -1472,7 +1463,7 @@ static inline void flush_slab(struct kme
>   */
>  static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
>  {
> -	struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> +	struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>  
>  	if (likely(c && c->page))
>  		flush_slab(s, c);
> @@ -1487,15 +1478,7 @@ static void flush_cpu_slab(void *d)
>  
>  static void flush_all(struct kmem_cache *s)
>  {
> -#ifdef CONFIG_SMP
>  	on_each_cpu(flush_cpu_slab, s, 1, 1);
> -#else
> -	unsigned long flags;
> -
> -	local_irq_save(flags);
> -	flush_cpu_slab(s);
> -	local_irq_restore(flags);
> -#endif
>  }
>  

Normally :

You can't use on_each_cpu if interrupts are already disabled. Therefore,
the implementation using "local_irq_disable/enable" in smp.h for the UP
case is semantically correct and there is no need to use a save/restore.
So just using on_each_cpu should be enough here.

I also wonder about flush_cpu_slab execution on _other_ cpus. I am not
convinced interrupts are disabled when it executes.. have I missing
something ?


>  /*
> @@ -1511,6 +1494,15 @@ static inline int node_match(struct kmem
>  	return 1;
>  }
>  
> +static inline int cpu_node_match(struct kmem_cache_cpu *c, int node)
> +{
> +#ifdef CONFIG_NUMA
> +	if (node != -1 && __CPU_READ(c->node) != node)
> +		return 0;
> +#endif
> +	return 1;
> +}
> +
>  /* Allocate a new slab and make it the current cpu slab */
>  static noinline unsigned long get_new_slab(struct kmem_cache *s,
>  	struct kmem_cache_cpu **pc, gfp_t gfpflags, int node)
> @@ -1529,7 +1521,7 @@ static noinline unsigned long get_new_sl
>  	if (!page)
>  		return 0;
>  
> -	*pc = c = get_cpu_slab(s, smp_processor_id());
> +	*pc = c = THIS_CPU(s->cpu_slab);

I think the preferred coding style is :

c = THIS_CPU(s->cpu_slab);
*pc = c;

>  	if (c->page)
>  		flush_slab(s, c);
>  	c->page = page;
> @@ -1554,16 +1546,18 @@ static noinline unsigned long get_new_sl
>   * we need to allocate a new slab. This is slowest path since we may sleep.
>   */
>  static void *__slab_alloc(struct kmem_cache *s,
> -		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
> +		gfp_t gfpflags, int node, void *addr)
>  {
>  	void **object;
>  	unsigned long state;
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +	struct kmem_cache_cpu *c;
> +#ifdef CONFIG_FAST_CPU_OPS
>  	unsigned long flags;
>  
>  	local_irq_save(flags);
>  	preempt_enable_no_resched();
>  #endif
> +	c = THIS_CPU(s->cpu_slab);
>  	if (likely(c->page)) {
>  		state = slab_lock(c->page);
>  
> @@ -1597,7 +1591,7 @@ load_freelist:
>  unlock_out:
>  	slab_unlock(c->page, state);
>  out:
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
>  	preempt_disable();
>  	local_irq_restore(flags);
>  #endif
> @@ -1640,26 +1634,24 @@ static void __always_inline *slab_alloc(
>  	void **object;
>  	struct kmem_cache_cpu *c;
>  
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> -	c = get_cpu_slab(s, get_cpu());
> +#ifdef CONFIG_FAST_CPU_OPS

I wonder.. are there some architectures that would provide fast
cmpxchg_local but not fast cpu ops ?

> +	c = s->cpu_slab;
>  	do {
> -		object = c->freelist;
> -		if (unlikely(is_end(object) || !node_match(c, node))) {
> -			object = __slab_alloc(s, gfpflags, node, addr, c);
> -			if (unlikely(!object)) {
> -				put_cpu();
> +		object = __CPU_READ(c->freelist);
> +		if (unlikely(is_end(object) ||
> +					!cpu_node_match(c, node))) {
> +			object = __slab_alloc(s, gfpflags, node, addr);
> +			if (unlikely(!object))
>  				goto out;
> -			}
>  			break;
>  		}
> -	} while (cmpxchg_local(&c->freelist, object, object[c->offset])
> -								!= object);
> -	put_cpu();
> +	} while (CPU_CMPXCHG(c->freelist, object,
> +			object[__CPU_READ(c->offset)]) != object);

Hrm. What happens here if we call __slab_alloc, get a valid object, then
have a CPU_CMPXCHG that fails, restart the loop.. is this case taken
care of or do we end up having an unreferenced object ? Maybe there is
some logic in freelist that takes care of it ?

Also, we have to be aware that we can now change CPU between the

__CPU_READ and the CPU_CMPXCHG. (also : should it be a __CPU_CMPXCHG ?)

But since "object" contains information specific to the local CPU, the
cmpxchg should fail if we are migrated and everything should be ok.

Hrm, actually, the 

c = s->cpu_slab;

should probably be after the object = __CPU_READ(c->freelist); ?

The cpu_read acts as a safeguard checking that we do not change CPU
between the read and the cmpxchg. If we are preempted between the "c"
read and the cpu_read, we could do a !cpu_node_match(c, node) check that
would apply to the wrong cpu.


>  #else
>  	unsigned long flags;
>  
>  	local_irq_save(flags);
> -	c = get_cpu_slab(s, smp_processor_id());
> +	c = THIS_CPU(s->cpu_slab);
>  	if (unlikely((is_end(c->freelist)) || !node_match(c, node))) {
>  
>  		object = __slab_alloc(s, gfpflags, node, addr, c);
> @@ -1709,7 +1701,7 @@ static void __slab_free(struct kmem_cach
>  	void **object = (void *)x;
>  	unsigned long state;
>  
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
>  	unsigned long flags;
>  
>  	local_irq_save(flags);
> @@ -1739,7 +1731,7 @@ checks_ok:
>  
>  out_unlock:
>  	slab_unlock(page, state);
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
>  	local_irq_restore(flags);
>  #endif
>  	return;
> @@ -1752,7 +1744,7 @@ slab_empty:
>  		remove_partial(s, page);
>  
>  	slab_unlock(page, state);
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
>  	local_irq_restore(flags);
>  #endif
>  	discard_slab(s, page);
> @@ -1781,13 +1773,13 @@ static void __always_inline slab_free(st
>  	void **object = (void *)x;
>  	struct kmem_cache_cpu *c;
>  
> -#ifdef CONFIG_FAST_CMPXCHG_LOCAL
> +#ifdef CONFIG_FAST_CPU_OPS
>  	void **freelist;
>  
> -	c = get_cpu_slab(s, get_cpu());
> +	c = s->cpu_slab;
>  	debug_check_no_locks_freed(object, s->objsize);
>  	do {
> -		freelist = c->freelist;
> +		freelist = __CPU_READ(c->freelist);

Same here, c = s->cpu_slab; should probably be read after.

>  		barrier();
>  		/*
>  		 * If the compiler would reorder the retrieval of c->page to
> @@ -1800,19 +1792,19 @@ static void __always_inline slab_free(st
>  		 * then any change of cpu_slab will cause the cmpxchg to fail
>  		 * since the freelist pointers are unique per slab.
>  		 */
> -		if (unlikely(page != c->page || c->node < 0)) {
> -			__slab_free(s, page, x, addr, c->offset);
> +		if (unlikely(page != __CPU_READ(c->page) ||
> +					__CPU_READ(c->node) < 0)) {
> +			__slab_free(s, page, x, addr, __CPU_READ(c->offset));

And same question as above : what happens if we fail after executing the
__slab_free.. is it valid to do it twice ?

>  			break;
>  		}
> -		object[c->offset] = freelist;
> -	} while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
> -	put_cpu();
> +		object[__CPU_READ(c->offset)] = freelist;
> +	} while (CPU_CMPXCHG(c->freelist, freelist, object) != freelist);
>  #else
>  	unsigned long flags;
>  
>  	local_irq_save(flags);
>  	debug_check_no_locks_freed(object, s->objsize);
> -	c = get_cpu_slab(s, smp_processor_id());
> +	c = THIS_CPU(s->cpu_slab);
>  	if (likely(page == c->page && c->node >= 0)) {
>  		object[c->offset] = c->freelist;
>  		c->freelist = object;
> @@ -2015,130 +2007,19 @@ static void init_kmem_cache_node(struct 
>  #endif
>  }
>  
> -#ifdef CONFIG_SMP
> -/*
> - * Per cpu array for per cpu structures.
> - *
> - * The per cpu array places all kmem_cache_cpu structures from one processor
> - * close together meaning that it becomes possible that multiple per cpu
> - * structures are contained in one cacheline. This may be particularly
> - * beneficial for the kmalloc caches.
> - *
> - * A desktop system typically has around 60-80 slabs. With 100 here we are
> - * likely able to get per cpu structures for all caches from the array defined
> - * here. We must be able to cover all kmalloc caches during bootstrap.
> - *
> - * If the per cpu array is exhausted then fall back to kmalloc
> - * of individual cachelines. No sharing is possible then.
> - */
> -#define NR_KMEM_CACHE_CPU 100
> -
> -static DEFINE_PER_CPU(struct kmem_cache_cpu,
> -				kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
> -
> -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
> -static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
> -
> -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
> -							int cpu, gfp_t flags)
> -{
> -	struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
> -
> -	if (c)
> -		per_cpu(kmem_cache_cpu_free, cpu) =
> -				(void *)c->freelist;
> -	else {
> -		/* Table overflow: So allocate ourselves */
> -		c = kmalloc_node(
> -			ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
> -			flags, cpu_to_node(cpu));
> -		if (!c)
> -			return NULL;
> -	}
> -
> -	init_kmem_cache_cpu(s, c);
> -	return c;
> -}
> -
> -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
> -{
> -	if (c < per_cpu(kmem_cache_cpu, cpu) ||
> -			c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
> -		kfree(c);
> -		return;
> -	}
> -	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
> -	per_cpu(kmem_cache_cpu_free, cpu) = c;
> -}
> -
> -static void free_kmem_cache_cpus(struct kmem_cache *s)
> -{
> -	int cpu;
> -
> -	for_each_online_cpu(cpu) {
> -		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> -
> -		if (c) {
> -			s->cpu_slab[cpu] = NULL;
> -			free_kmem_cache_cpu(c, cpu);
> -		}
> -	}
> -}
> -
>  static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
>  {
>  	int cpu;
>  
> -	for_each_online_cpu(cpu) {
> -		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> +	s->cpu_slab = CPU_ALLOC(struct kmem_cache_cpu, flags);
>  
> -		if (c)
> -			continue;
> -
> -		c = alloc_kmem_cache_cpu(s, cpu, flags);
> -		if (!c) {
> -			free_kmem_cache_cpus(s);
> -			return 0;
> -		}
> -		s->cpu_slab[cpu] = c;
> -	}
> -	return 1;
> -}
> -
> -/*
> - * Initialize the per cpu array.
> - */
> -static void init_alloc_cpu_cpu(int cpu)
> -{
> -	int i;
> -
> -	if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
> -		return;
> -
> -	for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
> -		free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
> -
> -	cpu_set(cpu, kmem_cach_cpu_free_init_once);
> -}
> -
> -static void __init init_alloc_cpu(void)
> -{
> -	int cpu;
> +	if (!s->cpu_slab)
> +		return 0;
>  
>  	for_each_online_cpu(cpu)
> -		init_alloc_cpu_cpu(cpu);
> -  }
> -
> -#else
> -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
> -static inline void init_alloc_cpu(void) {}
> -
> -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> -{
> -	init_kmem_cache_cpu(s, &s->cpu_slab);
> +		init_kmem_cache_cpu(s, CPU_PTR(s->cpu_slab, cpu));
>  	return 1;
>  }
> -#endif
>  
>  #ifdef CONFIG_NUMA
>  /*
> @@ -2452,9 +2333,8 @@ static inline int kmem_cache_close(struc
>  	int node;
>  
>  	flush_all(s);
> -
> +	CPU_FREE(s->cpu_slab);
>  	/* Attempt to free all objects */
> -	free_kmem_cache_cpus(s);
>  	for_each_node_state(node, N_NORMAL_MEMORY) {
>  		struct kmem_cache_node *n = get_node(s, node);
>  
> @@ -2958,8 +2838,6 @@ void __init kmem_cache_init(void)
>  	int i;
>  	int caches = 0;
>  
> -	init_alloc_cpu();
> -
>  #ifdef CONFIG_NUMA
>  	/*
>  	 * Must first have the slab cache available for the allocations of the
> @@ -3019,11 +2897,12 @@ void __init kmem_cache_init(void)
>  	for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
>  		kmalloc_caches[i]. name =
>  			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
> -
>  #ifdef CONFIG_SMP
>  	register_cpu_notifier(&slab_notifier);
> -	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
> -				nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
> +#endif
> +#ifdef CONFIG_NUMA
> +	kmem_size = offsetof(struct kmem_cache, node) +
> +				nr_node_ids * sizeof(struct kmem_cache_node *);
>  #else
>  	kmem_size = sizeof(struct kmem_cache);
>  #endif
> @@ -3120,7 +2999,7 @@ struct kmem_cache *kmem_cache_create(con
>  		 * per cpu structures
>  		 */
>  		for_each_online_cpu(cpu)
> -			get_cpu_slab(s, cpu)->objsize = s->objsize;
> +			CPU_PTR(s->cpu_slab, cpu)->objsize = s->objsize;
>  		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
>  		up_write(&slub_lock);
>  		if (sysfs_slab_alias(s, name))
> @@ -3165,11 +3044,9 @@ static int __cpuinit slab_cpuup_callback
>  	switch (action) {
>  	case CPU_UP_PREPARE:
>  	case CPU_UP_PREPARE_FROZEN:
> -		init_alloc_cpu_cpu(cpu);
>  		down_read(&slub_lock);
>  		list_for_each_entry(s, &slab_caches, list)
> -			s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
> -							GFP_KERNEL);
> +			init_kmem_cache_cpu(s, __CPU_PTR(s->cpu_slab, cpu));
>  		up_read(&slub_lock);
>  		break;
>  
> @@ -3179,13 +3056,9 @@ static int __cpuinit slab_cpuup_callback
>  	case CPU_DEAD_FROZEN:
>  		down_read(&slub_lock);
>  		list_for_each_entry(s, &slab_caches, list) {
> -			struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> -
>  			local_irq_save(flags);
>  			__flush_cpu_slab(s, cpu);
>  			local_irq_restore(flags);
> -			free_kmem_cache_cpu(c, cpu);
> -			s->cpu_slab[cpu] = NULL;
>  		}
>  		up_read(&slub_lock);
>  		break;
> @@ -3657,7 +3530,7 @@ static unsigned long slab_objects(struct
>  	for_each_possible_cpu(cpu) {
>  		struct page *page;
>  		int node;
> -		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> +		struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>  
>  		if (!c)
>  			continue;
> @@ -3724,7 +3597,7 @@ static int any_slab_objects(struct kmem_
>  	int cpu;
>  
>  	for_each_possible_cpu(cpu) {
> -		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
> +		struct kmem_cache_cpu *c = CPU_PTR(s->cpu_slab, cpu);
>  
>  		if (c && c->page)
>  			return 1;
> Index: linux-2.6/arch/x86/Kconfig
> ===================================================================
> --- linux-2.6.orig/arch/x86/Kconfig	2007-11-19 15:53:55.529390403 -0800
> +++ linux-2.6/arch/x86/Kconfig	2007-11-19 15:54:10.509139813 -0800
> @@ -112,10 +112,6 @@ config GENERIC_TIME_VSYSCALL
>  	bool
>  	default X86_64
>  
> -config FAST_CMPXCHG_LOCAL
> -	bool
> -	default y
> -
>  config ZONE_DMA32
>  	bool
>  	default X86_64
> 
> -- 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [rfc 04/45] cpu alloc: Use in SLUB
  - From: Christoph Lameter <[email protected]>
References:
- [rfc 00/45] [RFC] CPU ops and a rework of per cpu data handling on x86_64
  - From: [email protected]
- [rfc 04/45] cpu alloc: Use in SLUB
  - From: [email protected]
Prev by Date: NULL dereference in clockevents_program_event
Next by Date: Re: [rfc 01/45] ACPI: Avoid references to impossible processors.
Previous by thread: [rfc 04/45] cpu alloc: Use in SLUB
Next by thread: Re: [rfc 04/45] cpu alloc: Use in SLUB
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]