[patch 9/11] net: dst_entry.refcount, use, lastuse to use alloc_percpu

Patch to use alloc_percpu for dst_entry.__refcnt, __use and lastuse.  On
NUMA, these heavily written fields move into a per-cpu structure, so the
hot paths only touch cpu-local counters and readers sum over the online
cpus.  This patch reduces the cacheline bouncing of the atomic_t
dst_entry.__refcnt and gets us 55% better tbench throughput on an 8-way
x445 box.
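
To illustrate the idea with a minimal, self-contained sketch (the type
and function names below are made up for this mail and are not the
kernel per-cpu API): the hot paths touch only the local cpu's counter
slot, the rare readers sum all slots, and a dead cpu's slot is folded
into a live one so references held across a hotplug are not lost,
which is what the CPU_DEAD notifiers added by this patch do.

#define NR_CPUS 8			/* stand-in for the kernel constant */

struct pcpu_ref {
	int cnt[NR_CPUS];		/* one counter slot per cpu */
};

static inline void pcpu_ref_get(struct pcpu_ref *r, int cpu)
{
	r->cnt[cpu]++;			/* hot path: local slot, no bouncing */
}

static inline void pcpu_ref_put(struct pcpu_ref *r, int cpu)
{
	r->cnt[cpu]--;			/* a slot may go negative; only the sum matters */
}

static inline int pcpu_ref_read(struct pcpu_ref *r)
{
	int cpu, sum = 0;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += r->cnt[cpu];	/* slow path: sum all slots */
	return sum;
}

static inline void pcpu_ref_xfr_cpu_down(struct pcpu_ref *r, int dead, int live)
{
	r->cnt[live] += r->cnt[dead];	/* fold the dead cpu's count over */
	r->cnt[dead] = 0;
}

The trade-off is that reading the true count becomes O(NR_CPUS), which
is why dst_entry gains an s_cpu hint: the timeout scans in
rt_check_age() and dn_dst_useful() start from a recently active cpu and
can usually bail out after looking at a single slot.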

Signed-off-by: Pravin B. Shelar <[email protected]>
Signed-off-by: Shobhit Dayal <[email protected]>
Signed-off-by: Christoph Lameter <[email protected]>
Signed-off-by: Ravikiran Thirumalai <[email protected]>

Index: alloc_percpu-2.6.13/include/net/dst.h
===================================================================
--- alloc_percpu-2.6.13.orig/include/net/dst.h	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/include/net/dst.h	2005-09-12 16:44:05.000000000 -0700
@@ -35,11 +35,33 @@
 
 struct sk_buff;
 
+#ifdef CONFIG_NUMA
+
+/*	A per cpu instance of this exists for every dst_entry.
+ *	These are the most frequently written fields of dst_entry.
+ */
+struct per_cpu_cnt
+{
+	int 		refcnt;
+	int 		use;
+	unsigned long	lastuse;
+};
+
+#endif
+
 struct dst_entry
 {
 	struct dst_entry        *next;
+#ifdef CONFIG_NUMA
+	/* first cpu that should be checked for time-out */
+	int 			s_cpu;
+	/* per cpu client references   */
+	struct per_cpu_cnt	*pcc;
+#else
 	atomic_t		__refcnt;	/* client references	*/
 	int			__use;
+	unsigned long		lastuse;
+#endif
 	struct dst_entry	*child;
 	struct net_device       *dev;
 	short			error;
@@ -50,7 +72,6 @@
 #define DST_NOPOLICY		4
 #define DST_NOHASH		8
 #define DST_BALANCED            0x10
-	unsigned long		lastuse;
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
@@ -103,25 +124,94 @@
 
 #ifdef __KERNEL__
 
+#ifdef CONFIG_NUMA
+
+static inline int dst_use(struct dst_entry *dst)
+{
+	int total = 0, cpu;
+
+	for_each_online_cpu(cpu)
+		total += per_cpu_ptr(dst->pcc, cpu)->use;
+	return total;
+}
+
+#define dst_use_inc(__dst) do {					\
+		per_cpu_ptr((__dst)->pcc, get_cpu())->use++;	\
+		put_cpu();					\
+	} while (0)
+
+static inline unsigned long dst_lastuse(struct dst_entry *dst)
+{
+	unsigned long max = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		if (max < per_cpu_ptr(dst->pcc, cpu)->lastuse)
+			max = per_cpu_ptr(dst->pcc, cpu)->lastuse;
+	return max;
+}
+
+#define dst_lastuse_set(__dst)  do {					  \
+		per_cpu_ptr((__dst)->pcc, get_cpu())->lastuse = jiffies;  \
+		put_cpu();						  \
+	} while (0)
+
+static inline int dst_refcnt(struct dst_entry *dst)
+{
+	int cpu, sum = 0;
+
+	for_each_online_cpu(cpu)
+		sum += per_cpu_ptr(dst->pcc, cpu)->refcnt;
+
+	return sum;
+}
+
+#define dst_refcnt_one(__dst) do { 					  \
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt = 1; \
+			put_cpu();		  			  \
+		} while (0)
+
+#define dst_refcnt_dec(__dst) do { 					\
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt--;	\
+			put_cpu();					\
+		} while (0)
+#define dst_hold(__dst) do { 						 \
+			per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt++;	 \
+			put_cpu();					 \
+		} while (0)
+
+#else
+
 #define dst_use(__dst) (__dst)->__use
 #define dst_use_inc(__dst) (__dst)->__use++
 
 #define dst_lastuse(__dst) (__dst)->lastuse
 #define dst_lastuse_set(__dst) (__dst)->lastuse = jiffies
 
-#define dst_update_tu(__dst) do { dst_lastuse_set(__dst);dst_use_inc(__dst); } while (0)
-#define dst_update_rtu(__dst) do { dst_lastuse_set(__dst);dst_hold(__dst);dst_use_inc(__dst); } while (0)
-
 #define dst_refcnt(__dst) atomic_read(&(__dst)->__refcnt)
 #define dst_refcnt_one(__dst) atomic_set(&(__dst)->__refcnt, 1)
 #define dst_refcnt_dec(__dst) atomic_dec(&(__dst)->__refcnt)
 #define dst_hold(__dst) atomic_inc(&(__dst)->__refcnt)
 
+#endif
+#define dst_update_tu(__dst) do { 		\
+		dst_lastuse_set(__dst);		\
+		dst_use_inc(__dst); 		\
+	} while (0)
+
+#define dst_update_rtu(__dst) do { 		\
+		dst_lastuse_set(__dst);		\
+		dst_hold(__dst);		\
+		dst_use_inc(__dst); 		\
+	} while (0)
+
 static inline
 void dst_release(struct dst_entry * dst)
 {
 	if (dst) {
+#if !defined(CONFIG_NUMA) || (RT_CACHE_DEBUG >= 2)
 		WARN_ON(dst_refcnt(dst) < 1);
+#endif
 		smp_mb__before_atomic_dec();
 		dst_refcnt_dec(dst);
 	}
@@ -271,6 +361,48 @@
 
 extern void		dst_init(void);
 
+/*	Allocate and initialize the rtu (refcnt/use/lastuse) array of the given dst_entry.
+ */
+static inline int dst_init_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	int cpu;
+	dst->pcc = alloc_percpu(struct per_cpu_cnt, GFP_ATOMIC);
+	if (!dst->pcc)
+		return -ENOMEM;
+
+	for_each_cpu(cpu) {
+		per_cpu_ptr(dst->pcc, cpu)->use = 0;
+		per_cpu_ptr(dst->pcc, cpu)->refcnt = 0;
+		per_cpu_ptr(dst->pcc, cpu)->lastuse = jiffies;
+	}
+	dst->s_cpu = smp_processor_id();
+#else
+	atomic_set(&dst->__refcnt, 0);
+	dst->lastuse = jiffies;
+#endif
+	return 0;
+}
+
+static inline void dst_free_rtu_array(struct dst_entry *dst)
+{
+#ifdef CONFIG_NUMA
+	free_percpu(dst->pcc);
+#endif
+}
+
+#if	defined (CONFIG_HOTPLUG_CPU) && defined (CONFIG_NUMA)
+static inline void dst_ref_xfr_cpu_down(struct dst_entry *__dst, int cpu)
+{
+	int refcnt = per_cpu_ptr((__dst)->pcc, cpu)->refcnt;
+	if (refcnt) {
+		per_cpu_ptr((__dst)->pcc, get_cpu())->refcnt += refcnt;
+		put_cpu();
+		per_cpu_ptr((__dst)->pcc, cpu)->refcnt = 0;
+	}
+}
+#endif
+
 struct flowi;
 #ifndef CONFIG_XFRM
 static inline int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl,
Index: alloc_percpu-2.6.13/net/bridge/br_netfilter.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/bridge/br_netfilter.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/bridge/br_netfilter.c	2005-09-12 12:24:01.000000000 -0700
@@ -85,7 +85,6 @@
 static struct rtable __fake_rtable = {
 	.u = {
 		.dst = {
-			.__refcnt		= ATOMIC_INIT(1),
 			.dev			= &__fake_net_device,
 			.path			= &__fake_rtable.u.dst,
 			.metrics		= {[RTAX_MTU - 1] = 1500},
@@ -1010,6 +1009,10 @@
 {
 	int i;
 
+	if (dst_init_rtu_array(&__fake_rtable.u.dst) < 0)
+		panic("br_netfilter : cannot allocate memory for dst-entry rtu array");
+	dst_refcnt_one(&__fake_rtable.u.dst);
+
 	for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) {
 		int ret;
 
@@ -1046,4 +1049,5 @@
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(brnf_sysctl_header);
 #endif
+	dst_free_rtu_array(&__fake_rtable.u.dst);
 }
Index: alloc_percpu-2.6.13/net/core/dst.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/core/dst.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/core/dst.c	2005-09-12 12:24:01.000000000 -0700
@@ -131,9 +131,9 @@
 	if (!dst)
 		return NULL;
 	memset(dst, 0, ops->entry_size);
-	atomic_set(&dst->__refcnt, 0);
+	if (dst_init_rtu_array(dst) < 0)
+		return NULL;
 	dst->ops = ops;
-	dst->lastuse = jiffies;
 	dst->path = dst;
 	dst->input = dst_discard_in;
 	dst->output = dst_discard_out;
@@ -200,6 +200,7 @@
 #if RT_CACHE_DEBUG >= 2 
 	atomic_dec(&dst_total);
 #endif
+	dst_free_rtu_array(dst);
 	kmem_cache_free(dst->ops->kmem_cachep, dst);
 
 	dst = child;
Index: alloc_percpu-2.6.13/net/decnet/dn_route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/decnet/dn_route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/decnet/dn_route.c	2005-09-12 12:24:01.000000000 -0700
@@ -77,6 +77,7 @@
 #include <linux/netfilter_decnet.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <asm/errno.h>
 #include <net/neighbour.h>
 #include <net/dst.h>
@@ -157,7 +158,29 @@
 
 static inline int dn_dst_useful(struct dn_route *rth, unsigned long now, unsigned long expire)
 {
+#ifdef CONFIG_NUMA
+	{
+		int max, sum = 0, age, cpu;
+		struct dst_entry *dst = &rth->u.dst;
+
+		cpu = dst->s_cpu;
+		max = cpu + NR_CPUS;
+		for(sum = 0; cpu < max; cpu++) {
+			int cpu_ = cpu % NR_CPUS;
+			if (cpu_online(cpu_)) {
+				sum += per_cpu_ptr(dst->pcc, cpu_)->refcnt;
+				age = now - per_cpu_ptr(dst->pcc, cpu_)->lastuse;
+				if (age <= expire) {
+					dst->s_cpu = cpu_;
+					return 1;
+				}
+			}
+		}
+		return (sum != 0);
+	}
+#else
 	return  (atomic_read(&rth->u.dst.__refcnt) || (now - rth->u.dst.lastuse) < expire) ;
+#endif
 }
 
 static void dn_dst_check_expire(unsigned long dummy)
@@ -1766,6 +1789,43 @@
 
 #endif /* CONFIG_PROC_FS */
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit dn_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int cpu = (long) hcpu;
+
+	switch(action) {
+		int i;
+		struct dn_route *rt, *next;
+
+		case CPU_DEAD:
+
+		for(i = 0; i <= dn_rt_hash_mask; i++) {
+			spin_lock_bh(&dn_rt_hash_table[i].lock);
+
+			if ((rt = dn_rt_hash_table[i].chain) == NULL)
+				goto nothing_to_do;
+
+			for(; rt; rt=next) {
+				dst_ref_xfr_cpu_down(&rt->u.dst, cpu);
+				next = rt->u.rt_next;
+			}
+nothing_to_do:
+			spin_unlock_bh(&dn_rt_hash_table[i].lock);
+		}
+
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block dn_rtcache_cpu_notifier =
+			{ &dn_rtcache_cpu_callback, NULL, 0 };
+
+#endif
+
 void __init dn_route_init(void)
 {
 	int i, goal, order;
@@ -1822,10 +1882,16 @@
         dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
 
 	proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops);
+#if	defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 }
 
 void __exit dn_route_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&dn_rtcache_cpu_notifier);
+#endif
 	del_timer(&dn_route_timer);
 	dn_run_flush(0);
 
Index: alloc_percpu-2.6.13/net/ipv4/route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv4/route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv4/route.c	2005-09-12 12:24:01.000000000 -0700
@@ -92,6 +92,7 @@
 #include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
+#include <linux/cpu.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -507,6 +508,54 @@
 		rth->u.dst.expires;
 }
 
+#ifdef CONFIG_NUMA
+
+/*
+ * For NUMA systems, we do not want to sum up all local cpu refcnts every
+ * time. So we consider lastuse element of the dst_entry and start loop
+ * with the cpu where this entry was allocated. If dst_entry is not timed
+ * out then update s_cpu of this dst_entry so that next time we can start from
+ * that cpu.
+ */
+static inline int rt_check_age(struct rtable *rth,
+			unsigned long tmo1, unsigned long tmo2)
+{
+	int max, sum = 0, age, idx;
+	struct dst_entry *dst = &rth->u.dst;
+	unsigned long now = jiffies;
+
+	idx = dst->s_cpu;
+	max = idx + NR_CPUS;
+	for(sum = 0; idx < max; idx++) {
+		int cpu_ = idx % NR_CPUS;
+		if (cpu_online(cpu_)) {
+			sum += per_cpu_ptr(dst->pcc, cpu_)->refcnt;
+			age = now - per_cpu_ptr(dst->pcc, cpu_)->lastuse;
+			if ((age <= tmo1 && !rt_fast_clean(rth)) ||
+					(age <= tmo2 && rt_valuable(rth))) {
+				dst->s_cpu = cpu_;
+				return 0;
+			}
+		}
+	}
+	return (sum == 0);
+}
+
+/*
+ * In this function order of examining three factors (ref_cnt, expires,
+ * lastuse) is changed, considering the cost of analyzing refcnt and lastuse
+ * which are localized for each cpu on NUMA.
+ */
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+	if (rth->u.dst.expires && time_after_eq(jiffies, rth->u.dst.expires))
+		return (dst_refcnt(&rth->u.dst) == 0);
+
+	return rt_check_age(rth, tmo1, tmo2);
+}
+
+#else
+
 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 {
 	unsigned long age;
@@ -529,6 +578,8 @@
 out:	return ret;
 }
 
+#endif
+
 /* Bits of score are:
  * 31: very valuable
  * 30: not quite useless
@@ -1108,8 +1159,19 @@
 
 void ip_rt_copy(struct rtable *to, struct rtable *from)
 {
+#ifdef CONFIG_NUMA
+	struct per_cpu_cnt *tmp_pcc;
+	tmp_pcc = to->u.dst.pcc;
+
+	*to = *from;
+	to->u.dst.pcc = tmp_pcc;
+	per_cpu_ptr(to->u.dst.pcc, get_cpu())->use = 1;
+	to->u.dst.s_cpu = smp_processor_id();
+	put_cpu();
+#else
 	*to = *from;
 	to->u.dst.__use 	= 1;
+#endif
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -3108,6 +3170,33 @@
 }
 __setup("rhash_entries=", set_rhash_entries);
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+static int __devinit rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+				   void *hcpu)
+{
+	int cpu = (long) hcpu;
+
+	switch(action) {
+		int i;
+		struct rtable *rth;
+		case CPU_DEAD:
+			for(i = rt_hash_mask; i >= 0; i--) {
+				spin_lock_irq(rt_hash_lock_addr(i));
+				rth = rt_hash_table[i].chain;
+				while(rth) {
+					dst_ref_xfr_cpu_down(&rth->u.dst, cpu);
+					rth = rth->u.rt_next;
+				}
+				spin_unlock_irq(rt_hash_lock_addr(i));
+			}
+			break;
+	}
+	return NOTIFY_OK;
+}
+static struct notifier_block rtcache_cpu_notifier = { &rtcache_cpu_callback, NULL, 0 };
+#endif
+
 int __init ip_rt_init(void)
 {
 	int rc = 0;
@@ -3197,6 +3286,9 @@
 	xfrm_init();
 	xfrm4_init();
 #endif
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&rtcache_cpu_notifier);
+#endif
 	return rc;
 }
 
Index: alloc_percpu-2.6.13/net/ipv6/ip6_fib.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv6/ip6_fib.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv6/ip6_fib.c	2005-09-12 12:24:01.000000000 -0700
@@ -1209,6 +1209,35 @@
 	spin_unlock_bh(&fib6_gc_lock);
 }
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+#include <linux/cpu.h>
+static inline int rt6_ref_xfr_cpu_down(struct rt6_info *rt, void *arg)
+{
+	dst_ref_xfr_cpu_down(&rt->u.dst, (long)arg);
+	return 0;
+}
+
+static int __devinit ipv6_rtcache_cpu_callback(struct notifier_block *nfb,
+                                   unsigned long action,
+                                   void *hcpu)
+{
+	int cpu = (long) hcpu;
+
+	switch(action) {
+		case CPU_DEAD:
+			write_lock_bh(&rt6_lock);
+			fib6_clean_tree(&ip6_routing_table, rt6_ref_xfr_cpu_down,
+					0, (void *)(long)cpu);
+			write_unlock_bh(&rt6_lock);
+			break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipv6_rtcache_cpu_notifier =
+				{ &ipv6_rtcache_cpu_callback, NULL, 0 };
+#endif
+
 void __init fib6_init(void)
 {
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
@@ -1217,10 +1246,16 @@
 					   NULL, NULL);
 	if (!fib6_node_kmem)
 		panic("cannot create fib6_nodes cache");
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	register_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 }
 
 void fib6_gc_cleanup(void)
 {
+#if defined(CONFIG_NUMA) && defined(CONFIG_HOTPLUG_CPU)
+	unregister_cpu_notifier(&ipv6_rtcache_cpu_notifier);
+#endif
 	del_timer(&ip6_fib_timer);
 	kmem_cache_destroy(fib6_node_kmem);
 }
Index: alloc_percpu-2.6.13/net/ipv6/route.c
===================================================================
--- alloc_percpu-2.6.13.orig/net/ipv6/route.c	2005-09-12 12:23:37.000000000 -0700
+++ alloc_percpu-2.6.13/net/ipv6/route.c	2005-09-12 12:24:01.000000000 -0700
@@ -110,8 +110,6 @@
 struct rt6_info ip6_null_entry = {
 	.u = {
 		.dst = {
-			.__refcnt	= ATOMIC_INIT(1),
-			.__use		= 1,
 			.dev		= &loopback_dev,
 			.obsolete	= -1,
 			.error		= -ENETUNREACH,
@@ -2104,6 +2102,10 @@
 						     NULL, NULL);
 	if (!ip6_dst_ops.kmem_cachep)
 		panic("cannot create ip6_dst_cache");
+	if (dst_init_rtu_array(&ip6_null_entry.u.dst) < 0)
+		panic("ip6_route: cannot allocate memory for dst-entry rtu array");
+	dst_use_inc(&ip6_null_entry.u.dst);
+	dst_refcnt_one(&ip6_null_entry.u.dst);
 
 	fib6_init();
 #ifdef 	CONFIG_PROC_FS
@@ -2130,4 +2132,5 @@
 	rt6_ifdown(NULL);
 	fib6_gc_cleanup();
 	kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+	dst_free_rtu_array(&ip6_null_entry.u.dst);
 }