On Thu, Jun 30, 2005 at 06:17:11PM +0530, Srivatsa Vaddagiri wrote:
> Digging further revealed that this max time was restricted by
> various timers kernel uses. Mostly it was found to be because of
> the slab allocator reap timer (it requests a timer every ~2 sec on
> every CPU) and machine_check timer (MCE_RATE in arch/i386/kernel/cpu/mcheck/
> non-fatal.c ).
I modified the slab allocator to do a slightly better job of handling its
reap timer (instead of blindly increasing the reap timeout to 20 sec as I
did in my last run). The patch below is merely a hack, meant to see what
kind of benefit we could hope to get by reworking the reap timer. A good
solution would probably increase/decrease the reap period based on memory
pressure and the idleness of the system, along the lines of the sketch
below.
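Purely for illustration, such a policy could look something like this
(vm_under_pressure() and cpu_mostly_idle() are made-up helpers, not
existing kernel interfaces; the two constants are the ones introduced by
the patch below):

/*
 * Illustrative sketch only: choose the next reap interval from
 * memory pressure and CPU idleness.  The two predicates are
 * hypothetical; the real signals would have to come from the VM
 * and the scheduler.
 */
static unsigned long next_reap_timeout(unsigned long cur)
{
	if (vm_under_pressure() || !cpu_mostly_idle())
		return REAPTIMEOUT_CPUC;	/* busy: reap frequently */
	if (cur * 2 < MAX_REAP_TIMEOUT)
		return cur * 2;			/* quiet: back off */
	return cur;				/* hold at the cap */
}

Anyway, the patch I actually tried out is: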
---
linux-2.6.13-rc1-root/mm/slab.c | 40 ++++++++++++++++++++++++++++++++--------
1 files changed, 32 insertions(+), 8 deletions(-)
diff -puN mm/slab.c~vst-slab mm/slab.c
--- linux-2.6.13-rc1/mm/slab.c~vst-slab 2005-07-05 16:36:03.000000000 +0530
+++ linux-2.6.13-rc1-root/mm/slab.c 2005-07-06 18:01:28.000000000 +0530
@@ -371,6 +371,8 @@ struct kmem_cache_s {
*/
#define REAPTIMEOUT_CPUC (2*HZ)
#define REAPTIMEOUT_LIST3 (4*HZ)
+#define MAX_REAP_TIMEOUT (30*HZ)
+#define MAX_DRAIN_COUNT 2
#if STATS
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
@@ -569,6 +571,7 @@ static enum {
} g_cpucache_up;
static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(unsigned long, last_timeout);
static void free_block(kmem_cache_t* cachep, void** objpp, int len);
static void enable_cpucache (kmem_cache_t *cachep);
@@ -883,8 +886,10 @@ static int __init cpucache_init(void)
* pages to gfp.
*/
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (cpu_online(cpu))
+ if (cpu_online(cpu)) {
+ per_cpu(last_timeout, cpu) = REAPTIMEOUT_CPUC + cpu;
start_cpu_timer(cpu);
+ }
}
return 0;
@@ -1543,7 +1548,7 @@ static void smp_call_function_all_cpus(v
preempt_enable();
}
-static void drain_array_locked(kmem_cache_t* cachep,
+static int drain_array_locked(kmem_cache_t* cachep,
struct array_cache *ac, int force);
static void do_drain(void *arg)
@@ -2753,10 +2758,10 @@ static void enable_cpucache(kmem_cache_t
cachep->name, -err);
}
-static void drain_array_locked(kmem_cache_t *cachep,
+static int drain_array_locked(kmem_cache_t *cachep,
struct array_cache *ac, int force)
{
- int tofree;
+ int tofree = 1;
check_spinlock_acquired(cachep);
if (ac->touched && !force) {
@@ -2771,6 +2776,8 @@ static void drain_array_locked(kmem_cach
memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
sizeof(void*)*ac->avail);
}
+
+ return tofree;
}
/**
@@ -2787,17 +2794,20 @@ static void drain_array_locked(kmem_cach
static void cache_reap(void *unused)
{
struct list_head *walk;
+ int drain_count = 0, freed_slab_count = 0;
+ unsigned long timeout = __get_cpu_var(last_timeout);
+ int cpu = smp_processor_id();
if (down_trylock(&cache_chain_sem)) {
/* Give up. Setup the next iteration. */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+ schedule_delayed_work(&__get_cpu_var(reap_work), timeout);
return;
}
list_for_each(walk, &cache_chain) {
kmem_cache_t *searchp;
struct list_head* p;
- int tofree;
+ int tofree, count;
struct slab *slabp;
searchp = list_entry(walk, kmem_cache_t, next);
@@ -2809,7 +2819,9 @@ static void cache_reap(void *unused)
spin_lock_irq(&searchp->spinlock);
- drain_array_locked(searchp, ac_data(searchp), 0);
+ count = drain_array_locked(searchp, ac_data(searchp), 0);
+ if (count > drain_count)
+ drain_count = count;
if(time_after(searchp->lists.next_reap, jiffies))
goto next_unlock;
@@ -2825,6 +2837,9 @@ static void cache_reap(void *unused)
}
tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
+ if (tofree > freed_slab_count)
+ freed_slab_count = tofree;
+
do {
p = list3_data(searchp)->slabs_free.next;
if (p == &(list3_data(searchp)->slabs_free))
@@ -2854,7 +2869,16 @@ next:
up(&cache_chain_sem);
drain_remote_pages();
/* Setup the next iteration */
- schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+#ifdef CONFIG_NO_IDLE_HZ
+ if (drain_count < MAX_DRAIN_COUNT && !freed_slab_count) {
+ if (timeout * 2 < MAX_REAP_TIMEOUT)
+ timeout *= 2;
+ } else
+#endif
+ timeout = REAPTIMEOUT_CPUC + cpu;
+
+ __get_cpu_var(last_timeout) = timeout;
+ schedule_delayed_work(&__get_cpu_var(reap_work), timeout);
}
#ifdef CONFIG_PROC_FS
_
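To spell out the backoff: each CPU starts at REAPTIMEOUT_CPUC + cpu, i.e.
roughly 2 sec. Whenever a reap pass drains at most one object per cache
(drain_count < MAX_DRAIN_COUNT) and never reaches the slab-freeing stage
(freed_slab_count == 0), the interval doubles: ~2 -> 4 -> 8 -> 16 sec. The
next doubling (~32 sec) would exceed MAX_REAP_TIMEOUT (30 sec), so the
interval plateaus at ~16 sec, and any pass that does free something resets
it to ~2 sec. The ~15 sec maxima in the numbers below are of the same
order as this cap (and, presumably, as the 15*HZ machine_check timer
mentioned above).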
The results with this patch on an 8-way Intel box are (all times in ms):
CPU#   Samples       Mean    Std Dev        Max       Min
  1:        31   4124.742   4331.914  14317.000     0.000
  2:        35   3603.600   3792.050  12556.000    14.000
  3:      2585     49.458      4.207     50.000     2.000
  4:       151    847.682    329.343   1139.000    15.000
  5:        23   5432.652   3461.856  12024.000   120.000
  6:        19   6229.158   5641.813  15000.000   169.000
  7:        67   1865.672   1528.343   5000.000    19.000
Note that the best (i.e. longest) average sleep is around ~6 sec, with the
max being 15 sec. Given this, do you still advocate that we restrict idle
CPUs to waking up every 100 ms to check for imbalance? IMO, we should let
them sleep much longer (a few seconds). What are the consequences for load
balancing if idle CPUs sleep that long? Does it mean that the system can
remain unresponsive for a few seconds under some circumstances (e.g., when
there is a burst of activity while the idle CPUs are sleeping)?
--
Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017