Re: [RFC] (How to) Let idle CPUs sleep

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, May 09, 2005 at 04:27:26PM +1000, Nick Piggin wrote:
> I could probably find some time to do my implementation if you have
> a complete working patch for eg. UML.

Well, it turns out that if we restrict the amount of time idle CPUs are
allowed to sleep, then very little change is required in the scheduler.
Most of the calculation of exponential sleep times can be done outside
it (in the idle CPU's code).

First, the scheduler support to zero the cpu_load[] counters before an idle
CPU sleeps.

---

 linux-2.6.12-rc3-mm3-vatsa/include/linux/sched.h |    1 
 linux-2.6.12-rc3-mm3-vatsa/kernel/sched.c        |   33 +++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff -puN kernel/sched.c~sched-nohz kernel/sched.c
--- linux-2.6.12-rc3-mm3/kernel/sched.c~sched-nohz	2005-05-11 17:05:13.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/kernel/sched.c	2005-05-11 17:06:38.000000000 +0530
@@ -2323,6 +2323,39 @@ static void rebalance_tick(int this_cpu,
 		}
 	}
 }
+
+#ifdef CONFIG_NO_IDLE_HZ
+/*
+ * Try hard to pull tasks. Called by idle task before it sleeps cutting off
+ * local timer ticks.  This clears the various load counters and tries to pull
+ * tasks.
+ *
+ * Returns 1 if tasks were pulled over, 0 otherwise.
+ */
+int idle_balance_retry(void)
+{
+	int j, moved = 0, this_cpu = smp_processor_id();
+	runqueue_t *this_rq = this_rq();
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	for (j = 0; j < 3; j++)
+		this_rq->cpu_load[j] = 0;
+
+	rebalance_tick(this_cpu, this_rq, SCHED_IDLE);
+
+	if (this_rq->nr_running) {
+		moved = 1;
+		set_tsk_need_resched(current);
+	}
+
+	local_irq_restore(flags);
+
+	return moved;
+}
+#endif
+
 #else
 /*
  * on UP we do not need to balance between CPUs:
diff -puN include/linux/sched.h~sched-nohz include/linux/sched.h
--- linux-2.6.12-rc3-mm3/include/linux/sched.h~sched-nohz	2005-05-11 17:05:13.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/include/linux/sched.h	2005-05-11 17:13:19.000000000 +0530
@@ -897,6 +897,7 @@ extern int task_curr(const task_t *p);
 extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
 extern task_t *idle_task(int cpu);
+extern int idle_balance_retry(void);
 
 void yield(void);
 

_


A sample patch that implements exponential sleep time is below. Note that this
patch only makes an idle CPU pretend to be asleep (instead of really cutting
off timer ticks). I used this merely to test the scheduler change.

Martin,
	You probably need something like this for S390 arch!



---

 linux-2.6.12-rc3-mm3-vatsa/arch/i386/Kconfig          |    4 +
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/apic.c    |   16 ++++--
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/irq.c     |    4 +
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/process.c |   47 ++++++++++++++++--
 linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/smp.c     |    6 ++
 5 files changed, 69 insertions(+), 8 deletions(-)

diff -puN arch/i386/Kconfig~vst-sim arch/i386/Kconfig
--- linux-2.6.12-rc3-mm3/arch/i386/Kconfig~vst-sim	2005-05-10 15:53:33.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/Kconfig	2005-05-10 15:54:22.000000000 +0530
@@ -443,6 +443,10 @@ config X86_OOSTORE
 	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
 	default y
 
+config NO_IDLE_HZ
+        bool "Tickless Idle CPUs support"
+        default n
+
 config HPET_TIMER
 	bool "HPET Timer Support"
 	help
diff -puN arch/i386/kernel/process.c~vst-sim arch/i386/kernel/process.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/process.c~vst-sim	2005-05-10 15:53:34.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/process.c	2005-05-12 14:06:16.000000000 +0530
@@ -94,6 +94,12 @@ void enable_hlt(void)
 
 EXPORT_SYMBOL(enable_hlt);
 
+DEFINE_PER_CPU(int, idle_asleep);
+DEFINE_PER_CPU(unsigned long, sleep_duration);
+
+#define MAX_SLEEP_DURATION 	128	/* in tick counts */
+#define MIN_SLEEP_DURATION	8	/* in tick counts */
+
 /*
  * We use this if we don't have any better
  * idle routine..
@@ -102,8 +108,36 @@ void default_idle(void)
 {
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
 		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
+		if (!need_resched()) {
+			unsigned long jif_next, jif_delta;
+
+			jif_next = next_timer_interrupt();
+			jif_delta = jif_next - jiffies;
+
+			if (jif_delta > MIN_SLEEP_DURATION) {
+				unsigned long slpint;
+
+				if (idle_balance_retry()) {
+					local_irq_enable();
+					return;
+				}
+
+				slpint = min(__get_cpu_var(sleep_duration),
+					     jif_delta);
+
+				jif_next = jiffies + slpint;
+				/* Hack to discard local timer ticks */
+				__get_cpu_var(idle_asleep) = 1;
+				cpu_set(smp_processor_id(), nohz_cpu_mask);
+				local_irq_enable();
+				while ((jiffies < jif_next-1) &&
+					 __get_cpu_var(idle_asleep))
+					cpu_relax();
+				__get_cpu_var(idle_asleep) = 0;
+				cpu_clear(smp_processor_id(), nohz_cpu_mask);
+			} else
+				safe_halt();
+		}
 		else
 			local_irq_enable();
 	} else {
@@ -178,6 +212,8 @@ void cpu_idle(void)
 {
 	int cpu = _smp_processor_id();
 
+	__get_cpu_var(sleep_duration) = MIN_SLEEP_DURATION;
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -189,7 +225,7 @@ void cpu_idle(void)
 			rmb();
 			idle = pm_idle;
 
-			if (!idle)
+			//if (!idle)
 				idle = default_idle;
 
 			if (cpu_is_offline(cpu))
@@ -197,7 +233,12 @@ void cpu_idle(void)
 
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
 			idle();
+
+			if (__get_cpu_var(sleep_duration) < MAX_SLEEP_DURATION)
+				__get_cpu_var(sleep_duration) *= 2;
+
 		}
+		__get_cpu_var(sleep_duration) = MIN_SLEEP_DURATION;
 		schedule();
 	}
 }
diff -puN arch/i386/kernel/irq.c~vst-sim arch/i386/kernel/irq.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/irq.c~vst-sim	2005-05-10 15:53:34.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/irq.c	2005-05-10 15:53:47.000000000 +0530
@@ -46,6 +46,8 @@ static union irq_ctx *hardirq_ctx[NR_CPU
 static union irq_ctx *softirq_ctx[NR_CPUS];
 #endif
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 /*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
@@ -60,6 +62,8 @@ fastcall unsigned int do_IRQ(struct pt_r
 	u32 *isp;
 #endif
 
+	__get_cpu_var(idle_asleep) = 0;
+
 	irq_enter();
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
diff -puN arch/i386/kernel/smp.c~vst-sim arch/i386/kernel/smp.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/smp.c~vst-sim	2005-05-11 16:59:38.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/smp.c	2005-05-11 16:59:58.000000000 +0530
@@ -309,6 +309,8 @@ static inline void leave_mm (unsigned lo
  * 2) Leave the mm if we are in the lazy tlb mode.
  */
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
 {
 	unsigned long cpu;
@@ -336,6 +338,7 @@ fastcall void smp_invalidate_interrupt(s
 			leave_mm(cpu);
 	}
 	ack_APIC_irq();
+	__get_cpu_var(idle_asleep) = 0;
 	smp_mb__before_clear_bit();
 	cpu_clear(cpu, flush_cpumask);
 	smp_mb__after_clear_bit();
@@ -598,6 +601,8 @@ void smp_send_stop(void)
 fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
+
+	__get_cpu_var(idle_asleep) = 0;
 }
 
 fastcall void smp_call_function_interrupt(struct pt_regs *regs)
@@ -607,6 +612,7 @@ fastcall void smp_call_function_interrup
 	int wait = call_data->wait;
 
 	ack_APIC_irq();
+	__get_cpu_var(idle_asleep) = 0;
 	/*
 	 * Notify initiating CPU that I've grabbed the data and am
 	 * about to execute the function
diff -puN arch/i386/kernel/apic.c~vst-sim arch/i386/kernel/apic.c
--- linux-2.6.12-rc3-mm3/arch/i386/kernel/apic.c~vst-sim	2005-05-10 15:53:36.000000000 +0530
+++ linux-2.6.12-rc3-mm3-vatsa/arch/i386/kernel/apic.c	2005-05-10 15:53:47.000000000 +0530
@@ -1171,6 +1171,8 @@ inline void smp_local_timer_interrupt(st
 	 */
 }
 
+DECLARE_PER_CPU(int, idle_asleep);
+
 /*
  * Local APIC timer interrupt. This is the most natural way for doing
  * local interrupts, but local timer interrupts can be emulated by
@@ -1185,15 +1187,19 @@ fastcall void smp_apic_timer_interrupt(s
 	int cpu = smp_processor_id();
 
 	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-	per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
-	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
 	 */
 	ack_APIC_irq();
+
+	if (__get_cpu_var(idle_asleep))
+		return;
+
+	/*
+	 * the NMI deadlock-detector uses this.
+	 */
+	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+
 	/*
 	 * update_process_times() expects us to have done irq_enter().
 	 * Besides, if we don't timer interrupts ignore the global

_
-- 


Thanks and Regards,
Srivatsa Vaddagiri,
Linux Technology Center,
IBM Software Labs,
Bangalore, INDIA - 560017
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux