Re: [PATCH] scheduler: fix x86 regression in native_sched_clock

On Friday 07 December 2007 12:19, Stefano Brivio wrote:
> This patch fixes a regression introduced by:
>
> commit bb29ab26863c022743143f27956cc0ca362f258c
> Author: Ingo Molnar <[email protected]>
> Date:   Mon Jul 9 18:51:59 2007 +0200
>
> This caused the jiffies counter to leap back and forth on cpufreq changes
> on my x86 box. I'd say that we can't always assume that TSC does "small
> errors" only, when marked unstable. On cpufreq changes these errors can be
> huge.
>
> The original bug report can be found here:
> http://bugzilla.kernel.org/show_bug.cgi?id=9475
>
>
> Signed-off-by: Stefano Brivio <[email protected]>
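
The arithmetic behind those "huge" errors deserves spelling out: the
old sched_clock() returns cycles_2_ns() of the *absolute* TSC value,
and cpufreq rewrites the cyc2ns scale on every transition, so one
rescale reconverts the clock's entire accumulated history at once. A
back-of-the-envelope sketch with made-up frequencies (plain user-space
C; cycles_to_ns() here is a stand-in for the kernel's shift-based
cycles_2_ns(), not the real thing):

#include <stdint.h>
#include <stdio.h>

static uint64_t cycles_to_ns(uint64_t cyc, uint64_t cpu_khz)
{
	/* divide first so large cycle counts can't overflow 64 bits */
	return cyc / cpu_khz * 1000000ull;
}

int main(void)
{
	uint64_t tsc = 7200000000000ull;	/* 1 h of cycles at 2 GHz */

	printf("before rescale: %llu ns\n",
	       (unsigned long long)cycles_to_ns(tsc, 2000000));
	printf("after rescale:  %llu ns\n",
	       (unsigned long long)cycles_to_ns(tsc, 1000000));
	return 0;
}

One hour of uptime reads as two hours after a 2 GHz -> 1 GHz
transition: the clock leaps in proportion to uptime, not to the length
of a tick. That is what motivates switching the interface to deltas
below.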

While your fix should probably go into 2.6.24...

This particular issue has aggravated me enough times. Let's
fix the damn thing properly already... I think what would work best
is a relatively simple change to the API along these lines (a
standalone sketch of the resulting calling convention follows the
patch):
Index: linux-2.6/arch/x86/kernel/tsc_32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc_32.c
+++ linux-2.6/arch/x86/kernel/tsc_32.c
@@ -92,27 +92,35 @@ static inline void set_cyc2ns_scale(unsi
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
-unsigned long long native_sched_clock(void)
+u64 native_sched_clock(u64 *sample)
 {
-	unsigned long long this_offset;
+	u64 now, delta;
 
 	/*
-	 * Fall back to jiffies if there's no TSC available:
+	 * Fall back to the default sched_clock() implementation (keep in
+	 * synch with kernel/sched.c) if there's no TSC available:
 	 * ( But note that we still use it if the TSC is marked
 	 *   unstable. We do this because unlike Time Of Day,
 	 *   the scheduler clock tolerates small errors and it's
 	 *   very important for it to be as fast as the platform
 *   can achieve it. )
 	 */
-	if (unlikely(!tsc_enabled && !tsc_unstable))
-		/* No locking but a rare wrong value is not a big deal: */
-		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+	if (unlikely(!tsc_enabled && !tsc_unstable)) {
+		now = (u64)jiffies;
+		delta = now - *sample;
+		*sample = now;
+
+		return delta * (NSEC_PER_SEC / HZ);
+
+	} else {
+		/* read the Time Stamp Counter: */
+		rdtscll(now);
+		delta = now - *sample;
+		*sample = now;
 
-	/* read the Time Stamp Counter: */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return cycles_2_ns(this_offset);
+		/* return the delta value in ns */
+		return cycles_2_ns(delta);
+	}
 }
 
 /* We need to define a real function for sched_clock, to override the
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -72,9 +72,13 @@
  * This is default implementation.
  * Architectures and sub-architectures can override this.
  */
-unsigned long long __attribute__((weak)) sched_clock(void)
+u64 __attribute__((weak)) sched_clock(u64 *sample)
 {
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+	u64 now = (u64)jiffies;
+	u64 delta = now - *sample;
+	*sample = now;
+
+	return delta * (NSEC_PER_SEC / HZ);
 }
 
 /*
@@ -314,7 +318,7 @@ struct rq {
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
 
-	u64 clock, prev_clock_raw;
+	u64 clock, prev_clock_sample;
 	s64 clock_max_delta;
 
 	unsigned int clock_warps, clock_overflows;
@@ -385,9 +389,7 @@ static inline int cpu_of(struct rq *rq)
  */
 static void __update_rq_clock(struct rq *rq)
 {
-	u64 prev_raw = rq->prev_clock_raw;
-	u64 now = sched_clock();
-	s64 delta = now - prev_raw;
+	u64 delta = sched_clock(&rq->prev_clock_sample);
 	u64 clock = rq->clock;
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -416,7 +418,6 @@ static void __update_rq_clock(struct rq 
 		}
 	}
 
-	rq->prev_clock_raw = now;
 	rq->clock = clock;
 }
 
@@ -656,7 +657,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep
 void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 	struct rq *rq = cpu_rq(smp_processor_id());
-	u64 now = sched_clock();
 
 	rq->idle_clock += delta_ns;
 	/*
@@ -666,7 +666,7 @@ void sched_clock_idle_wakeup_event(u64 d
 	 * rq clock:
 	 */
 	spin_lock(&rq->lock);
-	rq->prev_clock_raw = now;
+	(void)sched_clock(&rq->prev_clock_sample);
 	rq->clock += delta_ns;
 	spin_unlock(&rq->lock);
 }
@@ -4967,7 +4967,7 @@ void __cpuinit init_idle(struct task_str
 	unsigned long flags;
 
 	__sched_fork(idle);
-	idle->se.exec_start = sched_clock();
+	idle->se.exec_start = 0;
 
 	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
Index: linux-2.6/arch/x86/kernel/tsc_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/tsc_64.c
+++ linux-2.6/arch/x86/kernel/tsc_64.c
@@ -30,18 +30,21 @@ static unsigned long long cycles_2_ns(un
 	return (cyc * cyc2ns_scale) >> NS_SCALE;
 }
 
-unsigned long long sched_clock(void)
+u64 sched_clock(u64 *sample)
 {
-	unsigned long a = 0;
+	u64 now, delta;
 
 	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
 	 * which means it is not completely exact and may not be monotonic
 	 * between CPUs. But the errors should be too small to matter for
 	 * scheduling purposes.
 	 */
+	rdtscll(now);
+	delta = now - *sample;
+	*sample = now;
 
-	rdtscll(a);
-	return cycles_2_ns(a);
+	/* return the delta value in ns */
+	return cycles_2_ns(delta);
 }
 
 static int tsc_unstable;
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1433,7 +1433,7 @@ static inline int set_cpus_allowed(struc
 }
 #endif
 
-extern unsigned long long sched_clock(void);
+extern u64 sched_clock(u64 *sample);
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
Index: linux-2.6/include/asm-x86/timer.h
===================================================================
--- linux-2.6.orig/include/asm-x86/timer.h
+++ linux-2.6/include/asm-x86/timer.h
@@ -5,7 +5,7 @@
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-unsigned long long native_sched_clock(void);
+u64 native_sched_clock(u64 *sample);
 unsigned long native_calculate_cpu_khz(void);
 
 extern int timer_ack;
Index: linux-2.6/kernel/lockdep.c
===================================================================
--- linux-2.6.orig/kernel/lockdep.c
+++ linux-2.6/kernel/lockdep.c
@@ -216,7 +216,7 @@ static void lock_release_holdtime(struct
 	if (!lock_stat)
 		return;
 
-	holdtime = sched_clock() - hlock->holdtime_stamp;
+	holdtime = sched_clock(&hlock->holdtime_stamp);
 
 	stats = get_lock_stats(hlock->class);
 	if (hlock->read)
@@ -2413,7 +2413,7 @@ static int __lock_acquire(struct lockdep
 	hlock->hardirqs_off = hardirqs_off;
 #ifdef CONFIG_LOCK_STAT
 	hlock->waittime_stamp = 0;
-	hlock->holdtime_stamp = sched_clock();
+	(void)sched_clock(&hlock->holdtime_stamp);
 #endif
 
 	if (check == 2 && !mark_irqflags(curr, hlock))
@@ -2780,7 +2780,7 @@ __lock_contended(struct lockdep_map *loc
 	return;
 
 found_it:
-	hlock->waittime_stamp = sched_clock();
+	(void)sched_clock(&hlock->waittime_stamp);
 
 	point = lock_contention_point(hlock->class, ip);
 
@@ -2825,9 +2825,8 @@ __lock_acquired(struct lockdep_map *lock
 found_it:
 	cpu = smp_processor_id();
 	if (hlock->waittime_stamp) {
-		now = sched_clock();
-		waittime = now - hlock->waittime_stamp;
-		hlock->holdtime_stamp = now;
+		waittime = sched_clock(&hlock->waittime_stamp);
+		hlock->holdtime_stamp = hlock->waittime_stamp;
 	}
 
 	stats = get_lock_stats(hlock->class);
Index: linux-2.6/kernel/printk.c
===================================================================
--- linux-2.6.orig/kernel/printk.c
+++ linux-2.6/kernel/printk.c
@@ -575,7 +575,7 @@ __setup("time", printk_time_setup);
 
 __attribute__((weak)) unsigned long long printk_clock(void)
 {
-	return sched_clock();
+	return (unsigned long long)get_cycles();
 }
 
 /* Check if we have any console registered that can be called early in boot. */
Index: linux-2.6/kernel/sched_debug.c
===================================================================
--- linux-2.6.orig/kernel/sched_debug.c
+++ linux-2.6/kernel/sched_debug.c
@@ -176,7 +176,7 @@ static void print_cpu(struct seq_file *m
 	P(curr->pid);
 	PN(clock);
 	PN(idle_clock);
-	PN(prev_clock_raw);
+	PN(prev_clock_sample);
 	P(clock_warps);
 	P(clock_overflows);
 	P(clock_deep_idle_events);
@@ -353,12 +353,12 @@ void proc_sched_show_task(struct task_st
 #undef __P
 
 	{
-		u64 t0, t1;
+		u64 sample = 0, delta;
 
-		t0 = sched_clock();
-		t1 = sched_clock();
+		(void)sched_clock(&sample);
+		delta = sched_clock(&sample);
 		SEQ_printf(m, "%-35s:%21Ld\n",
-			   "clock-delta", (long long)(t1-t0));
+			   "clock-delta", (long long)delta);
 	}
 }
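
To make the intended calling convention concrete outside the kernel,
here is a minimal user-space sketch of the same pattern (illustrative
only: CLOCK_MONOTONIC stands in for rdtscll()/cycles_2_ns(), and
sample_clock() is a stand-in name, not the kernel function):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t read_raw(void)			/* stand-in for rdtscll() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

static uint64_t sample_clock(uint64_t *sample)	/* cf. sched_clock() */
{
	uint64_t now = read_raw();
	uint64_t delta = now - *sample;

	*sample = now;
	return delta;				/* ns since last sample */
}

int main(void)
{
	uint64_t sample = 0, clock = 0;
	int i;

	(void)sample_clock(&sample);	/* prime: first delta is garbage */

	for (i = 0; i < 3; i++) {
		clock += sample_clock(&sample);	/* cf. __update_rq_clock() */
		printf("clock = %llu ns\n", (unsigned long long)clock);
	}
	return 0;
}

The priming call is the same idiom the patch uses in
sched_clock_idle_wakeup_event() and __lock_acquire():
(void)sched_clock(&sample) resynchronizes the stored sample without
trusting the first delta. The payoff of the delta contract is that a
TSC glitch or rescale can only distort the one interval in which it
happens, instead of reconverting the clock's entire history.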
 
