time_update_mt_guess() is the core of the TSC->MT approximation magic.
Called periodically from the LAPIC timer interrupt handler, it fine-tunes
all the per-CPU offsets and ratios needed by guess_mt() to approximate the
MT using any processor's TSC.
We also need to update these from the cpufreq notifiers. Because a frequency
change makes the approximation unreliable (we don't know _exactly_ when it
happens), the approximation is disabled for a while after a frequency change
and is not re-enabled until it has stabilised again.
Signed-off-by: Jiri Bohac <[email protected]>
Index: linux-2.6.20-rc5/arch/x86_64/kernel/apic.c
===================================================================
--- linux-2.6.20-rc5.orig/arch/x86_64/kernel/apic.c
+++ linux-2.6.20-rc5/arch/x86_64/kernel/apic.c
@@ -63,6 +63,9 @@ int using_apic_timer __read_mostly = 0;
static void apic_pm_activate(void);
+extern void time_update_mt_guess(void);
+
+
void enable_NMI_through_LVT0 (void * dummy)
{
unsigned int v;
@@ -986,6 +989,8 @@ void smp_local_timer_interrupt(void)
* Currently this isn't too much of an issue (performance wise),
* we can take more than 100K local irqs per second on a 100 MHz P5.
*/
+
+ time_update_mt_guess();
}
/*
Index: linux-2.6.20-rc5/arch/x86_64/kernel/time.c
===================================================================
--- linux-2.6.20-rc5.orig/arch/x86_64/kernel/time.c
+++ linux-2.6.20-rc5/arch/x86_64/kernel/time.c
@@ -221,6 +221,126 @@ static u32 read_master_timer_pm(void)
}
/*
+ * Called from the LAPIC timer interrupt: measures the TSC->MT slope on the
+ * current CPU since the previous call, checks __guess_mt() against the real
+ * master timer and updates the per-CPU values needed by guess_mt().
+ */
+void time_update_mt_guess(void)
+{
+ u64 t, delta_t, delta_mt, mt;
+ s64 guess_mt_err, guess_mt_err_nsec, tsc_per_tick, tsc_slope_corr,
+ current_slope, old_mt_err;
+ int cpu = smp_processor_id(), resync;
+ unsigned long flags;
+
+ if (vxtime.mode == VXTIME_TSC && cpu != 0) /* in VXTIME_TSC mode only CPU 0 is updated */
+ return;
+
+ local_irq_save(flags);
+
+ /* if a frequency change is in progress, don't recalculate anything
+ as this would destroy the fine-tuned slope. We don't rely on the TSC
+ during this time, so we don't care about the accuracy at all */
+ if (vxtime.cpu[cpu].tsc_invalid == VXTIME_TSC_CPUFREQ) {
+ local_irq_restore(flags);
+ return;
+ }
+
+ mt = get_master_timer64(); /* master timer sample ... */
+ t = get_cycles_sync(); /* ... and the TSC sample paired with it */
+
+ write_seqlock(&xtime_lock);
+
+ /* get the error of the estimated MT value */
+ delta_t = t - vxtime.cpu[cpu].tsc_last; /* TSC advance since tsc_last was recorded */
+ delta_mt = mt - vxtime.cpu[cpu].mt_last; /* MT advance since mt_last was recorded */
+ tsc_per_tick = ((mt_per_tick << 32) / delta_mt * delta_t) >> 32; /* TSC cycles per tick at the measured rate; NOTE(review): divides by delta_mt, nothing visible here rules out 0 */
+
+ vxtime.cpu[cpu].mt_base = __guess_mt(t, cpu); /* presumably the MT estimate for TSC value t; it becomes the new base */
+
+ guess_mt_err = mt - vxtime.cpu[cpu].mt_base; /* > 0: guess is behind the MT, < 0: guess is ahead */
+ guess_mt_err_nsec = (guess_mt_err * (s64)vxtime.mt_q) >> 32; /* same error scaled by vxtime.mt_q (32 fractional bits) */
+ old_mt_err = ((s64)(vxtime.cpu[cpu].tsc_slope_avg - vxtime.cpu[cpu].tsc_slope)
+ * tsc_per_tick) >> TSC_SLOPE_SCALE; /* gap between the averaged slope and the slope in use, over one tick */
+ current_slope = (delta_mt << TSC_SLOPE_SCALE) / delta_t; /* measured MT-per-TSC slope, scaled by 2^TSC_SLOPE_SCALE; NOTE(review): divides by delta_t, confirm it cannot be 0 */
+
+ /* calculate a long time average to attenuate oscillation */
+ vxtime.cpu[cpu].tsc_slope_avg = ((TSC_SLOPE_DECAY - 1) * vxtime.cpu[cpu].tsc_slope_avg +
+ current_slope) / TSC_SLOPE_DECAY;
+
+ tsc_slope_corr = ((s64)(guess_mt_err << TSC_SLOPE_SCALE)) / tsc_per_tick; /* slope adjustment that cancels guess_mt_err over tsc_per_tick cycles; NOTE(review): divides by tsc_per_tick, confirm it cannot be 0 */
+ vxtime.cpu[cpu].tsc_slope = vxtime.cpu[cpu].tsc_slope_avg + tsc_slope_corr;
+
+ if ((s64)vxtime.cpu[cpu].tsc_slope < 0) { /* a negative slope is clamped to 0 and the average restarted */
+ vxtime.cpu[cpu].tsc_slope = 0;
+ vxtime.cpu[cpu].tsc_slope_avg = current_slope;
+ }
+
+ if (abs(guess_mt_err) > (mt_per_tick >> 2)) /* NOTE(review): abs() gets s64 operands in this function; confirm it does not truncate to int */
+ printk(KERN_DEBUG "Master Timer guess on cpu %d off by %lld.%.6ld seconds\n",
+ cpu, guess_mt_err_nsec / NSEC_PER_SEC,
+ (abs(guess_mt_err_nsec) % NSEC_PER_SEC) / 1000);
+
+ resync = 0;
+ /* if the guess is off by more than a second, something has gone very
+ wrong; we'll break monotonicity and re-sync the guess with the MT */
+ if (abs(guess_mt_err_nsec) > NSEC_PER_SEC) {
+ resync = 1;
+ if (vxtime.mode != VXTIME_MT && guess_mt_err < 0)
+ printk(KERN_ERR "time not monotonic on cpu %d\n", cpu);
+ }
+ /* else if the guess is off by more than a jiffy, only synchronize the
+ guess with the MT if the guess is behind (won't break monotonicity);
+ if the guess is ahead, stop the timer by setting slope to zero */
+ else if (abs(guess_mt_err) > mt_per_tick) {
+ if (guess_mt_err > 0)
+ resync = 1;
+ else {
+ vxtime.cpu[cpu].tsc_slope = 0;
+ vxtime.cpu[cpu].tsc_slope_avg = current_slope;
+ }
+ }
+ /* good enough to switch back from temporary MT mode? */
+ else if (vxtime.cpu[cpu].tsc_invalid &&
+ abs(guess_mt_err) < mt_per_tick / USEC_PER_TICK &&
+ abs(old_mt_err) < mt_per_tick / USEC_PER_TICK &&
+ mt > vxtime.cpu[cpu].last_mt_guess) { /* the MT must have passed the last guess recorded before the frequency change */
+ vxtime.cpu[cpu].tsc_invalid = 0; /* the TSC-based guess is valid again */
+ vxtime.cpu[cpu].mt_base = mt;
+ vxtime.cpu[cpu].tsc_slope = vxtime.cpu[cpu].tsc_slope_avg;
+ }
+
+ /* hard re-sync of the guess to the current value of the MT */
+ if (resync) {
+ vxtime.cpu[cpu].mt_base = mt;
+ vxtime.cpu[cpu].tsc_slope = vxtime.cpu[cpu].tsc_slope_avg = current_slope;
+
+ printk(KERN_INFO "Master Timer re-syncing on cpu %d (mt=%lld, slope=%lld)\n",
+ cpu, mt, vxtime.cpu[cpu].tsc_slope);
+ }
+
+ if (vxtime.cpu[cpu].tsc_slope == 0)
+ printk(KERN_INFO "timer on cpu %d frozen, waiting for time to catch up\n", cpu);
+
+ vxtime.cpu[cpu].tsc_last = t; /* this sample is the reference for the next call's deltas */
+ vxtime.cpu[cpu].mt_last = mt;
+
+ write_sequnlock(&xtime_lock);
+ local_irq_restore(flags);
+}
+
+inline u64 mt_to_nsec(u64 mt) /* returns the sum of per-chunk (chunk * vxtime.mt_q) >> 32 terms, i.e. mt * mt_q / 2^32 up to per-chunk truncation */
+{
+ u64 ret;
+ ret = ((mt & 0xffffff) * vxtime.mt_q) >> 32; /* bits 0-23 of mt; chunking presumably keeps each product within 64 bits - TODO confirm the range of mt_q */
+ mt >>= 24;
+ ret += ((mt & 0xffffff) * vxtime.mt_q) >> 8; /* bits 24-47: weight 2^24, hence >> (32 - 24) */
+ mt >>= 24;
+ ret += ( mt * vxtime.mt_q) << 16; /* bits 48-63: weight 2^48, hence << (48 - 32) */
+ return ret;
+}
+
+/*
* do_gettimeoffset() returns microseconds since last timer interrupt was
* triggered by hardware. A memory read of HPET is slower than a register read
* of TSC, but much more reliable. It's also synchronized to the timer
@@ -666,50 +786,83 @@ static void cpufreq_delayed_get(void)
}
static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
static unsigned long cpu_khz_ref = 0;
+struct cpufreq_notifier_data { /* arguments of time_cpufreq_notifier(), bundled for time_cpufreq_notifier_on_cpu() */
+ struct cpufreq_freqs *freq; /* the notifier's data pointer; ->cpu, ->old and ->new are read from it */
+ unsigned long val; /* the notifier event, one of the CPUFREQ_* values */
+};
+
+/* called on the CPU that changed frequency; data is a struct cpufreq_notifier_data * */
+static void time_cpufreq_notifier_on_cpu(void *data)
+{
+ unsigned long flags;
+ int cpu;
+ struct cpufreq_notifier_data *cnd = data;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ cpu = smp_processor_id();
+ switch (cnd->val) { /* any other event value changes nothing */
+
+ case CPUFREQ_PRECHANGE:
+ case CPUFREQ_SUSPENDCHANGE:
+ if (!vxtime.cpu[cpu].tsc_invalid) /* only record a guess while the TSC guess is still valid */
+ vxtime.cpu[cpu].last_mt_guess = __guess_mt(get_cycles_sync(), cpu);
+ vxtime.cpu[cpu].tsc_invalid = VXTIME_TSC_CPUFREQ; /* makes time_update_mt_guess() return early until the change completes */
+ break;
+
+ case CPUFREQ_POSTCHANGE:
+ case CPUFREQ_RESUMECHANGE:
+ vxtime.cpu[cpu].tsc_slope = ((vxtime.cpu[cpu].tsc_slope >> 4) * cnd->freq->old / cnd->freq->new) << 4; /* rescale by old/new frequency; the >> 4 ... << 4 presumably avoids overflow in the multiply - TODO confirm */
+ vxtime.cpu[cpu].tsc_slope_avg = ((vxtime.cpu[cpu].tsc_slope_avg >> 4) * cnd->freq->old / cnd->freq->new) << 4;
+
+ vxtime.cpu[cpu].mt_base = vxtime.cpu[cpu].mt_last = get_master_timer64(); /* restart the MT/TSC reference pair from now */
+ vxtime.cpu[cpu].tsc_last = get_cycles_sync();
+
+ vxtime.cpu[cpu].tsc_invalid = VXTIME_TSC_INVALID; /* updates resume; time_update_mt_guess() clears the flag once the error is small enough */
+ break;
+ }
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct cpufreq_freqs *freq = data;
- unsigned long *lpj, dummy;
+ struct cpufreq_notifier_data cnd = { /* bundle both notifier arguments for the per-CPU callback */
+ .freq = data,
+ .val = val,
+ };
- if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+ if (cpu_has(&cpu_data[cnd.freq->cpu], X86_FEATURE_CONSTANT_TSC)) /* nothing to do for a CPU flagged X86_FEATURE_CONSTANT_TSC */
return 0;
- lpj = &dummy;
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-#ifdef CONFIG_SMP
- lpj = &cpu_data[freq->cpu].loops_per_jiffy;
-#else
- lpj = &boot_cpu_data.loops_per_jiffy;
-#endif
-
if (!ref_freq) {
- ref_freq = freq->old;
- loops_per_jiffy_ref = *lpj;
+ ref_freq = cnd.freq->old; /* first notification: remember the reference frequency */
cpu_khz_ref = cpu_khz;
}
- if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+
+ if ((val == CPUFREQ_PRECHANGE && cnd.freq->old < cnd.freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && cnd.freq->old > cnd.freq->new) ||
(val == CPUFREQ_RESUMECHANGE)) {
- *lpj =
- cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
- cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+ cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, cnd.freq->new); /* loops_per_jiffy and vxtime.tsc_quot are no longer rescaled here */
+
}
-
- set_cyc2ns_scale(cpu_khz_ref);
+
+ preempt_disable(); /* presumably keeps smp_processor_id() stable across the check and the call below */
+ if (smp_processor_id() == cnd.freq->cpu)
+ time_cpufreq_notifier_on_cpu(&cnd); /* already on the affected CPU: run the update directly */
+ else smp_call_function_single(cnd.freq->cpu, time_cpufreq_notifier_on_cpu, &cnd, 0, 1); /* NOTE(review): cnd lives on this stack frame, so the call must wait for completion - confirm the trailing 1 means "wait" */
+ preempt_enable();
return 0;
}
-
+
static struct notifier_block time_cpufreq_notifier_block = {
- .notifier_call = time_cpufreq_notifier
+ .notifier_call = time_cpufreq_notifier
};
static int __init cpufreq_tsc(void)
--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]