[PATCH 1/1] Make the TSC safe to be used by gettimeofday().

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is done by a per-cpu vxtime structure that stores the last TSC and HPET
values.

Whenever we switch to a userland process after a HLT instruction has been
executed or after the CPU frequency has changed, we force a new read of the
TSC, HPET and xtime so that we know the correct frequency we have to deal
with.

We also force a resynch once every second, on every CPU.

With this, we can safely use RDTSC in gettimeofday() in CPUs where the
TSCs are not synchronized, such as Opterons, instead of doing a very expensive
HPET read.

Signed-off-by: Suleiman Souhlal <[email protected]>
---
arch/x86_64/kernel/process.c     |   19 ++++++
arch/x86_64/kernel/setup64.c     |    3 +
arch/x86_64/kernel/time.c        |   39 ++++++++---
arch/x86_64/kernel/vmlinux.lds.S |    6 +-
arch/x86_64/kernel/vsyscall.c    |  131 ++++++++++++++++++++++++++++++++------
include/asm-x86_64/pda.h         |    3 +
include/asm-x86_64/proto.h | 1 include/asm-x86_64/timex.h | 2 +
include/asm-x86_64/vsyscall.h    |   12 +++
include/linux/hrtimer.h          |    2 +
10 files changed, 185 insertions(+), 33 deletions(-)

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 49f7fac..2af102a 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -51,6 +51,7 @@ #include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
+#include <asm/vsyscall.h>

asmlinkage extern void ret_from_fork(void);

@@ -109,6 +110,10 @@ void exit_idle(void)
 */
static void default_idle(void)
{
+	int cpu;
+
+	cpu = smp_processor_id();
+
	local_irq_enable();

	current_thread_info()->status &= ~TS_POLLING;
@@ -562,6 +567,7 @@ __switch_to(struct task_struct *prev_p, *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	unsigned long rip;

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
@@ -657,6 +663,19 @@ #endif
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
+
+	/*
+	 * If we are switching away from a process in vsyscall, touch
+	 * the vxtime seq lock so that userland is aware that a context switch
+	 * has happened.
+	 */
+	rip = *(unsigned long *)(prev->rsp0 +
+	    offsetof(struct user_regs_struct, rip) - sizeof(struct pt_regs));
+	if (unlikely(rip > VSYSCALL_START) && unlikely(rip < VSYSCALL_END)) {
+		write_seqlock(&vxtime.vx_seq);
+		write_sequnlock(&vxtime.vx_seq);
+	}
+
	return prev_p;
}

diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8c4b80f..a587381 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -23,6 +23,7 @@ #include <asm/i387.h>
#include <asm/percpu.h>
#include <asm/proto.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>

char x86_boot_params[BOOT_PARAM_SIZE] __initdata;

@@ -146,6 +147,8 @@ void pda_init(int cpu)


	pda->irqstackptr += IRQSTACKSIZE-64;
+
+	pda->vxtime = &vxtime.pcpu[cpu];
}
char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 88722f1..7908025 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -118,14 +118,17 @@ unsigned int (*do_gettimeoffset)(void) =

void do_gettimeofday(struct timeval *tv)
{
-	unsigned long seq;
+	unsigned long seq, t;
 	unsigned int sec, usec;
+	struct vxtime_pcpu *pcpu;

+	preempt_disable();
+	pcpu = read_pda(vxtime);
	do {
-		seq = read_seqbegin(&xtime_lock);
+		seq = read_seqbegin(&vxtime.vx_seq);

-		sec = xtime.tv_sec;
-		usec = xtime.tv_nsec / NSEC_PER_USEC;
+		sec = pcpu->tv_sec;
+		usec = pcpu->tv_usec;

/* i386 does some correction here to keep the clock monotonous even when ntpd is fixing drift.
@@ -135,9 +138,11 @@ void do_gettimeofday(struct timeval *tv)
		   be found. Note when you fix it here you need to do the same
		   in arch/x86_64/kernel/vsyscall.c and export all needed
variables in vmlinux.lds. -AK */ - usec += do_gettimeoffset();
-
-	} while (read_seqretry(&xtime_lock, seq));
+		rdtscll(t);
+		usec += (((t - pcpu->last_tsc) *
+		    pcpu->tsc_nsquot) >> NS_SCALE) / NSEC_PER_USEC;
+	} while (read_seqretry(&vxtime.vx_seq, seq));
+	preempt_enable();

	tv->tv_sec = sec + usec / USEC_PER_SEC;
	tv->tv_usec = usec % USEC_PER_SEC;
@@ -624,10 +629,17 @@ #endif
		cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);

		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
-		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
			vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+
+			vxtime.pcpu[freq->cpu].tsc_nsquot =
+			    (NSEC_PER_SEC << NS_SCALE)
+			    / (cpufreq_scale(cpu_khz_ref, ref_freq, freq->new) *
+			    NSEC_PER_USEC);
+			vxtime_update_pcpu();
+		}	
	}
-	
+
	set_cyc2ns_scale(cpu_khz_ref);

	return 0;
@@ -887,6 +899,9 @@ time_cpu_notifier(struct notifier_block
void __init time_init(void)
{
+	char *timename;
+	int i;
+
	if (nohpet)
		vxtime.hpet_address = 0;

@@ -931,13 +946,17 @@ #endif
#ifndef CONFIG_SMP
	time_init_gtod();
#endif
+
+	for (i = 0; i < NR_CPUS; i++)
+		vxtime.pcpu[i].tsc_nsquot = (NSEC_PER_SEC << NS_SCALE)
+		    / (cpu_khz * NSEC_PER_USEC);
}

/*
 * Make an educated guess if the TSC is trustworthy and synchronized
 * over all CPUs.
 */
-__cpuinit int unsynchronized_tsc(void)
+int unsynchronized_tsc(void)
{
#ifdef CONFIG_SMP
	if (apic_is_clustered_box())
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index edb24aa..b1a39d1 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -96,9 +96,6 @@ #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET
  .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
  xtime_lock = VVIRT(.xtime_lock);

-  .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
-  vxtime = VVIRT(.vxtime);
-
  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
  vgetcpu_mode = VVIRT(.vgetcpu_mode);

@@ -119,6 +116,9 @@ #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET
  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }

+  .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
+  vxtime = VVIRT(.vxtime);
+
  . = VSYSCALL_VIRT_ADDR + 4096;

#undef VSYSCALL_ADDR
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index a98b460..873a4e5 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -27,6 +27,8 @@ #include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/getcpu.h>
+#include <linux/smp.h>
+#include <linux/kthread.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
@@ -37,6 +39,9 @@ #include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
+#include <asm/smp.h>
+#include <asm/idle.h>
+#include <asm/proto.h>

#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))

@@ -46,6 +51,11 @@ int __vgetcpu_mode __section_vgetcpu_mod

#include <asm/unistd.h>

+#define	NS_SCALE 10
+#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
+
+long vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache);
+
static __always_inline void timeval_normalize(struct timeval * tv)
{
	time_t __sec;
@@ -57,35 +67,116 @@ static __always_inline void timeval_norm
	}
}

+static inline int _vgetcpu(void)
+{
+	int cpu;
+
+	vgetcpu(&cpu, NULL, NULL);
+	return (cpu);
+}
+
static __always_inline void do_vgettimeofday(struct timeval * tv)
{
	long sequence, t;
	unsigned long sec, usec;
+	int cpu;

	do {
-		sequence = read_seqbegin(&__xtime_lock);
-		
-		sec = __xtime.tv_sec;
-		usec = __xtime.tv_nsec / 1000;
-
-		if (__vxtime.mode != VXTIME_HPET) {
-			t = get_cycles_sync();
-			if (t < __vxtime.last_tsc)
-				t = __vxtime.last_tsc;
-			usec += ((t - __vxtime.last_tsc) *
-				 __vxtime.tsc_quot) >> 32;
-			/* See comment in x86_64 do_gettimeofday. */
-		} else {
-			usec += ((readl((void __iomem *)
-				   fix_to_virt(VSYSCALL_HPET) + 0xf0) -
-				  __vxtime.last) * __vxtime.quot) >> 32;
-		}
-	} while (read_seqretry(&__xtime_lock, sequence));
+		sequence = read_seqbegin(&__vxtime.vx_seq);
+		cpu = _vgetcpu();
+
+		sec = __vxtime.pcpu[cpu].tv_sec;
+		usec = __vxtime.pcpu[cpu].tv_usec;
+		rdtscll(t);
+
+		usec += (((t - __vxtime.pcpu[cpu].last_tsc) *
+		    __vxtime.pcpu[cpu].tsc_nsquot) >> NS_SCALE) / NSEC_PER_USEC;
+		/*
+		 * If we get a context switch while here, the seq lock will
+		 * change, and we'll have to retry.
+		 */
+	} while (read_seqretry(&__vxtime.vx_seq, sequence));
+
+	tv->tv_sec = sec + usec / USEC_PER_SEC;
+	tv->tv_usec = usec % USEC_PER_SEC;
+}
+
+void vxtime_update_pcpu(void)
+{
+	unsigned long flags, offset, seq;
+	struct vxtime_pcpu *pcpu;
+	
+	write_seqlock_irqsave(&vxtime.vx_seq, flags);
+	pcpu = read_pda(vxtime);
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		pcpu->tv_sec = xtime.tv_sec;
+		pcpu->tv_usec = xtime.tv_nsec / 1000;
+		offset = hpet_readl(HPET_COUNTER) - vxtime.last;
+	} while (read_seqretry(&xtime_lock, seq));

-	tv->tv_sec = sec + usec / 1000000;
-	tv->tv_usec = usec % 1000000;
+	pcpu->tv_usec += (offset * vxtime.quot) >> 32;
+	pcpu->last_tsc = get_cycles_sync();
+
+	write_sequnlock_irqrestore(&vxtime.vx_seq, flags);
}

+static void _vxtime_update_pcpu(void *arg)
+{
+	vxtime_update_pcpu();
+}
+
+static int vxtime_periodic(void *arg)
+{
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(1000));
+
+		smp_call_function(_vxtime_update_pcpu, NULL, 1, 1);
+		_vxtime_update_pcpu(NULL);
+	}
+
+	return (0); /* NOTREACHED */
+}
+
+void clock_was_set(void)
+{
+	smp_call_function(_vxtime_update_pcpu, NULL, 1, 1);
+	_vxtime_update_pcpu(NULL);
+}
+
+static int vxtime_pcpu_idle_notify(struct notifier_block *self,
+    unsigned long action, void *arg)
+{
+	if (action == IDLE_END)
+		vxtime_update_pcpu();
+
+	return (0);
+}
+
+static struct notifier_block vxtime_pcpu_idle_nb = {
+	.notifier_call = vxtime_pcpu_idle_notify,
+};
+
+static __init int vxtime_init_pcpu(void)
+{
+	seqlock_init(&vxtime.vx_seq);
+
+	/*
+	 * Don't bother updating the per-cpu data after each HLT
+	 * if we don't need to.
+	 */
+	if (!unsynchronized_tsc())
+		idle_notifier_register(&vxtime_pcpu_idle_nb);
+	
+	kthread_create(vxtime_periodic, NULL, "vxtime_periodic");
+
+	return (0);
+}
+
+core_initcall(vxtime_init_pcpu);
+
/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
static __always_inline void do_get_tz(struct timezone * tz)
{
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index 14996d9..462e254 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -7,6 +7,8 @@ #include <linux/types.h>
#include <linux/cache.h>
#include <asm/page.h>

+struct vxtime_pcpu;
+
/* Per processor datastructure. %gs points to it while the kernel runs */ struct x8664_pda {
	struct task_struct *pcurrent;	/* 0  Current process */
@@ -29,6 +31,7 @@ #endif
	short isidle;
	struct mm_struct *active_mm;
	unsigned apic_timer_irqs;
+	struct vxtime_pcpu *vxtime;
} ____cacheline_aligned_in_smp;

extern struct x8664_pda *_cpu_pda[];
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index e72cfcd..f7821a6 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -49,6 +49,7 @@ extern unsigned long long monotonic_base
extern int sysctl_vsyscall;
extern int nohpet;
extern unsigned long vxtime_hz;
+extern unsigned long hpet_tick;
extern void time_init_gtod(void);

extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2)));
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index b9e5320..91bad25 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -46,4 +46,6 @@ #define ARCH_HAS_READ_CURRENT_TIMER	1

extern struct vxtime_data vxtime;

+void clock_was_set(void);
+
#endif
diff --git a/include/asm-x86_64/vsyscall.h b/include/asm-x86_64/vsyscall.h
index fd452fc..df18d6a 100644
--- a/include/asm-x86_64/vsyscall.h
+++ b/include/asm-x86_64/vsyscall.h
@@ -30,6 +30,13 @@ #define VXTIME_PMTMR	3
#define VGETCPU_RDTSCP	1
#define VGETCPU_LSL	2

+struct vxtime_pcpu {
+	time_t tv_sec;
+	long tv_usec;
+	unsigned long tsc_nsquot;
+	unsigned long last_tsc;
+};
+
struct vxtime_data {
	long hpet_address;	/* HPET base address */
	int last;
@@ -37,6 +44,8 @@ struct vxtime_data {
	long quot;
	long tsc_quot;
	int mode;
+	seqlock_t vx_seq;
+	struct vxtime_pcpu pcpu[NR_CPUS] ____cacheline_aligned;
};

#define hpet_readl(a)           readl((const void __iomem *)fix_to_virt(FIX_HPET_BASE) + a)
@@ -61,7 +70,10 @@ extern int sysctl_vsyscall;

extern void vsyscall_set_cpu(int cpu);

+void vxtime_update_pcpu(void);
+
#define ARCH_HAVE_XTIME_LOCK 1
+#define ARCH_HAVE_CLOCK_WAS_SET 1

#endif /* __KERNEL__ */

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fca9302..7f59619 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -95,12 +95,14 @@ struct hrtimer_base {
	struct lock_class_key lock_key;
};

+#ifndef ARCH_HAVE_CLOCK_WAS_SET
/*
 * clock_was_set() is a NOP for non- high-resolution systems. The
 * time-sorted order guarantees that a timer does not expire early and
 * is expired in the next softirq when the clock was advanced.
 */
#define clock_was_set()		do { } while (0)
+#endif

/* Exported timer functions: */


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux