i386 optimization of the immediate values which uses a movl with code patching
to set/unset the value used to populate the register used as variable source.
Changelog:
- Use text_poke_early with cr0 WP save/restore to patch the bypass. We are doing
non atomic writes to a code region only touched by us (nobody can execute it
since we are protected by the immediate_mutex).
Signed-off-by: Mathieu Desnoyers <[email protected]>
Reviewed-by: Andi Kleen <[email protected]>
Reviewed-by: "H. Peter Anvin" <[email protected]>
Reviewed-by: Chuck Ebbert <[email protected]>
CC: Christoph Hellwig <[email protected]>
---
arch/i386/kernel/Makefile | 1
arch/i386/kernel/immediate.c | 307 +++++++++++++++++++++++++++++++++++++++++++
arch/i386/kernel/traps.c | 8 -
include/asm-i386/immediate.h | 137 +++++++++++++++++++
4 files changed, 449 insertions(+), 4 deletions(-)
Index: linux-2.6-lttng/include/asm-i386/immediate.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/include/asm-i386/immediate.h 2007-09-04 14:05:59.000000000 -0400
@@ -0,0 +1,137 @@
+#ifndef _ASM_I386_IMMEDIATE_H
+#define _ASM_I386_IMMEDIATE_H
+
+/*
+ * Immediate values. i386 architecture optimizations.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <[email protected]>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+struct module;
+
+struct __immediate {
+ long var; /* Pointer to the identifier variable of the
+ * immediate value
+ */
+ long immediate; /*
+ * Pointer to the memory location of the
+ * immediate value within the instruction.
+ */
+ long size; /* Type size. */
+};
+
+/**
+ * immediate_read - read immediate variable
+ * @var: pointer of type immediate_*_t
+ *
+ * Reads the value of @var.
+ * Optimized version of the immediate.
+ * Do not use in __init and __exit functions. Use _immediate_read() instead.
+ * Makes sure the 2 and 4 bytes update will be atomic by aligning the immediate
+ * value. 2 bytes (short) uses a 66H prefix. If size is bigger than 4 bytes,
+ * fall back on a memory read.
+ */
+#define immediate_read(var) \
+ ({ \
+ __typeof__((var)->value) value; \
+ switch (sizeof(value)) { \
+ case 1: \
+ asm ( ".section __immediate, \"a\", @progbits;\n\t" \
+ ".long %1, (0f)+1, 1;\n\t" \
+ ".previous;\n\t" \
+ "0:\n\t" \
+ "mov %2,%0;\n\t" \
+ : "=r" (value) \
+ : "m" ((var)->value), \
+ "i" (0)); \
+ break; \
+ case 2: \
+ asm ( ".section __immediate, \"a\", @progbits;\n\t" \
+ ".long %1, (0f)+2, 2;\n\t" \
+ ".previous;\n\t" \
+ "1:\n\t" \
+ ".align 2;\n\t" \
+ "0:\n\t" \
+ "mov %2,%0;\n\t" \
+ : "=r" (value) \
+ : "m" ((var)->value), \
+ "i" (0)); \
+ break; \
+ case 4: \
+ asm ( ".section __immediate, \"a\", @progbits;\n\t" \
+ ".long %1, (0f)+1, 4;\n\t" \
+ ".previous;\n\t" \
+ "1:\n\t" \
+ ".org (1b)+(3-((1b)%%4)), 0x90;\n\t" \
+ "0:\n\t" \
+ "mov %2,%0;\n\t" \
+ : "=r" (value) \
+ : "m" ((var)->value), \
+ "i" (0)); \
+ break; \
+ default:value = (var)->value; \
+ break; \
+ }; \
+ value; \
+ })
+
+
+/**
+ * immediate_set - set immediate variable (with locking)
+ * @var: pointer of type immediate_*_t
+ * @i: required value
+ *
+ * Sets the value of @var, taking the module_mutex if required by
+ * the architecture.
+ */
+#define immediate_set(var, i) \
+ (var)->value = (i); \
+ immediate_update(1);
+
+/**
+ * _immediate_set - set immediate variable (without locking)
+ * @var: pointer of type immediate_*_t
+ * @i: required value
+ *
+ * Sets the value of @var. Must be called with module_mutex held.
+ */
+#define _immediate_set(var, i) \
+ (var)->value = (i); \
+ immediate_update(0);
+
+/**
+ * immediate_set_early - set immediate variable at early boot
+ * @var: pointer of type immediate_*_t
+ * @i: required value
+ *
+ * Sets the value of @var. Should be used for early boot updates.
+ */
+#define immediate_set_early(var, i) \
+ (var)->value = (i); \
+ immediate_update_early();
+
+/**
+ * immediate_if - if () statement depending on an immediate value
+ * @var: pointer of type immediate_*_t
+ *
+ * Use as an if () statement depending on an immediate value.
+ * Do not use in __init and __exit functions. Use _immediate_if() instead.
+ * Branch depending on an immediate value. Could eventually be optimized further
+ * by improving gcc to give the ability to patch a jump instruction instead of
+ * the value it depends on.
+ */
+#define immediate_if(var) if (unlikely(immediate_read(var)))
+
+/*
+ * Internal update functions.
+ */
+extern void immediate_update(int lock);
+extern void module_immediate_setup(struct module *mod);
+extern void immediate_update_early(void);
+extern int arch_immediate_update(const struct __immediate *immediate);
+extern void arch_immediate_update_early(const struct __immediate *immediate);
+
+#endif /* _ASM_I386_IMMEDIATE_H */
Index: linux-2.6-lttng/arch/i386/kernel/Makefile
===================================================================
--- linux-2.6-lttng.orig/arch/i386/kernel/Makefile 2007-09-04 14:04:48.000000000 -0400
+++ linux-2.6-lttng/arch/i386/kernel/Makefile 2007-09-04 14:05:59.000000000 -0400
@@ -37,6 +37,7 @@ obj-$(CONFIG_MODULES) += module.o
obj-y += sysenter.o vsyscall.o
obj-$(CONFIG_ACPI_SRAT) += srat.o
obj-$(CONFIG_EFI) += efi.o efi_stub.o
+obj-$(CONFIG_IMMEDIATE) += immediate.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o
obj-$(CONFIG_VM86) += vm86.o
Index: linux-2.6-lttng/arch/i386/kernel/immediate.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/arch/i386/kernel/immediate.c 2007-09-04 14:38:12.000000000 -0400
@@ -0,0 +1,307 @@
+/*
+ * Immediate Value - i386 architecture specific code.
+ *
+ * Rationale
+ *
+ * Required because of :
+ * - Erratum 49 fix for Intel PIII.
+ * - Still present on newer processors : Intel Core 2 Duo Processor for Intel
+ * Centrino Duo Processor Technology Specification Update, AH33.
+ * Unsynchronized Cross-Modifying Code Operations Can Cause Unexpected
+ * Instruction Execution Results.
+ *
+ * Permits immediate value modification by XMC with correct serialization.
+ *
+ * Reentrant for NMI and trap handler instrumentation. Permits XMC to a
+ * location that has preemption enabled because it involves no temporary or
+ * reused data structure.
+ *
+ * Quoting Richard J Moore, source of the information motivating this
+ * implementation which differs from the one proposed by Intel which is not
+ * suitable for kernel context (does not support NMI and would require disabling
+ * interrupts on every CPU for a long period) :
+ *
+ * "There is another issue to consider when looking into using probes other
+ * then int3:
+ *
+ * Intel erratum 54 - Unsynchronized Cross-modifying code - refers to the
+ * practice of modifying code on one processor where another has prefetched
+ * the unmodified version of the code. Intel states that unpredictable general
+ * protection faults may result if a synchronizing instruction (iret, int,
+ * int3, cpuid, etc ) is not executed on the second processor before it
+ * executes the pre-fetched out-of-date copy of the instruction.
+ *
+ * When we became aware of this I had a long discussion with Intel's
+ * microarchitecture guys. It turns out that the reason for this erratum
+ * (which incidentally Intel does not intend to fix) is because the trace
+ * cache - the stream of micorops resulting from instruction interpretation -
+ * cannot guaranteed to be valid. Reading between the lines I assume this
+ * issue arises because of optimization done in the trace cache, where it is
+ * no longer possible to identify the original instruction boundaries. If the
+ * CPU discoverers that the trace cache has been invalidated because of
+ * unsynchronized cross-modification then instruction execution will be
+ * aborted with a GPF. Further discussion with Intel revealed that replacing
+ * the first opcode byte with an int3 would not be subject to this erratum.
+ *
+ * So, is cmpxchg reliable? One has to guarantee more than mere atomicity."
+ *
+ * Overall design
+ *
+ * The algorithm proposed by Intel applies not so well in kernel context: it
+ * would imply disabling interrupts and looping on every CPUs while modifying
+ * the code and would not support instrumentation of code called from interrupt
+ * sources that cannot be disabled.
+ *
+ * Therefore, we use a different algorithm to respect Intel's erratum (see the
+ * quoted discussion above). We make sure that no CPU sees an out-of-date copy
+ * of a pre-fetched instruction by 1 - using a breakpoint, which skips the
+ * instruction that is going to be modified, 2 - issuing an IPI to every CPU to
+ * execute a sync_core(), to make sure that even when the breakpoint is removed,
+ * no cpu could possibly still have the out-of-date copy of the instruction,
+ * modify the now unused 2nd byte of the instruction, and then put back the
+ * original 1st byte of the instruction.
+ *
+ * It has exactly the same intent as the algorithm proposed by Intel, but
+ * it has less side-effects, scales better and supports NMI, SMI and MCE.
+ *
+ * Mathieu Desnoyers <[email protected]>
+ */
+
+#include <linux/notifier.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/immediate.h>
+#include <linux/kdebug.h>
+#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
+
+#include <asm/cacheflush.h>
+
+#define BREAKPOINT_INSTRUCTION 0xcc
+#define BREAKPOINT_INS_LEN 1
+#define NOP_INSTRUCTION 0x90
+#define NR_NOPS 8
+
+static long target_after_int3; /* EIP of the target after the int3 */
+static long bypass_eip; /* EIP of the bypass. */
+static long bypass_after_int3; /* EIP after the end-of-bypass int3 */
+static long after_immediate; /*
+ * EIP where to resume after the
+ * single-stepping.
+ */
+
+/*
+ * Size of the movl instruction (without the immediate value) in bytes.
+ * The 2 bytes load immediate has a 66H prefix, which makes the opcode 2 bytes
+ * wide.
+ */
+static inline size_t _immediate_get_insn_size(long size)
+{
+ switch (size) {
+ case 1: return 1;
+ case 2: return 2;
+ case 4: return 1;
+ default: BUG();
+ };
+}
+
+/*
+ * Internal bypass used during value update. The bypass is skipped by the
+ * function in which it is inserted.
+ * No need to be aligned because we exclude readers from the site during
+ * update.
+ * Layout is:
+ * nop nop nop nop nop nop nop nop int3
+ * The nops are the target replaced by the instruction to single-step.
+ */
+static inline void _immediate_bypass(long *bypassaddr, long *breaknextaddr)
+{
+ asm volatile ( "jmp 2f;\n\t"
+ "0:\n\t"
+ ".space 8, 0x90;\n\t"
+ "1:\n\t"
+ "int3;\n\t"
+ "2:\n\t"
+ "movl $(0b),%0;\n\t"
+ "movl $((1b)+1),%1;\n\t"
+ : "=r" (*bypassaddr),
+ "=r" (*breaknextaddr) : );
+}
+
+static void immediate_synchronize_core(void *info)
+{
+ sync_core(); /* use cpuid to stop speculative execution */
+}
+
+/*
+ * The eip value points right after the breakpoint instruction, in the second
+ * byte of the movl.
+ * Disable preemption in the bypass to make sure no thread will be preempted in
+ * it. We can then use synchronize_sched() to make sure every bypass users have
+ * ended.
+ */
+static int immediate_notifier(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ enum die_val die_val = (enum die_val) val;
+ struct die_args *args = data;
+
+ if (!args->regs || user_mode_vm(args->regs))
+ return NOTIFY_DONE;
+
+ if (die_val == DIE_INT3) {
+ if (args->regs->eip == target_after_int3) {
+ preempt_disable();
+ args->regs->eip = bypass_eip;
+ return NOTIFY_STOP;
+ } else if (args->regs->eip == bypass_after_int3) {
+ args->regs->eip = after_immediate;
+ preempt_enable();
+ return NOTIFY_STOP;
+ }
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block immediate_notify = {
+ .notifier_call = immediate_notifier,
+ .priority = 0x7fffffff, /* we need to be notified first */
+};
+
+
+/**
+ * arch_immediate_update - update one immediate value
+ * @immediate: pointer of type const struct __immediate to update
+ *
+ * Update one immediate value. Must be called with immediate_mutex held.
+ */
+__kprobes int arch_immediate_update(const struct __immediate *immediate)
+{
+ int ret;
+ size_t insn_size = _immediate_get_insn_size(immediate->size);
+ long insn = immediate->immediate - insn_size;
+ unsigned long cr0;
+
+#ifdef CONFIG_KPROBES
+ /*
+ * Fail if a kprobe has been set on this instruction.
+ * (TODO: we could eventually do better and modify all the (possibly
+ * nested) kprobes for this site if kprobes had an API for this.
+ */
+ if (unlikely(*(unsigned char*)insn == BREAKPOINT_INSTRUCTION)) {
+ printk(KERN_WARNING "Immediate value in conflict with kprobe. "
+ "Variable at %p, "
+ "instruction at %p, size %lu\n",
+ (void*)immediate->immediate,
+ (void*)immediate->var, immediate->size);
+ return -EBUSY;
+ }
+#endif
+
+ /*
+ * If the variable and the instruction have the same value, there is
+ * nothing to do.
+ */
+ switch (immediate->size) {
+ case 1: if (*(uint8_t*)immediate->immediate
+ == *(uint8_t*)immediate->var)
+ return 0;
+ break;
+ case 2: if (*(uint16_t*)immediate->immediate
+ == *(uint16_t*)immediate->var)
+ return 0;
+ break;
+ case 4: if (*(uint32_t*)immediate->immediate
+ == *(uint32_t*)immediate->var)
+ return 0;
+ break;
+ default:return -EINVAL;
+ }
+
+ _immediate_bypass(&bypass_eip, &bypass_after_int3);
+
+ after_immediate = immediate->immediate + immediate->size;
+
+ /*
+ * Using the _early variants because nobody is executing the
+ * bypass code while we patch it. It is protected by the
+ * immediate_mutex. Since we modify the instructions non atomically (for
+ * nops), we have to use the _early variant.
+ * We must however deal with the WP flag in cr0 by ourself.
+ */
+ kernel_wp_save(cr0);
+ text_poke_early((void*)bypass_eip, (void*)insn,
+ insn_size + immediate->size);
+ /*
+ * Fill the rest with nops.
+ */
+ text_set_early((void*)(bypass_eip + insn_size + immediate->size),
+ NOP_INSTRUCTION,
+ NR_NOPS - immediate->size - insn_size);
+ kernel_wp_restore(cr0);
+
+ target_after_int3 = insn + BREAKPOINT_INS_LEN;
+ /* register_die_notifier has memory barriers */
+ register_die_notifier(&immediate_notify);
+ /* The breakpoint will single-step the bypass */
+ text_set((void*)insn, BREAKPOINT_INSTRUCTION, 1);
+ wmb();
+ /*
+ * Execute serializing instruction on each CPU.
+ * Acts as a memory barrier.
+ */
+ ret = on_each_cpu(immediate_synchronize_core, NULL, 1, 1);
+ BUG_ON(ret != 0);
+
+ text_poke((void*)(insn + insn_size), (void*)immediate->var,
+ immediate->size);
+ wmb();
+ text_set((void*)insn, *(char*)bypass_eip, 1);
+ /*
+ * Wait for all int3 handlers to end
+ * (interrupts are disabled in int3).
+ * This CPU is clearly not in a int3 handler,
+ * because int3 handler is not preemptible and
+ * there cannot be any more int3 handler called
+ * for this site, because we placed the original
+ * instruction back.
+ * synchronize_sched has memory barriers.
+ */
+ synchronize_sched();
+ unregister_die_notifier(&immediate_notify);
+ /* unregister_die_notifier has memory barriers */
+ return 0;
+}
+
+/**
+ * arch_immediate_update_early - update one immediate value at boot time
+ * @immediate: pointer of type const struct __immediate to update
+ *
+ * Update one immediate value at boot time.
+ */
+void __init arch_immediate_update_early(const struct __immediate *immediate)
+{
+ /*
+ * If the variable and the instruction have the same value, there is
+ * nothing to do.
+ */
+ switch (immediate->size) {
+ case 1: if (*(uint8_t*)immediate->immediate
+ == *(uint8_t*)immediate->var)
+ return;
+ break;
+ case 2: if (*(uint16_t*)immediate->immediate
+ == *(uint16_t*)immediate->var)
+ return;
+ break;
+ case 4: if (*(uint32_t*)immediate->immediate
+ == *(uint32_t*)immediate->var)
+ return;
+ break;
+ default:return;
+ }
+ memcpy((void*)immediate->immediate, (void*)immediate->var,
+ immediate->size);
+}
Index: linux-2.6-lttng/arch/i386/kernel/traps.c
===================================================================
--- linux-2.6-lttng.orig/arch/i386/kernel/traps.c 2007-09-04 14:04:48.000000000 -0400
+++ linux-2.6-lttng/arch/i386/kernel/traps.c 2007-09-04 14:45:22.000000000 -0400
@@ -601,7 +601,7 @@ fastcall void do_##name(struct pt_regs *
}
DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip)
-#ifndef CONFIG_KPROBES
+#if !defined(CONFIG_KPROBES) && !defined(CONFIG_IMMEDIATE)
DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
#endif
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
@@ -843,14 +843,14 @@ void restart_nmi(void)
acpi_nmi_enable();
}
-#ifdef CONFIG_KPROBES
+#if defined(CONFIG_KPROBES) || defined(CONFIG_IMMEDIATE)
fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
- /* This is an interrupt gate, because kprobes wants interrupts
- disabled. Normal trap handlers don't. */
+ /* This is an interrupt gate, because kprobes and immediate values wants
+ * interrupts disabled. Normal trap handlers don't. */
restore_interrupts(regs);
do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
}
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
[Index of Archives]
[Kernel Newbies]
[Netfilter]
[Bugtraq]
[Photo]
[Stuff]
[Gimp]
[Yosemite News]
[MIPS Linux]
[ARM Linux]
[Linux Security]
[Linux RAID]
[Video 4 Linux]
[Linux for the blind]
[Linux Resources]