[patch] lock validator: reenable NMIs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Subject: lock validator: reenable NMIs
From: Ingo Molnar <[email protected]>

- not touch the irqtrace state in NMI handlers
- re-enable NMIs and oprofile on i386 and x86_64

i have tested this with high-rate (10 KHz per CPU) NMI load on the 
kernel, and while previously we'd quickly get the irqtrace state 
confused and a lock validator bug message, with this patch applied it's 
now working fine.

(note that on x86_64 we only build one variant of the paranoid-exit
codepath if IRQTRACE is disabled - so there should be no difference
in code generated, as long as the validator is disabled)

Signed-off-by: Ingo Molnar <[email protected]>
---
 arch/i386/kernel/nmi.c       |   10 ---
 arch/i386/oprofile/Kconfig   |    2 
 arch/x86_64/kernel/entry.S   |  133 ++++++++++++++++++++++++-------------------
 arch/x86_64/kernel/nmi.c     |   11 ---
 arch/x86_64/oprofile/Kconfig |    2 
 include/linux/hardirq.h      |    4 -
 kernel/lockdep.c             |   16 ++---
 7 files changed, 89 insertions(+), 89 deletions(-)

Index: linux/arch/i386/kernel/nmi.c
===================================================================
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -747,16 +747,6 @@ void setup_apic_nmi_watchdog (void *unus
 {
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
-#ifdef CONFIG_LOCKDEP
-	/*
-	 * The NMI watchdog uses spinlocks (notifier chains, etc.),
-	 * so it's not lockdep-safe:
-	 */
-	nmi_watchdog = NMI_NONE;
-	printk("lockdep: disabled NMI watchdog.\n");
-
-	return;
-#endif
 	/* only support LOCAL and IO APICs for now */
 	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
 	    (nmi_watchdog != NMI_IO_APIC))
Index: linux/arch/i386/oprofile/Kconfig
===================================================================
--- linux.orig/arch/i386/oprofile/Kconfig
+++ linux/arch/i386/oprofile/Kconfig
@@ -7,7 +7,7 @@ config PROFILING
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING && !LOCKDEP
+	depends on PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux/arch/x86_64/kernel/entry.S
===================================================================
--- linux.orig/arch/x86_64/kernel/entry.S
+++ linux/arch/x86_64/kernel/entry.S
@@ -718,7 +718,7 @@ END(spurious_interrupt)
 
 	/* error code is on the stack already */
 	/* handle NMI like exceptions that can happen everywhere */
-	.macro paranoidentry sym, ist=0
+	.macro paranoidentry sym, ist=0, irqtrace=1
 	SAVE_ALL
 	cld
 	movl $1,%ebx
@@ -743,9 +743,73 @@ END(spurious_interrupt)
 	addq	$EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
 	.endif
 	cli
+	.if \irqtrace
 	TRACE_IRQS_OFF
+	.endif
 	.endm
-	
+
+	/*
+ 	 * "Paranoid" exit path from exception stack.
+  	 * Paranoid because this is used by NMIs and cannot take
+	 * any kernel state for granted.
+	 * We don't do kernel preemption checks here, because only
+	 * NMI should be common and it does not enable IRQs and
+	 * cannot get reschedule ticks.
+	 *
+	 * "trace" is 0 for the NMI handler only, because irq-tracing
+	 * is fundamentally NMI-unsafe. (we cannot change the soft and
+	 * hard flags at once, atomically)
+	 */
+	.macro paranoidexit trace=1
+	/* ebx:	no swapgs flag */
+paranoid_exit\trace:
+	testl %ebx,%ebx				/* swapgs needed? */
+	jnz paranoid_restore\trace
+	testl $3,CS(%rsp)
+	jnz   paranoid_userspace\trace
+paranoid_swapgs\trace:
+	TRACE_IRQS_IRETQ 0
+	swapgs
+paranoid_restore\trace:
+	RESTORE_ALL 8
+	iretq
+paranoid_userspace\trace:
+	GET_THREAD_INFO(%rcx)
+	movl threadinfo_flags(%rcx),%ebx
+	andl $_TIF_WORK_MASK,%ebx
+	jz paranoid_swapgs\trace
+	movq %rsp,%rdi			/* &pt_regs */
+	call sync_regs
+	movq %rax,%rsp			/* switch stack for scheduling */
+	testl $_TIF_NEED_RESCHED,%ebx
+	jnz paranoid_schedule\trace
+	movl %ebx,%edx			/* arg3: thread flags */
+	.if \trace
+	TRACE_IRQS_ON
+	.endif
+	sti
+	xorl %esi,%esi 			/* arg2: oldset */
+	movq %rsp,%rdi 			/* arg1: &pt_regs */
+	call do_notify_resume
+	cli
+	.if \trace
+	TRACE_IRQS_OFF
+	.endif
+	jmp paranoid_userspace\trace
+paranoid_schedule\trace:
+	.if \trace
+	TRACE_IRQS_ON
+	.endif
+	sti
+	call schedule
+	cli
+	.if \trace
+	TRACE_IRQS_OFF
+	.endif
+	jmp paranoid_userspace\trace
+	CFI_ENDPROC
+	.endm
+
 /*
  * Exception entry point. This expects an error code/orig_rax on the stack
  * and the exception handler in %rax.	
@@ -974,8 +1038,7 @@ KPROBE_ENTRY(debug)
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8		
 	paranoidentry do_debug, DEBUG_STACK
-	jmp paranoid_exit
-	CFI_ENDPROC
+	paranoidexit
 END(debug)
 	.previous .text
 
@@ -984,54 +1047,12 @@ KPROBE_ENTRY(nmi)
 	INTR_FRAME
 	pushq $-1
 	CFI_ADJUST_CFA_OFFSET 8
-	paranoidentry do_nmi
-	/*
- 	 * "Paranoid" exit path from exception stack.
-  	 * Paranoid because this is used by NMIs and cannot take
-	 * any kernel state for granted.
-	 * We don't do kernel preemption checks here, because only
-	 * NMI should be common and it does not enable IRQs and
-	 * cannot get reschedule ticks.
-	 */
-	/* ebx:	no swapgs flag */
-paranoid_exit:
-	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_restore
-	testl $3,CS(%rsp)
-	jnz   paranoid_userspace
-paranoid_swapgs:	
-	TRACE_IRQS_IRETQ 0
-	swapgs
-paranoid_restore:	
-	RESTORE_ALL 8
-	iretq
-paranoid_userspace:	
-	GET_THREAD_INFO(%rcx)
-	movl threadinfo_flags(%rcx),%ebx
-	andl $_TIF_WORK_MASK,%ebx
-	jz paranoid_swapgs
-	movq %rsp,%rdi			/* &pt_regs */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack for scheduling */
-	testl $_TIF_NEED_RESCHED,%ebx
-	jnz paranoid_schedule
-	movl %ebx,%edx			/* arg3: thread flags */
-	TRACE_IRQS_ON
-	sti
-	xorl %esi,%esi 			/* arg2: oldset */
-	movq %rsp,%rdi 			/* arg1: &pt_regs */
-	call do_notify_resume
-	cli
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
-paranoid_schedule:
-	TRACE_IRQS_ON
-	sti
-	call schedule
-	cli
-	TRACE_IRQS_OFF
-	jmp paranoid_userspace
-	CFI_ENDPROC
+	paranoidentry do_nmi, 0, 0
+#ifdef CONFIG_TRACE_IRQFLAGS
+	paranoidexit 0
+#else
+	paranoidexit 1
+#endif
 END(nmi)
 	.previous .text
 
@@ -1040,7 +1061,7 @@ KPROBE_ENTRY(int3)
  	pushq $0
  	CFI_ADJUST_CFA_OFFSET 8
  	paranoidentry do_int3, DEBUG_STACK
- 	jmp paranoid_exit
+ 	jmp paranoid_exit1
  	CFI_ENDPROC
 END(int3)
 	.previous .text
@@ -1069,7 +1090,7 @@ END(reserved)
 ENTRY(double_fault)
 	XCPT_FRAME
 	paranoidentry do_double_fault
-	jmp paranoid_exit
+	jmp paranoid_exit1
 	CFI_ENDPROC
 END(double_fault)
 
@@ -1085,7 +1106,7 @@ END(segment_not_present)
 ENTRY(stack_segment)
 	XCPT_FRAME
 	paranoidentry do_stack_segment
-	jmp paranoid_exit
+	jmp paranoid_exit1
 	CFI_ENDPROC
 END(stack_segment)
 
@@ -1113,7 +1134,7 @@ ENTRY(machine_check)
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8	
 	paranoidentry do_machine_check
-	jmp paranoid_exit
+	jmp paranoid_exit1
 	CFI_ENDPROC
 END(machine_check)
 #endif
Index: linux/arch/x86_64/kernel/nmi.c
===================================================================
--- linux.orig/arch/x86_64/kernel/nmi.c
+++ linux/arch/x86_64/kernel/nmi.c
@@ -674,17 +674,6 @@ void setup_apic_nmi_watchdog(void *unuse
 {
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 
-#ifdef CONFIG_LOCKDEP
-	/*
-	 * The NMI watchdog uses spinlocks (notifier chains, etc.),
-	 * so it's not lockdep-safe:
-	 */
-	nmi_watchdog = NMI_NONE;
-	printk("lockdep: disabled NMI watchdog.\n");
-
-	return;
-#endif
-
 	/* only support LOCAL and IO APICs for now */
 	if ((nmi_watchdog != NMI_LOCAL_APIC) &&
 	    (nmi_watchdog != NMI_IO_APIC))
Index: linux/arch/x86_64/oprofile/Kconfig
===================================================================
--- linux.orig/arch/x86_64/oprofile/Kconfig
+++ linux/arch/x86_64/oprofile/Kconfig
@@ -7,7 +7,7 @@ config PROFILING
 
 config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
-	depends on PROFILING && !LOCKDEP
+	depends on PROFILING
 	help
 	  OProfile is a profiling system capable of profiling the
 	  whole system, include the kernel, kernel modules, libraries,
Index: linux/include/linux/hardirq.h
===================================================================
--- linux.orig/include/linux/hardirq.h
+++ linux/include/linux/hardirq.h
@@ -122,7 +122,7 @@ static inline void account_system_vtime(
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		irq_enter()
-#define nmi_exit()		__irq_exit()
+#define nmi_enter()		do { lockdep_off(); irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -1869,7 +1869,7 @@ void trace_hardirqs_on(void)
 	struct task_struct *curr = current;
 	unsigned long ip;
 
-	if (unlikely(!debug_locks))
+	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
 	if (DEBUG_WARN_ON(unlikely(!early_boot_irqs_enabled)))
@@ -1916,7 +1916,7 @@ void trace_hardirqs_off(void)
 {
 	struct task_struct *curr = current;
 
-	if (unlikely(!debug_locks))
+	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
 	if (DEBUG_WARN_ON(!irqs_disabled()))
@@ -2488,16 +2488,16 @@ void lock_acquire(struct lockdep_map *lo
 {
 	unsigned long flags;
 
+	if (unlikely(current->lockdep_recursion))
+		return;
+
 	raw_local_irq_save(flags);
 	check_flags(flags);
 
-	if (unlikely(current->lockdep_recursion))
-		goto out;
 	current->lockdep_recursion = 1;
 	__lock_acquire(lock, subtype, trylock, read, check,
 		       irqs_disabled_flags(flags), ip);
 	current->lockdep_recursion = 0;
-out:
 	raw_local_irq_restore(flags);
 }
 
@@ -2507,14 +2507,14 @@ void lock_release(struct lockdep_map *lo
 {
 	unsigned long flags;
 
+	if (unlikely(current->lockdep_recursion))
+		return;
+
 	raw_local_irq_save(flags);
 	check_flags(flags);
-	if (unlikely(current->lockdep_recursion))
-		goto out;
 	current->lockdep_recursion = 1;
 	__lock_release(lock, nested, ip);
 	current->lockdep_recursion = 0;
-out:
 	raw_local_irq_restore(flags);
 }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux