[PATCH 7/10] x86_64 implementation of cpu bulk removal

x86_64-specific implementation of CPU bulk removal: add the config option
and make __cpu_die()/__cpu_disable() take a cpumask_t.
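
The subtle part is the rendezvous in the new __cpu_disable(): the first
CPU in remove_mask acts as master, releases the other CPUs through
disable_cpu_start, then spins until every CPU in the mask has set its
bit in cpu_dead_mask (i.e. finished its local teardown) before touching
the shared maps. A standalone userspace sketch of that handshake, with
C11 atomics and pthreads standing in for the kernel primitives (the
-EBUSY error path through cpu_dead_error_mask is elided for brevity):

/*
 * Illustrative sketch only -- not kernel code.  It models the
 * disable_cpu_start / cpu_dead_mask handshake of __cpu_disable()
 * with C11 atomics and pthreads.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int disable_cpu_start;		/* 0:idle, 1:start */
static atomic_ulong cpu_dead_mask;
static const unsigned long remove_mask = 0xe;	/* "cpus" 1..3, not the BSP */

static int cpu_disable(unsigned long cpu)
{
	int master = (cpu == (unsigned long)__builtin_ctzl(remove_mask));

	if (master)
		atomic_store(&disable_cpu_start, 1);	/* release the others */
	else
		while (atomic_load(&disable_cpu_start) == 0)
			;				/* cpu_relax() */

	/* local teardown (clear_local_APIC() etc.) would happen here */

	atomic_fetch_or(&cpu_dead_mask, 1UL << cpu);
	if (!master)
		return 0;

	/* master waits for the whole mask, then cleans the shared maps */
	while (atomic_load(&cpu_dead_mask) != remove_mask)
		;
	printf("cpu %lu (master): mask is down, cleaning shared maps\n", cpu);
	atomic_store(&cpu_dead_mask, 0);
	atomic_store(&disable_cpu_start, 0);
	return 0;
}

static void *thread_fn(void *arg)
{
	cpu_disable((unsigned long)arg);
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];
	unsigned long cpu;

	for (cpu = 1; cpu < NCPUS; cpu++)	/* only cpus in remove_mask */
		pthread_create(&t[cpu], NULL, thread_fn, (void *)cpu);
	for (cpu = 1; cpu < NCPUS; cpu++)
		pthread_join(t[cpu], NULL);
	return 0;
}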

Signed-off-by: Ashok Raj <[email protected]> 
Signed-off-by: Shaohua Li <[email protected]> 
---

 linux-2.6.17-rc3-root/arch/x86_64/Kconfig          |   10 +
 linux-2.6.17-rc3-root/arch/x86_64/kernel/smpboot.c |  125 +++++++++++++++------
 2 files changed, 102 insertions(+), 33 deletions(-)
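
The reworked __cpu_die() below polls the whole mask rather than a single
CPU, clearing each CPU out of the mask as it reaches CPU_DEAD and giving
up on any stragglers after roughly a second. The loop shape, as a
simplified standalone sketch (cpu_state() here is a stand-in for the
kernel's per_cpu(cpu_state, cpu)):

/*
 * Simplified sketch of the mask-based wait in the reworked
 * __cpu_die() below; cpu_state() is a stand-in for the kernel's
 * per_cpu(cpu_state, cpu), and here simply reports every CPU dead.
 */
#include <stdio.h>
#include <unistd.h>

#define NCPUS		4
#define CPU_DEAD	2

static int cpu_state(int cpu) { (void)cpu; return CPU_DEAD; }	/* stub */
static void msleep(int ms) { usleep(ms * 1000); }

static void wait_for_mask_dead(unsigned long mask)
{
	int i, cpu;

	for (i = 0; i < 10 && mask; i++) {
		for (cpu = 0; cpu < NCPUS; cpu++) {
			if ((mask & (1UL << cpu)) &&
			    cpu_state(cpu) == CPU_DEAD) {
				printf("CPU %d is now offline\n", cpu);
				mask &= ~(1UL << cpu);
			}
		}
		if (mask)
			msleep(100);	/* give stragglers ~1s in total */
	}
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (mask & (1UL << cpu))
			printf("CPU %d didn't die...\n", cpu);
}

int main(void)
{
	wait_for_mask_dead(0xe);	/* "cpus" 1..3 */
	return 0;
}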

diff -puN arch/x86_64/Kconfig~x86-64-bulk-cpu-hotplug arch/x86_64/Kconfig
--- linux-2.6.17-rc3/arch/x86_64/Kconfig~x86-64-bulk-cpu-hotplug	2006-05-07 07:46:16.000000000 +0800
+++ linux-2.6.17-rc3-root/arch/x86_64/Kconfig	2006-05-07 07:46:16.000000000 +0800
@@ -369,6 +369,16 @@ config HOTPLUG_CPU
 		can be controlled through /sys/devices/system/cpu/cpu#.
 		Say N if you want to disable CPU hotplug.
 
+config BULK_CPU_REMOVE
+	bool "Support for bulk removal of CPUs (EXPERIMENTAL)"
+	depends on HOTPLUG_CPU && EXPERIMENTAL
+	help
+	  Say Y if you need the ability to remove more than one CPU at a
+	  time. The current mechanism can be inefficient when a NUMA node
+	  is being removed, since it takes CPUs offline one at a time:
+	  interrupts, timers and processes may end up bound to a CPU that
+	  will itself be removed right after the current CPU has been
+	  offlined.
 
 config HPET_TIMER
 	bool
diff -puN arch/x86_64/kernel/smpboot.c~x86-64-bulk-cpu-hotplug arch/x86_64/kernel/smpboot.c
--- linux-2.6.17-rc3/arch/x86_64/kernel/smpboot.c~x86-64-bulk-cpu-hotplug	2006-05-07 07:46:16.000000000 +0800
+++ linux-2.6.17-rc3-root/arch/x86_64/kernel/smpboot.c	2006-05-07 07:46:16.000000000 +0800
@@ -1181,44 +1181,55 @@ void __init smp_cpus_done(unsigned int m
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void remove_siblinginfo(int cpu)
+static void remove_siblinginfo(cpumask_t remove_mask)
 {
 	int sibling;
 	struct cpuinfo_x86 *c = cpu_data;
+	int cpu;
 
-	for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
-		cpu_clear(cpu, cpu_core_map[sibling]);
-		/*
-		 * last thread sibling in this cpu core going down
-		 */
-		if (cpus_weight(cpu_sibling_map[cpu]) == 1)
-			c[sibling].booted_cores--;
-	}
+	for_each_cpu_mask(cpu, remove_mask) {
+		for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
+			cpu_clear(cpu, cpu_core_map[sibling]);
+			/*
+			 * last thread sibling in this cpu core going down
+			 */
+			if (cpus_weight(cpu_sibling_map[cpu]) == 1)
+				c[sibling].booted_cores--;
+		}
 			
-	for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
-		cpu_clear(cpu, cpu_sibling_map[sibling]);
-	cpus_clear(cpu_sibling_map[cpu]);
-	cpus_clear(cpu_core_map[cpu]);
-	phys_proc_id[cpu] = BAD_APICID;
-	cpu_core_id[cpu] = BAD_APICID;
-	cpu_clear(cpu, cpu_sibling_setup_map);
+		for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+			cpu_clear(cpu, cpu_sibling_map[sibling]);
+		cpus_clear(cpu_sibling_map[cpu]);
+		cpus_clear(cpu_core_map[cpu]);
+		phys_proc_id[cpu] = BAD_APICID;
+		cpu_core_id[cpu] = BAD_APICID;
+		cpu_clear(cpu, cpu_sibling_setup_map);
+	}
 }
 
-void remove_cpu_from_maps(void)
+void remove_cpu_from_maps(cpumask_t remove_mask)
 {
-	int cpu = smp_processor_id();
+	int cpu;
 
-	cpu_clear(cpu, cpu_callout_map);
-	cpu_clear(cpu, cpu_callin_map);
-	clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
-	clear_node_cpumask(cpu);
+	for_each_cpu_mask(cpu, remove_mask) {
+		cpu_clear(cpu, cpu_callout_map);
+		cpu_clear(cpu, cpu_callin_map);
+		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+		clear_node_cpumask(cpu);
+	}
 }
 
+static cpumask_t cpu_dead_mask = CPU_MASK_NONE;
+static cpumask_t cpu_dead_error_mask = CPU_MASK_NONE;
+static atomic_t disable_cpu_start = ATOMIC_INIT(0); /* 1:start, 2:error */
 int __cpu_disable(cpumask_t remove_mask)
 {
 	int cpu = smp_processor_id();
+	int master = 0;
 
-	BUG_ON(cpus_weight(remove_mask) != 1);
+	/* are we the master cpu? */
+	if (first_cpu(remove_mask) == cpu)
+		master = 1;
 	/*
 	 * Perhaps use cpufreq to drop frequency, but that could go
 	 * into generic code.
@@ -1227,8 +1238,28 @@ int __cpu_disable(cpumask_t remove_mask)
 	 * interrupts only being able to be serviced by the BSP.
 	 * Especially so if we're not using an IOAPIC	-zwane
 	 */
-	if (cpu == 0)
-		return -EBUSY;
+	if (master) {
+		if (cpu_isset(0, remove_mask)) {
+			/* report the error */
+			atomic_set(&disable_cpu_start, 2);
+			smp_wmb(); /* set error first */
+			cpu_set(cpu, cpu_dead_error_mask);
+			while (!cpus_equal(cpu_dead_error_mask, remove_mask))
+				cpu_relax();
+			atomic_set(&disable_cpu_start, 0);
+			cpus_clear(cpu_dead_error_mask);
+			return -EBUSY;
+		}
+		smp_mb();
+		atomic_set(&disable_cpu_start, 1);
+	} else {
+		while (atomic_read(&disable_cpu_start) == 0)
+			cpu_relax();
+		if (atomic_read(&disable_cpu_start) == 2) {
+			cpu_set(cpu, cpu_dead_error_mask);
+			return -EBUSY;
+		}
+	}
 
 	clear_local_APIC();
 
@@ -1242,11 +1273,31 @@ int __cpu_disable(cpumask_t remove_mask)
 	mdelay(1);
 
 	local_irq_disable();
-	remove_siblinginfo(cpu);
 
-	/* It's now safe to remove this processor from the online map */
+	/* It's now safe to remove cpus from the online map */
 	cpu_clear(cpu, cpu_online_map);
-	remove_cpu_from_maps();
+
+	/*
+	 * By this point, all teardown that the master CPU cannot do
+	 * remotely is finished; what remains needs no local CPU access.
+	 */
+	smp_mb();
+	cpu_set(cpu, cpu_dead_mask);
+	if (!master)
+		return 0;
+	/* master does cleanup */
+	while (!cpus_equal(cpu_dead_mask, remove_mask))
+		cpu_relax();
+
+	remove_siblinginfo(remove_mask);
+	remove_cpu_from_maps(remove_mask);
+
+	cpus_clear(cpu_dead_mask);
+	atomic_set(&disable_cpu_start, 0);
+	/*
+	 * Note: this relies on fixup_irqs() not needing access to other
+	 * CPUs' local registers; otherwise each CPU would have to call it.
+	 */
 	fixup_irqs(cpu_online_map);
 	return 0;
 }
@@ -1255,17 +1306,25 @@ void __cpu_die(cpumask_t remove_mask)
 {
 	/* We don't do anything here: idle task is faking death itself. */
 	unsigned int i;
-	int cpu = first_cpu(remove_mask);
+	int cpu;
 
 	for (i = 0; i < 10; i++) {
 		/* They ack this in play_dead by setting CPU_DEAD */
-		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
-			printk ("CPU %d is now offline\n", cpu);
-			return;
+		for_each_cpu_mask(cpu, remove_mask) {
+			if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+				printk ("CPU %d is now offline\n", cpu);
+				cpu_clear(cpu, remove_mask);
+			}
 		}
+		if (cpus_empty(remove_mask))
+			break;
 		msleep(100);
 	}
- 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+
+	if (!cpus_empty(remove_mask)) {
+		for_each_cpu_mask(cpu, remove_mask)
+			printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+	}
 }
 
 __init int setup_additional_cpus(char *s)
_