[Patch] 0/5 in support of hot-add memory x86_64

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello All,

 I would like to address the current state of hot-add memory on x86_64.
Broken is the word: neither add path is functional at this time.  This
patch, along with the rest of the series, fixes up SPARSEMEM hot-add on
x86_64.  RESERVE_HOTADD has been in a non-building state (since 2.6.18
or so), and I will have patches for it next.

 My hardware is an x86_64, ACPI hot-add enabled NUMA system (a 2-node
IBM x460).

There are 4 main issues that I have addressed:

1.  Merge MEMORY_HOTPLUG and RESERVE_HOTADD in srat.c.  Both add paths
need information from the SRAT about node locality (RESERVE_HOTADD to
reserve space, and MEMORY_HOTPLUG to figure out which node memory belongs
to at add time).  I create a real config option for RESERVE_HOTADD in
Kconfig and share the code path.

2/3.  My hardware does not implement the optional passing of the pxm
(node) information with the add-memory event, which the current ACPI
driver expects.  I implement an arch_find_node call that returns the
node the memory belongs to.  It uses the information saved from the SRAT.

(Patches 1, 2 and 3 allow intelligent NUMA additions of memory.  Presently
the ACPI memory driver hands down -1 and things break.)

4.  Kernel mapping fixup.  The kernel mapping (page table) code is
broken on x86_64 for memory that is not aligned on a pud entry (pmd page).
I have fixed this limitation.

5. ACPI fix needed for my hardware.  

There is another issue, but I believe Kame already has a fix.  It involves
sysfs on-lining large amounts of memory at one time instead of just one
section's worth.

All comments welcome.  

keith mannthey <[email protected]>
Linux Technology Center IBM
diff -urN orig/arch/x86_64/Kconfig work/arch/x86_64/Kconfig
--- orig/arch/x86_64/Kconfig	2006-07-28 13:57:35.000000000 -0400
+++ work/arch/x86_64/Kconfig	2006-07-28 21:17:03.000000000 -0400
@@ -349,6 +349,17 @@
 
 source "mm/Kconfig"
 
+config RESERVE_HOTADD
+	bool "Reserve based hot-add memory" 
+	depends on DISCONTIGMEM && X86_64_ACPI_NUMA
+	default n
+	help
+	  This allows a reserve-based approach to hot-add memory.
+	  Select this option if your hardware supports hot-add memory
+	  via ACPI and the SRAT table.  You will still need to build in
+	  the ACPI hot plug memory driver to do the actual add.
+
+
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
 	depends on NUMA
diff -urN orig/arch/x86_64/mm/srat.c work/arch/x86_64/mm/srat.c
--- orig/arch/x86_64/mm/srat.c	2006-07-28 13:57:35.000000000 -0400
+++ work/arch/x86_64/mm/srat.c	2006-07-28 21:17:23.000000000 -0400
@@ -21,22 +21,13 @@
 #include <asm/numa.h>
 #include <asm/e820.h>
 
-#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
-	defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
-		&& !defined(CONFIG_MEMORY_HOTPLUG)
-#define RESERVE_HOTADD 1
-#endif
-
 static struct acpi_table_slit *acpi_slit;
 
 static nodemask_t nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
-static int found_add_area __initdata;
+static int reserve_add_area __initdata;
 int hotadd_percent __initdata = 0;
-#ifndef RESERVE_HOTADD
-#define hotadd_percent 0	/* Ignore all settings */
-#endif
 
 /* Too small nodes confuse the VM badly. Usually they result
    from BIOS bugs. */
@@ -66,7 +57,7 @@
 {
 	struct bootnode *nd = &nodes[i];
 
-	if (found_add_area)
+	if (reserve_add_area)
 		return;
 
 	if (nd->start < start) {
@@ -86,7 +77,7 @@
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	found_add_area = 0;
+	reserve_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
 	for (i = 0; i < MAX_NUMNODES; i++)
@@ -157,7 +148,7 @@
 	       pxm, pa->apic_id, node);
 }
 
-#ifdef RESERVE_HOTADD
+#ifdef CONFIG_RESERVE_HOTADD
 /*
  * Protect against too large hotadd areas that would fill up memory.
  */
@@ -200,15 +191,38 @@
 	return 1;
 }
 
+int update_end_of_memory(unsigned long end) 
+{
+	reserve_add_area = 1;
+ 	if ((end >> PAGE_SHIFT) > end_pfn)
+		end_pfn = end >> PAGE_SHIFT;
+	return 1;
+}
+
+static inline int save_add_info(void) 
+{
+	return hotadd_percent > 0;
+}
+#else /* !CONFIG_RESERVE_HOTADD */
+int update_end_of_memory(unsigned long end) {return 0;}
+static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline int save_add_info(void) {return 1;}
+#else /* !CONFIG_MEMORY_HOTPLUG */
+static inline int save_add_info(void) {return 0;}
+#endif 
+#endif 
+ 
 /*
- * It is fine to add this area to the nodes data it will be used later
+ * Update nodes_add and decide whether to save info in the real nodes structure
+ * Both MEMORY_HOTPLUG and RESERVE_HOTADD need the nodes_add info
  * This code supports one contigious hot add area per node.
  */
 static int reserve_hotadd(int node, unsigned long start, unsigned long end)
 {
 	unsigned long s_pfn = start >> PAGE_SHIFT;
 	unsigned long e_pfn = end >> PAGE_SHIFT;
-	int changed = 0;
+	int ret = 0 , changed = 0;
 	struct bootnode *nd = &nodes_add[node];
 
 	/* I had some trouble with strange memory hotadd regions breaking
@@ -235,7 +249,6 @@
 
 	/* Looks good */
 
- 	found_add_area = 1;
 	if (nd->start == nd->end) {
  		nd->start = start;
  		nd->end = end;
@@ -252,15 +265,14 @@
 		if (!changed)
 			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
  	}
-
- 	if ((nd->end >> PAGE_SHIFT) > end_pfn)
- 		end_pfn = nd->end >> PAGE_SHIFT;
+	
+	ret = update_end_of_memory(nd->end);
 
 	if (changed)
-	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-	return 0;
+	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",  
+							nd->start, nd->end);
+	return ret;
 }
-#endif
 
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 void __init
@@ -279,7 +291,7 @@
 	}
 	if (ma->flags.enabled == 0)
 		return;
- 	if (ma->flags.hot_pluggable && hotadd_percent == 0)
+ 	if (ma->flags.hot_pluggable && !save_add_info())
 		return;
 	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
 	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
@@ -318,15 +330,13 @@
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
 
-#ifdef RESERVE_HOTADD
- 	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
-		/* Ignore hotadd region. Undo damage */
+ 	if (ma->flags.hot_pluggable && !reserve_hotadd(node, start, end)) {
+		/* Don't reserve hotadd region. Undo damage */
 		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
 		*nd = oldnode;
 		if ((nd->start | nd->end) == 0)
 			node_clear(node, nodes_parsed);
 	}
-#endif
 }
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -342,7 +352,6 @@
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
 		pxmram -= e820_hole_size(s, e);
-		pxmram -= nodes_add[i].end - nodes_add[i].start;
 		if ((long)pxmram < 0)
 			pxmram = 0;
 	}
@@ -422,7 +431,7 @@
 
 void __init srat_reserve_add_area(int nodeid)
 {
-	if (found_add_area && nodes_add[nodeid].end) {
+	if (reserve_add_area && nodes_add[nodeid].end) {
 		u64 total_mb;
 
 		printk(KERN_INFO "SRAT: Reserving hot-add memory space "

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux