[PATCH] change global zonelist order on NUMA v2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Changelog from V1 -> V2
- sysctl name is changed to be relaxed_zone_order
- NORMAL->NORMAL->....->DMA->DMA->DMA order (new ordering) is now default.
  NORMAL->DMA->NORMAL->DMA order (old ordering) is optional.
- addes boot opttion to set relaxed_zone_order. ia64 is supported now.
- Added documentation

patch is against 2.6.21-rc7-mm2. tested on ia64 NUMA box. works well.

-Kame
> from hrere

Make zonelist creation policy selectable from sysctl v2.

[Description]
Assume 2 node NUMA, only node(0) has ZONE_DMA.
(ia64's ZONE_DMA is below 4GB...x86_64's ZONE_DMA32)

In this case, current default (node0's) zonelist order is

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(1)"s NORMAL.

This means Node(0)'s DMA will be used before Node(1)'s NORMAL.

This patch changes *default* zone order to

Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

But, if Node(0)'s memory is too small (near or below 4G), Node(0)'s process has
to allocate its memory from Node(1) even if there are free memory in Node(0).
Some applications/uses will dislike this.
This patch adds a knob to change zonelist ordering.

[What this patch adds]

command:
%echo 1 > /proc/sys/vm/relaxed_zone_order

Will rebuild zonelist in following order(old style).

Node(0)'s NORMAL -> Node(0)'s DMA -> Node(0)'s NORMAL.

And you can specify "relaxed_zone_order" boot option if supported by arch.
But this style zonelist can easily cause OOM-Kill because of ZONE_DMA
exhaition. be careful.

command:
echo 0 > /proc/sys/vm/relaxed_zone_order
will rebuild zonelist as
Node(0)'s NORMAL -> Node(1)'s NORMAL -> Node(0)'s DMA.

Added ia64 support and tested on ia64 2-Node NUMA. works well.

Signed-Off-By: KAMEZAWA Hiroyuki <[email protected]>

Index: linux-2.6.21-rc7-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/kernel/sysctl.c
+++ linux-2.6.21-rc7-mm2/kernel/sysctl.c
@@ -80,6 +80,7 @@ extern int sysctl_drop_caches;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
+extern int sysctl_relaxed_zone_order;
 
 #if defined(CONFIG_ADAPTIVE_READAHEAD)
 extern int readahead_ratio;
@@ -893,6 +894,15 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "relaxed_zone_order",
+		.data		= &sysctl_relaxed_zone_order,
+		.maxlen		= sizeof(sysctl_relaxed_zone_order),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_relaxed_zone_order_handler,
+		.strategy	= &sysctl_intvec,
+	},
 #endif
 #if defined(CONFIG_X86_32) || \
    (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c
@@ -2045,7 +2045,7 @@ static int __meminit build_zonelists_nod
 
 #ifdef CONFIG_NUMA
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -2060,7 +2060,7 @@ static int __meminitdata node_load[MAX_N
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -2106,7 +2106,10 @@ static int __meminit find_next_best_node
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+/*
+ * Build zonelists based on node locality.
+ */
+static void build_zonelists_locality_aware(pg_data_t *pgdat)
 {
 	int j, node, local_node;
 	enum zone_type i;
@@ -2155,6 +2158,81 @@ static void __meminit build_zonelists(pg
 	}
 }
 
+/*
+ * Build zonelist based on zone priority.
+ */
+static int node_order[MAX_NUMNODES];
+static void build_zonelists_zone_aware(pg_data_t *pgdat)
+{
+	int i, j, pos, zone_type, node, load;
+	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		zonelist->zones[0] = NULL;
+	}
+	memset(node_order, 0, sizeof(node_order));
+	local_node = pgdat->node_id;
+	load = num_online_nodes();
+	prev_node = local_node;
+	nodes_clear(used_mask);
+	j = 0;
+	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+		if (distance != node_distance(local_node, prev_node))
+			node_load[node] = load;
+		node_order[j++] = node;
+		prev_node = node;
+		load--;
+	}
+	/* calculate node order */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < num_online_nodes(); j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z))
+					zonelist->zones[pos++] = z;
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+int sysctl_relaxed_zone_order = 0;
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	if (sysctl_relaxed_zone_order)
+		build_zonelists_locality_aware(pgdat);
+	else
+		build_zonelists_zone_aware(pgdat);
+}
+
+int sysctl_relaxed_zone_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	int oldval = sysctl_relaxed_zone_order;
+	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (write && (oldval != sysctl_relaxed_zone_order))
+		build_all_zonelists();
+	return 0;
+}
+
+int __init cmdline_parse_relaxed_zone_order(char *p)
+{
+	sysctl_relaxed_zone_order = 1;
+	return 0;
+}
+
 /* Construct the zonelist performance cache - see further mmzone.h */
 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 {
@@ -2222,7 +2300,7 @@ static void __meminit build_zonelist_cac
 #endif	/* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -2233,12 +2311,13 @@ static int __meminit __build_all_zonelis
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
 	} else {
+		memset(node_load, 0, sizeof(node_load));
 		/* we have to stop all cpus to guaranntee there is no user
 		   of zonelist */
 		stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
Index: linux-2.6.21-rc7-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc7-mm2.orig/include/linux/mmzone.h
+++ linux-2.6.21-rc7-mm2/include/linux/mmzone.h
@@ -608,6 +608,11 @@ int sysctl_min_unmapped_ratio_sysctl_han
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
 
+extern int sysctl_relaxed_zone_order_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
+
+extern int cmdline_parse_relaxed_zone_order(char *p);
+
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
 #ifndef numa_node_id
Index: linux-2.6.21-rc7-mm2/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.21-rc7-mm2.orig/Documentation/kernel-parameters.txt
+++ linux-2.6.21-rc7-mm2/Documentation/kernel-parameters.txt
@@ -1500,6 +1500,10 @@ and is between 256 and 4096 characters. 
 			Format: <reboot_mode>[,<reboot_mode2>[,...]]
 			See arch/*/kernel/reboot.c or arch/*/kernel/process.c			
 
+	relaxed_zone_order [KNL,BOOT]
+			give memory allocation priority to locality rather
+			than zone class. See Documentation/sysctl/vm.txt
+
 	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
 
 	reservetop=	[X86-32]
Index: linux-2.6.21-rc7-mm2/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.21-rc7-mm2.orig/Documentation/sysctl/vm.txt
+++ linux-2.6.21-rc7-mm2/Documentation/sysctl/vm.txt
@@ -34,6 +34,7 @@ Currently, these files are in /proc/sys/
 - swap_prefetch
 - readahead_ratio
 - readahead_hit_rate
+- relaxed_zone_order
 
 ==============================================================
 
@@ -275,3 +276,24 @@ Possible values can be:
 The larger value, the more capabilities, with more possible overheads.
 
 The default value is 1.
+
+=============================================================
+
+relaxed_zone_order
+
+This sysctl is only for NUMA.
+This allows you to allocate local memory more aggresively.
+Assume 2 Node NUMA.The kernel memory allocateion order on Node(0)
+is following. relaxed_zone_order=0 in this case.(default)
+==
+Node(0)NORMAL -> Node(1)NORMAL -> Node(0)DMA -> Node(1)DMA(if any)
+==
+If set to relaxed_zone_order=1, This option changes this order to be
+==
+Node(0)NORMAL -> Node(0)DMA -> Node(1)NORMA -> Node(1)DMA
+==
+Then you can use more local memory. But, in this case, ZONE_DMA can be
+used more eagerly than default. Then, OOM-KILL in ZONE_DMA can happen easier.
+
+The default value is 0.
+
Index: linux-2.6.21-rc7-mm2/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/arch/ia64/mm/discontig.c
+++ linux-2.6.21-rc7-mm2/arch/ia64/mm/discontig.c
@@ -27,6 +27,8 @@
 #include <asm/numa.h>
 #include <asm/sections.h>
 
+
+early_param("relaxed_zone_order", cmdline_parse_relaxed_zone_order);
 /*
  * Track per-node information needed to setup the boot memory allocator, the
  * per-node areas, and the real VM.

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux