Re: [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On (28/09/07 11:37), Lee Schermerhorn didst pronounce:
> Still need to fix 'nodes_intersect' -> 'nodes_intersects'.  See below.
> 
> On Fri, 2007-09-28 at 15:25 +0100, Mel Gorman wrote:
> > The MPOL_BIND policy creates a zonelist that is used for allocations belonging
> > to that thread that can use the policy_zone. As the per-node zonelist is
> > already being filtered based on a zone id, this patch adds a version of
> > __alloc_pages() that takes a nodemask for further filtering. This eliminates
> > the need for MPOL_BIND to create a custom zonelist. A positive benefit of
> > this is that allocations using MPOL_BIND now use the local-node-ordered
> > zonelist instead of a custom node-id-ordered zonelist.
> > 
> > Signed-off-by: Mel Gorman <[email protected]>
> > Acked-by: Christoph Lameter <[email protected]>
> > ---
> > 
> >  fs/buffer.c               |    2 
> >  include/linux/cpuset.h    |    4 -
> >  include/linux/gfp.h       |    4 +
> >  include/linux/mempolicy.h |    3 
> >  include/linux/mmzone.h    |   58 +++++++++++++---
> >  kernel/cpuset.c           |   18 +----
> >  mm/mempolicy.c            |  144 +++++++++++------------------------------
> >  mm/page_alloc.c           |   40 +++++++----
> >  8 files changed, 131 insertions(+), 142 deletions(-)
> > 
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c	2007-09-28 15:49:39.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c	2007-09-28 15:49:57.000000000 +0100
> > @@ -376,7 +376,7 @@ static void free_more_memory(void)
> >  
> >  	for_each_online_node(nid) {
> >  		zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
> > -						gfp_zone(GFP_NOFS));
> > +						NULL, gfp_zone(GFP_NOFS));
> >  		if (zrefs->zone)
> >  			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
> >  						GFP_NOFS);
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h	2007-09-27 14:41:05.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h	2007-09-28 15:49:57.000000000 +0100
> > @@ -28,7 +28,7 @@ void cpuset_init_current_mems_allowed(vo
> >  void cpuset_update_task_memory_state(void);
> >  #define cpuset_nodes_subset_current_mems_allowed(nodes) \
> >  		nodes_subset((nodes), current->mems_allowed)
> > -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
> > +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
> >  
> >  extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
> >  extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
> > @@ -103,7 +103,7 @@ static inline void cpuset_init_current_m
> >  static inline void cpuset_update_task_memory_state(void) {}
> >  #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
> >  
> > -static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> > +static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
> >  {
> >  	return 1;
> >  }
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h	2007-09-28 15:49:16.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h	2007-09-28 15:49:57.000000000 +0100
> > @@ -184,6 +184,10 @@ static inline void arch_alloc_page(struc
> >  extern struct page *
> >  FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
> >  
> > +extern struct page *
> > +FASTCALL(__alloc_pages_nodemask(gfp_t, unsigned int,
> > +				struct zonelist *, nodemask_t *nodemask));
> > +
> >  static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
> >  						unsigned int order)
> >  {
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h	2007-09-28 15:48:55.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h	2007-09-28 15:49:57.000000000 +0100
> > @@ -64,9 +64,8 @@ struct mempolicy {
> >  	atomic_t refcnt;
> >  	short policy; 	/* See MPOL_* above */
> >  	union {
> > -		struct zonelist  *zonelist;	/* bind */
> >  		short 		 preferred_node; /* preferred */
> > -		nodemask_t	 nodes;		/* interleave */
> > +		nodemask_t	 nodes;		/* interleave/bind */
> >  		/* undefined for default */
> >  	} v;
> >  	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h	2007-09-28 15:49:39.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h	2007-09-28 15:49:57.000000000 +0100
> > @@ -758,47 +758,85 @@ static inline void encode_zoneref(struct
> >  	zoneref->zone_idx = zone_idx(zone);
> >  }
> >  
> > +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
> > +{
> > +#ifdef CONFIG_NUMA
> > +	return node_isset(zonelist_node_idx(zref), *nodes);
> > +#else
> > +	return 1;
> > +#endif /* CONFIG_NUMA */
> > +}
> > +
> >  /* Returns the first zone at or below highest_zoneidx in a zonelist */
> >  static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
> > +					nodemask_t *nodes,
> >  					enum zone_type highest_zoneidx)
> >  {
> >  	struct zoneref *z;
> >  
> >  	/* Find the first suitable zone to use for the allocation */
> >  	z = zonelist->_zonerefs;
> > -	while (zonelist_zone_idx(z) > highest_zoneidx)
> > -		z++;
> > +	if (likely(nodes == NULL))
> > +		while (zonelist_zone_idx(z) > highest_zoneidx)
> > +			z++;
> > +	else
> > +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> > +				(z->zone && !zref_in_nodemask(z, nodes)))
> > +			z++;
> >  
> >  	return z;
> >  }
> >  
> >  /* Returns the next zone at or below highest_zoneidx in a zonelist */
> >  static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
> > +					nodemask_t *nodes,
> >  					enum zone_type highest_zoneidx)
> >  {
> > -	/* Find the next suitable zone to use for the allocation */
> > -	while (zonelist_zone_idx(z) > highest_zoneidx)
> > -		z++;
> > +	/*
> > +	 * Find the next suitable zone to use for the allocation.
> > +	 * Only filter based on nodemask if it's set
> > +	 */
> > +	if (likely(nodes == NULL))
> > +		while (zonelist_zone_idx(z) > highest_zoneidx)
> > +			z++;
> > +	else
> > +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> > +				(z->zone && !zref_in_nodemask(z, nodes)))
> > +			z++;
> >  
> >  	return z;
> >  }
> >  
> >  /**
> > - * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> > + * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
> >   * @zone - The current zone in the iterator
> >   * @z - The current pointer within zonelist->zones being iterated
> >   * @zlist - The zonelist being iterated
> >   * @highidx - The zone index of the highest zone to return
> > + * @nodemask - Nodemask allowed by the allocator
> >   *
> > - * This iterator iterates though all zones at or below a given zone index.
> > + * This iterator iterates though all zones at or below a given zone index and
> > + * within a given nodemask
> >   */
> > -#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> > -	for (z = first_zones_zonelist(zlist, highidx),			\
> > +#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
> > +	for (z = first_zones_zonelist(zlist, nodemask, highidx),	\
> >  					zone = zonelist_zone(z++);	\
> >  		zone;							\
> > -		z = next_zones_zonelist(z, highidx),			\
> > +		z = next_zones_zonelist(z, nodemask, highidx),		\
> >  					zone = zonelist_zone(z++))
> >  
> > +/**
> > + * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> > + * @zone - The current zone in the iterator
> > + * @z - The current pointer within zonelist->zones being iterated
> > + * @zlist - The zonelist being iterated
> > + * @highidx - The zone index of the highest zone to return
> > + *
> > + * This iterator iterates though all zones at or below a given zone index.
> > + */
> > +#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> > +	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
> > +
> >  #ifdef CONFIG_SPARSEMEM
> >  #include <asm/sparsemem.h>
> >  #endif
> > diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c
> > --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c	2007-09-28 15:49:39.000000000 +0100
> > +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c	2007-09-28 15:49:57.000000000 +0100
> > @@ -1516,22 +1516,14 @@ nodemask_t cpuset_mems_allowed(struct ta
> >  }
> >  
> >  /**
> > - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
> > - * @zl: the zonelist to be checked
> > + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
> > + * @nodemask: the nodemask to be checked
> >   *
> > - * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
> > + * Are any of the nodes in the nodemask allowed in current->mems_allowed?
> >   */
> > -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> > +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
> >  {
> > -	int i;
> > -
> > -	for (i = 0; zl->_zonerefs[i].zone; i++) {
> > -		int nid = zonelist_node_idx(zl->_zonerefs[i]);
> > -
> > -		if (node_isset(nid, current->mems_allowed))
> > -			return 1;
> > -	}
> > -	return 0;
> > +	return nodes_intersect(nodemask, current->mems_allowed);
>                  ^^^^^^^^^^^^^^^ -- should be nodes_intersects, I think.

Crap, you're right, I missed the warning about implicit declarations. I
apologise. This is the corrected version

diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c	2007-09-28 19:23:05.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c	2007-09-28 19:23:14.000000000 +0100
@@ -376,7 +376,7 @@ static void free_more_memory(void)
 
 	for_each_online_node(nid) {
 		zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-						gfp_zone(GFP_NOFS));
+						NULL, gfp_zone(GFP_NOFS));
 		if (zrefs->zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 						GFP_NOFS);
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h	2007-09-28 19:22:22.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h	2007-09-28 19:23:14.000000000 +0100
@@ -28,7 +28,7 @@ void cpuset_init_current_mems_allowed(vo
 void cpuset_update_task_memory_state(void);
 #define cpuset_nodes_subset_current_mems_allowed(nodes) \
 		nodes_subset((nodes), current->mems_allowed)
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
 extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
@@ -103,7 +103,7 @@ static inline void cpuset_init_current_m
 static inline void cpuset_update_task_memory_state(void) {}
 #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
 
-static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
 	return 1;
 }
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h	2007-09-28 19:22:56.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h	2007-09-28 19:23:14.000000000 +0100
@@ -184,6 +184,10 @@ static inline void arch_alloc_page(struc
 extern struct page *
 FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
+extern struct page *
+FASTCALL(__alloc_pages_nodemask(gfp_t, unsigned int,
+				struct zonelist *, nodemask_t *nodemask));
+
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h	2007-09-28 19:22:46.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h	2007-09-28 19:23:14.000000000 +0100
@@ -64,9 +64,8 @@ struct mempolicy {
 	atomic_t refcnt;
 	short policy; 	/* See MPOL_* above */
 	union {
-		struct zonelist  *zonelist;	/* bind */
 		short 		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave */
+		nodemask_t	 nodes;		/* interleave/bind */
 		/* undefined for default */
 	} v;
 	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h	2007-09-28 19:23:05.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h	2007-09-28 19:23:14.000000000 +0100
@@ -758,47 +758,85 @@ static inline void encode_zoneref(struct
 	zoneref->zone_idx = zone_idx(zone);
 }
 
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+	return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+	return 1;
+#endif /* CONFIG_NUMA */
+}
+
 /* Returns the first zone at or below highest_zoneidx in a zonelist */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
+					nodemask_t *nodes,
 					enum zone_type highest_zoneidx)
 {
 	struct zoneref *z;
 
 	/* Find the first suitable zone to use for the allocation */
 	z = zonelist->_zonerefs;
-	while (zonelist_zone_idx(z) > highest_zoneidx)
-		z++;
+	if (likely(nodes == NULL))
+		while (zonelist_zone_idx(z) > highest_zoneidx)
+			z++;
+	else
+		while (zonelist_zone_idx(z) > highest_zoneidx ||
+				(z->zone && !zref_in_nodemask(z, nodes)))
+			z++;
 
 	return z;
 }
 
 /* Returns the next zone at or below highest_zoneidx in a zonelist */
 static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
+					nodemask_t *nodes,
 					enum zone_type highest_zoneidx)
 {
-	/* Find the next suitable zone to use for the allocation */
-	while (zonelist_zone_idx(z) > highest_zoneidx)
-		z++;
+	/*
+	 * Find the next suitable zone to use for the allocation.
+	 * Only filter based on nodemask if it's set
+	 */
+	if (likely(nodes == NULL))
+		while (zonelist_zone_idx(z) > highest_zoneidx)
+			z++;
+	else
+		while (zonelist_zone_idx(z) > highest_zoneidx ||
+				(z->zone && !zref_in_nodemask(z, nodes)))
+			z++;
 
 	return z;
 }
 
 /**
- * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
+ * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
  * @zone - The current zone in the iterator
  * @z - The current pointer within zonelist->zones being iterated
  * @zlist - The zonelist being iterated
  * @highidx - The zone index of the highest zone to return
+ * @nodemask - Nodemask allowed by the allocator
  *
- * This iterator iterates though all zones at or below a given zone index.
+ * This iterator iterates though all zones at or below a given zone index and
+ * within a given nodemask
  */
-#define for_each_zone_zonelist(zone, z, zlist, highidx) \
-	for (z = first_zones_zonelist(zlist, highidx),			\
+#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+	for (z = first_zones_zonelist(zlist, nodemask, highidx),	\
 					zone = zonelist_zone(z++);	\
 		zone;							\
-		z = next_zones_zonelist(z, highidx),			\
+		z = next_zones_zonelist(z, nodemask, highidx),		\
 					zone = zonelist_zone(z++))
 
+/**
+ * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
+ * @zone - The current zone in the iterator
+ * @z - The current pointer within zonelist->zones being iterated
+ * @zlist - The zonelist being iterated
+ * @highidx - The zone index of the highest zone to return
+ *
+ * This iterator iterates though all zones at or below a given zone index.
+ */
+#define for_each_zone_zonelist(zone, z, zlist, highidx) \
+	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
+
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
 #endif
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c	2007-09-28 19:23:05.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c	2007-09-28 19:27:01.000000000 +0100
@@ -1516,22 +1516,14 @@ nodemask_t cpuset_mems_allowed(struct ta
 }
 
 /**
- * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
- * @zl: the zonelist to be checked
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * @nodemask: the nodemask to be checked
  *
- * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
  */
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
-	int i;
-
-	for (i = 0; zl->_zonerefs[i].zone; i++) {
-		int nid = zonelist_node_idx(zl->_zonerefs[i]);
-
-		if (node_isset(nid, current->mems_allowed))
-			return 1;
-	}
-	return 0;
+	return nodes_intersects(*nodemask, current->mems_allowed);
 }
 
 /*
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c	2007-09-28 19:23:05.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c	2007-09-28 19:23:14.000000000 +0100
@@ -134,41 +134,21 @@ static int mpol_check_policy(int mode, n
  	return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
 {
-	struct zonelist *zl;
-	int num, max, nd;
-	enum zone_type k;
+	int nd, k;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	max++;			/* space for zlcache_ptr (see mmzone.h) */
-	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-	if (!zl)
-		return ERR_PTR(-ENOMEM);
-	zl->zlcache_ptr = NULL;
-	num = 0;
-	/* First put in the highest zones from all nodes, then all the next 
-	   lower zones etc. Avoid empty zones because the memory allocator
-	   doesn't like them. If you implement node hot removal you
-	   have to fix that. */
-	k = MAX_NR_ZONES - 1;
-	while (1) {
-		for_each_node_mask(nd, *nodes) { 
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0) 
-				encode_zoneref(z, &zl->_zonerefs[num++]);
-		}
-		if (k == 0)
-			break;
-		k--;
-	}
-	if (num == 0) {
-		kfree(zl);
-		return ERR_PTR(-EINVAL);
+	/* Check that there is something useful in this mask */
+	k = policy_zone;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z = &NODE_DATA(nd)->node_zones[k];
+		if (z->present_pages > 0)
+			return 1;
 	}
-	zl->_zonerefs[num].zone = NULL;
-	return zl;
+
+	return 0;
 }
 
 /* Create a new policy */
@@ -201,12 +181,11 @@ static struct mempolicy *mpol_new(int mo
 			policy->v.preferred_node = -1;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
+		if (!is_valid_nodemask(nodes)) {
 			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			return ERR_PTR(-EINVAL);
 		}
+		policy->v.nodes = *nodes;
 		break;
 	}
 	policy->policy = mode;
@@ -484,19 +463,12 @@ static long do_set_mempolicy(int mode, n
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-	int i;
-
 	nodes_clear(*nodes);
 	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zoneref *zref;
-			zref = &p->v.zonelist->_zonerefs[i];
-			node_set(zonelist_node_idx(zref), *nodes);
-		}
-		break;
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -1131,6 +1103,18 @@ static struct mempolicy * get_vma_policy
 	return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static inline nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (unlikely(policy->policy == MPOL_BIND &&
+			gfp_zone(gfp) >= policy_zone &&
+			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)))
+		return &policy->v.nodes;
+
+	return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1143,11 +1127,6 @@ static struct zonelist *zonelist_policy(
 			nd = numa_node_id();
 		break;
 	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
 		/*FALL THROUGH*/
 	case MPOL_INTERLEAVE: /* should not happen */
 	case MPOL_DEFAULT:
@@ -1191,7 +1170,13 @@ unsigned slab_node(struct mempolicy *pol
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+		struct zonelist *zonelist;
+		struct zoneref *z;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		z = first_zones_zonelist(zonelist, &policy->v.nodes,
+							highest_zoneidx);
+		return zonelist_node_idx(z);
 	}
 
 	case MPOL_PREFERRED:
@@ -1349,7 +1334,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area
 	/*
 	 * fast path:  default or task policy
 	 */
-	return __alloc_pages(gfp, 0, zl);
+	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1406,14 +1391,6 @@ struct mempolicy *__mpol_copy(struct mem
 	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-		if (!new->v.zonelist) {
-			kmem_cache_free(policy_cache, new);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
 	return new;
 }
 
@@ -1427,21 +1404,12 @@ int __mpol_equal(struct mempolicy *a, st
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zone *za, *zb;
-			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-			if (za != zb)
-				return 0;
-		}
-		return b->v.zonelist->_zonerefs[i].zone == NULL;
-	}
 	default:
 		BUG();
 		return 0;
@@ -1453,8 +1421,6 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1745,6 +1711,8 @@ static void mpol_rebind_policy(struct me
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
@@ -1757,32 +1725,6 @@ static void mpol_rebind_policy(struct me
 						*mpolmask, *newmask);
 		*mpolmask = *newmask;
 		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-			node_set(zonelist_node_idx(z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		*mpolmask = *newmask;
-		break;
-	}
 	default:
 		BUG();
 		break;
@@ -1845,9 +1787,7 @@ static inline int mpol_to_str(char *buff
 		break;
 
 	case MPOL_BIND:
-		get_zonemask(pol, &nodes);
-		break;
-
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c
--- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c	2007-09-28 19:23:05.000000000 +0100
+++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c	2007-09-28 19:23:14.000000000 +0100
@@ -1420,7 +1420,7 @@ static void zlc_mark_zone_full(struct zo
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
 	struct zoneref *z;
@@ -1431,7 +1431,7 @@ get_page_from_freelist(gfp_t gfp_mask, u
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
-	z = first_zones_zonelist(zonelist, high_zoneidx);
+	z = first_zones_zonelist(zonelist, nodemask, high_zoneidx);
 	classzone_idx = zonelist_zone_idx(z);
 
 zonelist_scan:
@@ -1439,7 +1439,8 @@ zonelist_scan:
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
@@ -1545,9 +1546,9 @@ static void set_page_owner(struct page *
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page * fastcall
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
@@ -1576,7 +1577,7 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
@@ -1621,7 +1622,7 @@ restart:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist,
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 						high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
@@ -1634,7 +1635,7 @@ rebalance:
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, order,
+			page = get_page_from_freelist(gfp_mask, nodemask, order,
 				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
@@ -1669,7 +1670,7 @@ nofail_alloc:
 		drain_all_local_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
@@ -1685,8 +1686,9 @@ nofail_alloc:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
 			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
@@ -1739,6 +1741,20 @@ got_pg:
 	return page;
 }
 
+struct page * fastcall
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page * fastcall
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux