All of lore.kernel.org
 help / color / mirror / Atom feed
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
To: Mel Gorman <mel@csn.ul.ie>
Cc: akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, rientjes@google.com,
	kamezawa.hiroyu@jp.fujitsu.com, clameter@sgi.com
Subject: Re: [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask
Date: Fri, 28 Sep 2007 11:37:02 -0400	[thread overview]
Message-ID: <1190993823.5513.10.camel@localhost> (raw)
In-Reply-To: <20070928142506.16783.99266.sendpatchset@skynet.skynet.ie>

Still need to fix 'nodes_intersect' -> 'nodes_intersects'.  See below.

On Fri, 2007-09-28 at 15:25 +0100, Mel Gorman wrote:
> The MPOL_BIND policy creates a zonelist that is used for allocations belonging
> to that thread that can use the policy_zone. As the per-node zonelist is
> already being filtered based on a zone id, this patch adds a version of
> __alloc_pages() that takes a nodemask for further filtering. This eliminates
> the need for MPOL_BIND to create a custom zonelist. A positive benefit of
> this is that allocations using MPOL_BIND now use the local-node-ordered
> zonelist instead of a custom node-id-ordered zonelist.
> 
> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> Acked-by: Christoph Lameter <clameter@sgi.com>
> ---
> 
>  fs/buffer.c               |    2 
>  include/linux/cpuset.h    |    4 -
>  include/linux/gfp.h       |    4 +
>  include/linux/mempolicy.h |    3 
>  include/linux/mmzone.h    |   58 +++++++++++++---
>  kernel/cpuset.c           |   18 +----
>  mm/mempolicy.c            |  144 +++++++++++------------------------------
>  mm/page_alloc.c           |   40 +++++++----
>  8 files changed, 131 insertions(+), 142 deletions(-)
> 
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c	2007-09-28 15:49:57.000000000 +0100
> @@ -376,7 +376,7 @@ static void free_more_memory(void)
>  
>  	for_each_online_node(nid) {
>  		zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
> -						gfp_zone(GFP_NOFS));
> +						NULL, gfp_zone(GFP_NOFS));
>  		if (zrefs->zone)
>  			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
>  						GFP_NOFS);
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h	2007-09-27 14:41:05.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h	2007-09-28 15:49:57.000000000 +0100
> @@ -28,7 +28,7 @@ void cpuset_init_current_mems_allowed(vo
>  void cpuset_update_task_memory_state(void);
>  #define cpuset_nodes_subset_current_mems_allowed(nodes) \
>  		nodes_subset((nodes), current->mems_allowed)
> -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
> +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
>  
>  extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
>  extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
> @@ -103,7 +103,7 @@ static inline void cpuset_init_current_m
>  static inline void cpuset_update_task_memory_state(void) {}
>  #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
>  
> -static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> +static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
>  {
>  	return 1;
>  }
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h	2007-09-28 15:49:16.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h	2007-09-28 15:49:57.000000000 +0100
> @@ -184,6 +184,10 @@ static inline void arch_alloc_page(struc
>  extern struct page *
>  FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
>  
> +extern struct page *
> +FASTCALL(__alloc_pages_nodemask(gfp_t, unsigned int,
> +				struct zonelist *, nodemask_t *nodemask));
> +
>  static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>  						unsigned int order)
>  {
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h	2007-09-28 15:48:55.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h	2007-09-28 15:49:57.000000000 +0100
> @@ -64,9 +64,8 @@ struct mempolicy {
>  	atomic_t refcnt;
>  	short policy; 	/* See MPOL_* above */
>  	union {
> -		struct zonelist  *zonelist;	/* bind */
>  		short 		 preferred_node; /* preferred */
> -		nodemask_t	 nodes;		/* interleave */
> +		nodemask_t	 nodes;		/* interleave/bind */
>  		/* undefined for default */
>  	} v;
>  	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h	2007-09-28 15:49:57.000000000 +0100
> @@ -758,47 +758,85 @@ static inline void encode_zoneref(struct
>  	zoneref->zone_idx = zone_idx(zone);
>  }
>  
> +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
> +{
> +#ifdef CONFIG_NUMA
> +	return node_isset(zonelist_node_idx(zref), *nodes);
> +#else
> +	return 1;
> +#endif /* CONFIG_NUMA */
> +}
> +
>  /* Returns the first zone at or below highest_zoneidx in a zonelist */
>  static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
> +					nodemask_t *nodes,
>  					enum zone_type highest_zoneidx)
>  {
>  	struct zoneref *z;
>  
>  	/* Find the first suitable zone to use for the allocation */
>  	z = zonelist->_zonerefs;
> -	while (zonelist_zone_idx(z) > highest_zoneidx)
> -		z++;
> +	if (likely(nodes == NULL))
> +		while (zonelist_zone_idx(z) > highest_zoneidx)
> +			z++;
> +	else
> +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> +				(z->zone && !zref_in_nodemask(z, nodes)))
> +			z++;
>  
>  	return z;
>  }
>  
>  /* Returns the next zone at or below highest_zoneidx in a zonelist */
>  static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
> +					nodemask_t *nodes,
>  					enum zone_type highest_zoneidx)
>  {
> -	/* Find the next suitable zone to use for the allocation */
> -	while (zonelist_zone_idx(z) > highest_zoneidx)
> -		z++;
> +	/*
> +	 * Find the next suitable zone to use for the allocation.
> +	 * Only filter based on nodemask if it's set
> +	 */
> +	if (likely(nodes == NULL))
> +		while (zonelist_zone_idx(z) > highest_zoneidx)
> +			z++;
> +	else
> +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> +				(z->zone && !zref_in_nodemask(z, nodes)))
> +			z++;
>  
>  	return z;
>  }
>  
>  /**
> - * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> + * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
>   * @zone - The current zone in the iterator
>   * @z - The current pointer within zonelist->zones being iterated
>   * @zlist - The zonelist being iterated
>   * @highidx - The zone index of the highest zone to return
> + * @nodemask - Nodemask allowed by the allocator
>   *
> - * This iterator iterates though all zones at or below a given zone index.
> + * This iterator iterates through all zones at or below a given zone index and
> + * within a given nodemask
>   */
> -#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> -	for (z = first_zones_zonelist(zlist, highidx),			\
> +#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
> +	for (z = first_zones_zonelist(zlist, nodemask, highidx),	\
>  					zone = zonelist_zone(z++);	\
>  		zone;							\
> -		z = next_zones_zonelist(z, highidx),			\
> +		z = next_zones_zonelist(z, nodemask, highidx),		\
>  					zone = zonelist_zone(z++))
>  
> +/**
> + * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> + * @zone - The current zone in the iterator
> + * @z - The current pointer within zonelist->zones being iterated
> + * @zlist - The zonelist being iterated
> + * @highidx - The zone index of the highest zone to return
> + *
> + * This iterator iterates through all zones at or below a given zone index.
> + */
> +#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> +	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
> +
>  #ifdef CONFIG_SPARSEMEM
>  #include <asm/sparsemem.h>
>  #endif
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c	2007-09-28 15:49:57.000000000 +0100
> @@ -1516,22 +1516,14 @@ nodemask_t cpuset_mems_allowed(struct ta
>  }
>  
>  /**
> - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
> - * @zl: the zonelist to be checked
> + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
> + * @nodemask: the nodemask to be checked
>   *
> - * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
> + * Are any of the nodes in the nodemask allowed in current->mems_allowed?
>   */
> -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
>  {
> -	int i;
> -
> -	for (i = 0; zl->_zonerefs[i].zone; i++) {
> -		int nid = zonelist_node_idx(zl->_zonerefs[i]);
> -
> -		if (node_isset(nid, current->mems_allowed))
> -			return 1;
> -	}
> -	return 0;
> +	return nodes_intersect(nodemask, current->mems_allowed);
                 ^^^^^^^^^^^^^^^ -- should be nodes_intersects, I think.  It takes
                 its masks by value, so the argument also needs to be '*nodemask'.
>  }
>  
>  /*
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c	2007-09-28 15:49:57.000000000 +0100
> @@ -134,41 +134,21 @@ static int mpol_check_policy(int mode, n
>   	return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
>  }
>  
> -/* Generate a custom zonelist for the BIND policy. */
> -static struct zonelist *bind_zonelist(nodemask_t *nodes)
> +/* Check that the nodemask contains at least one populated zone */
> +static int is_valid_nodemask(nodemask_t *nodemask)
>  {
> -	struct zonelist *zl;
> -	int num, max, nd;
> -	enum zone_type k;
> +	int nd, k;
>  
> -	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
> -	max++;			/* space for zlcache_ptr (see mmzone.h) */
> -	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
> -	if (!zl)
> -		return ERR_PTR(-ENOMEM);
> -	zl->zlcache_ptr = NULL;
> -	num = 0;
> -	/* First put in the highest zones from all nodes, then all the next 
> -	   lower zones etc. Avoid empty zones because the memory allocator
> -	   doesn't like them. If you implement node hot removal you
> -	   have to fix that. */
> -	k = MAX_NR_ZONES - 1;
> -	while (1) {
> -		for_each_node_mask(nd, *nodes) { 
> -			struct zone *z = &NODE_DATA(nd)->node_zones[k];
> -			if (z->present_pages > 0) 
> -				encode_zoneref(z, &zl->_zonerefs[num++]);
> -		}
> -		if (k == 0)
> -			break;
> -		k--;
> -	}
> -	if (num == 0) {
> -		kfree(zl);
> -		return ERR_PTR(-EINVAL);
> +	/* Check that there is something useful in this mask */
> +	k = policy_zone;
> +
> +	for_each_node_mask(nd, *nodemask) {
> +		struct zone *z = &NODE_DATA(nd)->node_zones[k];
> +		if (z->present_pages > 0)
> +			return 1;
>  	}
> -	zl->_zonerefs[num].zone = NULL;
> -	return zl;
> +
> +	return 0;
>  }
>  
>  /* Create a new policy */
> @@ -201,12 +181,11 @@ static struct mempolicy *mpol_new(int mo
>  			policy->v.preferred_node = -1;
>  		break;
>  	case MPOL_BIND:
> -		policy->v.zonelist = bind_zonelist(nodes);
> -		if (IS_ERR(policy->v.zonelist)) {
> -			void *error_code = policy->v.zonelist;
> +		if (!is_valid_nodemask(nodes)) {
>  			kmem_cache_free(policy_cache, policy);
> -			return error_code;
> +			return ERR_PTR(-EINVAL);
>  		}
> +		policy->v.nodes = *nodes;
>  		break;
>  	}
>  	policy->policy = mode;
> @@ -484,19 +463,12 @@ static long do_set_mempolicy(int mode, n
>  /* Fill a zone bitmap for a policy */
>  static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
>  {
> -	int i;
> -
>  	nodes_clear(*nodes);
>  	switch (p->policy) {
> -	case MPOL_BIND:
> -		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
> -			struct zoneref *zref;
> -			zref = &p->v.zonelist->_zonerefs[i];
> -			node_set(zonelist_node_idx(zref), *nodes);
> -		}
> -		break;
>  	case MPOL_DEFAULT:
>  		break;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		*nodes = p->v.nodes;
>  		break;
> @@ -1131,6 +1103,18 @@ static struct mempolicy * get_vma_policy
>  	return pol;
>  }
>  
> +/* Return a nodemask representing a mempolicy */
> +static inline nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
> +{
> +	/* Lower zones don't get a nodemask applied for MPOL_BIND */
> +	if (unlikely(policy->policy == MPOL_BIND &&
> +			gfp_zone(gfp) >= policy_zone &&
> +			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)))
> +		return &policy->v.nodes;
> +
> +	return NULL;
> +}
> +
>  /* Return a zonelist representing a mempolicy */
>  static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
>  {
> @@ -1143,11 +1127,6 @@ static struct zonelist *zonelist_policy(
>  			nd = numa_node_id();
>  		break;
>  	case MPOL_BIND:
> -		/* Lower zones don't get a policy applied */
> -		/* Careful: current->mems_allowed might have moved */
> -		if (gfp_zone(gfp) >= policy_zone)
> -			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
> -				return policy->v.zonelist;
>  		/*FALL THROUGH*/
>  	case MPOL_INTERLEAVE: /* should not happen */
>  	case MPOL_DEFAULT:
> @@ -1191,7 +1170,13 @@ unsigned slab_node(struct mempolicy *pol
>  		 * Follow bind policy behavior and start allocation at the
>  		 * first node.
>  		 */
> -		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
> +		struct zonelist *zonelist;
> +		struct zoneref *z;
> +		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
> +		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
> +		z = first_zones_zonelist(zonelist, &policy->v.nodes,
> +							highest_zoneidx);
> +		return zonelist_node_idx(z);
>  	}
>  
>  	case MPOL_PREFERRED:
> @@ -1349,7 +1334,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area
>  	/*
>  	 * fast path:  default or task policy
>  	 */
> -	return __alloc_pages(gfp, 0, zl);
> +	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
>  }
>  
>  /**
> @@ -1406,14 +1391,6 @@ struct mempolicy *__mpol_copy(struct mem
>  	}
>  	*new = *old;
>  	atomic_set(&new->refcnt, 1);
> -	if (new->policy == MPOL_BIND) {
> -		int sz = ksize(old->v.zonelist);
> -		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
> -		if (!new->v.zonelist) {
> -			kmem_cache_free(policy_cache, new);
> -			return ERR_PTR(-ENOMEM);
> -		}
> -	}
>  	return new;
>  }
>  
> @@ -1427,21 +1404,12 @@ int __mpol_equal(struct mempolicy *a, st
>  	switch (a->policy) {
>  	case MPOL_DEFAULT:
>  		return 1;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		return nodes_equal(a->v.nodes, b->v.nodes);
>  	case MPOL_PREFERRED:
>  		return a->v.preferred_node == b->v.preferred_node;
> -	case MPOL_BIND: {
> -		int i;
> -		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
> -			struct zone *za, *zb;
> -			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
> -			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
> -			if (za != zb)
> -				return 0;
> -		}
> -		return b->v.zonelist->_zonerefs[i].zone == NULL;
> -	}
>  	default:
>  		BUG();
>  		return 0;
> @@ -1453,8 +1421,6 @@ void __mpol_free(struct mempolicy *p)
>  {
>  	if (!atomic_dec_and_test(&p->refcnt))
>  		return;
> -	if (p->policy == MPOL_BIND)
> -		kfree(p->v.zonelist);
>  	p->policy = MPOL_DEFAULT;
>  	kmem_cache_free(policy_cache, p);
>  }
> @@ -1745,6 +1711,8 @@ static void mpol_rebind_policy(struct me
>  	switch (pol->policy) {
>  	case MPOL_DEFAULT:
>  		break;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
>  		pol->v.nodes = tmp;
> @@ -1757,32 +1725,6 @@ static void mpol_rebind_policy(struct me
>  						*mpolmask, *newmask);
>  		*mpolmask = *newmask;
>  		break;
> -	case MPOL_BIND: {
> -		nodemask_t nodes;
> -		struct zoneref *z;
> -		struct zonelist *zonelist;
> -
> -		nodes_clear(nodes);
> -		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
> -			node_set(zonelist_node_idx(z), nodes);
> -		nodes_remap(tmp, nodes, *mpolmask, *newmask);
> -		nodes = tmp;
> -
> -		zonelist = bind_zonelist(&nodes);
> -
> -		/* If no mem, then zonelist is NULL and we keep old zonelist.
> -		 * If that old zonelist has no remaining mems_allowed nodes,
> -		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
> -		 */
> -
> -		if (!IS_ERR(zonelist)) {
> -			/* Good - got mem - substitute new zonelist */
> -			kfree(pol->v.zonelist);
> -			pol->v.zonelist = zonelist;
> -		}
> -		*mpolmask = *newmask;
> -		break;
> -	}
>  	default:
>  		BUG();
>  		break;
> @@ -1845,9 +1787,7 @@ static inline int mpol_to_str(char *buff
>  		break;
>  
>  	case MPOL_BIND:
> -		get_zonemask(pol, &nodes);
> -		break;
> -
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		nodes = pol->v.nodes;
>  		break;
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c	2007-09-28 15:49:57.000000000 +0100
> @@ -1420,7 +1420,7 @@ static void zlc_mark_zone_full(struct zo
>   * a page.
>   */
>  static struct page *
> -get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
> +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
>  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
>  {
>  	struct zoneref *z;
> @@ -1431,7 +1431,7 @@ get_page_from_freelist(gfp_t gfp_mask, u
>  	int zlc_active = 0;		/* set if using zonelist_cache */
>  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
>  
> -	z = first_zones_zonelist(zonelist, high_zoneidx);
> +	z = first_zones_zonelist(zonelist, nodemask, high_zoneidx);
>  	classzone_idx = zonelist_zone_idx(z);
>  
>  zonelist_scan:
> @@ -1439,7 +1439,8 @@ zonelist_scan:
>  	 * Scan zonelist, looking for a zone with enough free.
>  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
>  	 */
> -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> +						high_zoneidx, nodemask) {
>  		if (NUMA_BUILD && zlc_active &&
>  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
>  				continue;
> @@ -1545,9 +1546,9 @@ static void set_page_owner(struct page *
>  /*
>   * This is the 'heart' of the zoned buddy allocator.
>   */
> -struct page * fastcall
> -__alloc_pages(gfp_t gfp_mask, unsigned int order,
> -		struct zonelist *zonelist)
> +static struct page *
> +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask)
>  {
>  	const gfp_t wait = gfp_mask & __GFP_WAIT;
>  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
> @@ -1576,7 +1577,7 @@ restart:
>  		return NULL;
>  	}
>  
> -	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
> +	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
>  			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
>  	if (page)
>  		goto got_pg;
> @@ -1621,7 +1622,7 @@ restart:
>  	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
>  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
>  	 */
> -	page = get_page_from_freelist(gfp_mask, order, zonelist,
> +	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
>  						high_zoneidx, alloc_flags);
>  	if (page)
>  		goto got_pg;
> @@ -1634,7 +1635,7 @@ rebalance:
>  		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
>  nofail_alloc:
>  			/* go through the zonelist yet again, ignoring mins */
> -			page = get_page_from_freelist(gfp_mask, order,
> +			page = get_page_from_freelist(gfp_mask, nodemask, order,
>  				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
>  			if (page)
>  				goto got_pg;
> @@ -1669,7 +1670,7 @@ nofail_alloc:
>  		drain_all_local_pages();
>  
>  	if (likely(did_some_progress)) {
> -		page = get_page_from_freelist(gfp_mask, order,
> +		page = get_page_from_freelist(gfp_mask, nodemask, order,
>  					zonelist, high_zoneidx, alloc_flags);
>  		if (page)
>  			goto got_pg;
> @@ -1685,8 +1686,9 @@ nofail_alloc:
>  		 * a parallel oom killing, we must fail if we're still
>  		 * under heavy pressure.
>  		 */
> -		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
> -			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
> +		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
> +			order, zonelist, high_zoneidx,
> +			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
>  		if (page) {
>  			clear_zonelist_oom(zonelist, gfp_mask);
>  			goto got_pg;
> @@ -1739,6 +1741,20 @@ got_pg:
>  	return page;
>  }
>  
> +struct page * fastcall
> +__alloc_pages(gfp_t gfp_mask, unsigned int order,
> +		struct zonelist *zonelist)
> +{
> +	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
> +}
> +
> +struct page * fastcall
> +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
> +		struct zonelist *zonelist, nodemask_t *nodemask)
> +{
> +	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
> +}
> +
>  EXPORT_SYMBOL(__alloc_pages);
>  
>  /*


WARNING: multiple messages have this Message-ID (diff)
From: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
To: Mel Gorman <mel@csn.ul.ie>
Cc: akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, rientjes@google.com,
	kamezawa.hiroyu@jp.fujitsu.com, clameter@sgi.com
Subject: Re: [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask
Date: Fri, 28 Sep 2007 11:37:02 -0400	[thread overview]
Message-ID: <1190993823.5513.10.camel@localhost> (raw)
In-Reply-To: <20070928142506.16783.99266.sendpatchset@skynet.skynet.ie>

Still need to fix 'nodes_intersect' -> 'nodes_intersects'.  See below.

On Fri, 2007-09-28 at 15:25 +0100, Mel Gorman wrote:
> The MPOL_BIND policy creates a zonelist that is used for allocations belonging
> to that thread that can use the policy_zone. As the per-node zonelist is
> already being filtered based on a zone id, this patch adds a version of
> __alloc_pages() that takes a nodemask for further filtering. This eliminates
> the need for MPOL_BIND to create a custom zonelist. A positive benefit of
> this is that allocations using MPOL_BIND now use the local-node-ordered
> zonelist instead of a custom node-id-ordered zonelist.
> 
> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> Acked-by: Christoph Lameter <clameter@sgi.com>
> ---
> 
>  fs/buffer.c               |    2 
>  include/linux/cpuset.h    |    4 -
>  include/linux/gfp.h       |    4 +
>  include/linux/mempolicy.h |    3 
>  include/linux/mmzone.h    |   58 +++++++++++++---
>  kernel/cpuset.c           |   18 +----
>  mm/mempolicy.c            |  144 +++++++++++------------------------------
>  mm/page_alloc.c           |   40 +++++++----
>  8 files changed, 131 insertions(+), 142 deletions(-)
> 
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/fs/buffer.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/fs/buffer.c	2007-09-28 15:49:57.000000000 +0100
> @@ -376,7 +376,7 @@ static void free_more_memory(void)
>  
>  	for_each_online_node(nid) {
>  		zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
> -						gfp_zone(GFP_NOFS));
> +						NULL, gfp_zone(GFP_NOFS));
>  		if (zrefs->zone)
>  			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
>  						GFP_NOFS);
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/cpuset.h	2007-09-27 14:41:05.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/cpuset.h	2007-09-28 15:49:57.000000000 +0100
> @@ -28,7 +28,7 @@ void cpuset_init_current_mems_allowed(vo
>  void cpuset_update_task_memory_state(void);
>  #define cpuset_nodes_subset_current_mems_allowed(nodes) \
>  		nodes_subset((nodes), current->mems_allowed)
> -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
> +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
>  
>  extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
>  extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
> @@ -103,7 +103,7 @@ static inline void cpuset_init_current_m
>  static inline void cpuset_update_task_memory_state(void) {}
>  #define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
>  
> -static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> +static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
>  {
>  	return 1;
>  }
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/gfp.h	2007-09-28 15:49:16.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/gfp.h	2007-09-28 15:49:57.000000000 +0100
> @@ -184,6 +184,10 @@ static inline void arch_alloc_page(struc
>  extern struct page *
>  FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
>  
> +extern struct page *
> +FASTCALL(__alloc_pages_nodemask(gfp_t, unsigned int,
> +				struct zonelist *, nodemask_t *nodemask));
> +
>  static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
>  						unsigned int order)
>  {
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mempolicy.h	2007-09-28 15:48:55.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mempolicy.h	2007-09-28 15:49:57.000000000 +0100
> @@ -64,9 +64,8 @@ struct mempolicy {
>  	atomic_t refcnt;
>  	short policy; 	/* See MPOL_* above */
>  	union {
> -		struct zonelist  *zonelist;	/* bind */
>  		short 		 preferred_node; /* preferred */
> -		nodemask_t	 nodes;		/* interleave */
> +		nodemask_t	 nodes;		/* interleave/bind */
>  		/* undefined for default */
>  	} v;
>  	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/include/linux/mmzone.h	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/include/linux/mmzone.h	2007-09-28 15:49:57.000000000 +0100
> @@ -758,47 +758,85 @@ static inline void encode_zoneref(struct
>  	zoneref->zone_idx = zone_idx(zone);
>  }
>  
> +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
> +{
> +#ifdef CONFIG_NUMA
> +	return node_isset(zonelist_node_idx(zref), *nodes);
> +#else
> +	return 1;
> +#endif /* CONFIG_NUMA */
> +}
> +
>  /* Returns the first zone at or below highest_zoneidx in a zonelist */
>  static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
> +					nodemask_t *nodes,
>  					enum zone_type highest_zoneidx)
>  {
>  	struct zoneref *z;
>  
>  	/* Find the first suitable zone to use for the allocation */
>  	z = zonelist->_zonerefs;
> -	while (zonelist_zone_idx(z) > highest_zoneidx)
> -		z++;
> +	if (likely(nodes == NULL))
> +		while (zonelist_zone_idx(z) > highest_zoneidx)
> +			z++;
> +	else
> +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> +				(z->zone && !zref_in_nodemask(z, nodes)))
> +			z++;
>  
>  	return z;
>  }
>  
>  /* Returns the next zone at or below highest_zoneidx in a zonelist */
>  static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
> +					nodemask_t *nodes,
>  					enum zone_type highest_zoneidx)
>  {
> -	/* Find the next suitable zone to use for the allocation */
> -	while (zonelist_zone_idx(z) > highest_zoneidx)
> -		z++;
> +	/*
> +	 * Find the next suitable zone to use for the allocation.
> +	 * Only filter based on nodemask if it's set
> +	 */
> +	if (likely(nodes == NULL))
> +		while (zonelist_zone_idx(z) > highest_zoneidx)
> +			z++;
> +	else
> +		while (zonelist_zone_idx(z) > highest_zoneidx ||
> +				(z->zone && !zref_in_nodemask(z, nodes)))
> +			z++;
>  
>  	return z;
>  }
>  
>  /**
> - * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> + * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
>   * @zone - The current zone in the iterator
>   * @z - The current pointer within zonelist->zones being iterated
>   * @zlist - The zonelist being iterated
>   * @highidx - The zone index of the highest zone to return
> + * @nodemask - Nodemask allowed by the allocator
>   *
> - * This iterator iterates though all zones at or below a given zone index.
> + * This iterator iterates through all zones at or below a given zone index and
> + * within a given nodemask
>   */
> -#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> -	for (z = first_zones_zonelist(zlist, highidx),			\
> +#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
> +	for (z = first_zones_zonelist(zlist, nodemask, highidx),	\
>  					zone = zonelist_zone(z++);	\
>  		zone;							\
> -		z = next_zones_zonelist(z, highidx),			\
> +		z = next_zones_zonelist(z, nodemask, highidx),		\
>  					zone = zonelist_zone(z++))
>  
> +/**
> + * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
> + * @zone - The current zone in the iterator
> + * @z - The current pointer within zonelist->zones being iterated
> + * @zlist - The zonelist being iterated
> + * @highidx - The zone index of the highest zone to return
> + *
> + * This iterator iterates through all zones at or below a given zone index.
> + */
> +#define for_each_zone_zonelist(zone, z, zlist, highidx) \
> +	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
> +
>  #ifdef CONFIG_SPARSEMEM
>  #include <asm/sparsemem.h>
>  #endif
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/kernel/cpuset.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/kernel/cpuset.c	2007-09-28 15:49:57.000000000 +0100
> @@ -1516,22 +1516,14 @@ nodemask_t cpuset_mems_allowed(struct ta
>  }
>  
>  /**
> - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
> - * @zl: the zonelist to be checked
> + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
> + * @nodemask: the nodemask to be checked
>   *
> - * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
> + * Are any of the nodes in the nodemask allowed in current->mems_allowed?
>   */
> -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
> +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
>  {
> -	int i;
> -
> -	for (i = 0; zl->_zonerefs[i].zone; i++) {
> -		int nid = zonelist_node_idx(zl->_zonerefs[i]);
> -
> -		if (node_isset(nid, current->mems_allowed))
> -			return 1;
> -	}
> -	return 0;
> +	return nodes_intersect(nodemask, current->mems_allowed);
                 ^^^^^^^^^^^^^^^ -- should be nodes_intersects, I think.
>  }
>  
>  /*
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/mempolicy.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/mempolicy.c	2007-09-28 15:49:57.000000000 +0100
> @@ -134,41 +134,21 @@ static int mpol_check_policy(int mode, n
>   	return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
>  }
>  
> -/* Generate a custom zonelist for the BIND policy. */
> -static struct zonelist *bind_zonelist(nodemask_t *nodes)
> +/* Check that the nodemask contains at least one populated zone */
> +static int is_valid_nodemask(nodemask_t *nodemask)
>  {
> -	struct zonelist *zl;
> -	int num, max, nd;
> -	enum zone_type k;
> +	int nd, k;
>  
> -	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
> -	max++;			/* space for zlcache_ptr (see mmzone.h) */
> -	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
> -	if (!zl)
> -		return ERR_PTR(-ENOMEM);
> -	zl->zlcache_ptr = NULL;
> -	num = 0;
> -	/* First put in the highest zones from all nodes, then all the next 
> -	   lower zones etc. Avoid empty zones because the memory allocator
> -	   doesn't like them. If you implement node hot removal you
> -	   have to fix that. */
> -	k = MAX_NR_ZONES - 1;
> -	while (1) {
> -		for_each_node_mask(nd, *nodes) { 
> -			struct zone *z = &NODE_DATA(nd)->node_zones[k];
> -			if (z->present_pages > 0) 
> -				encode_zoneref(z, &zl->_zonerefs[num++]);
> -		}
> -		if (k == 0)
> -			break;
> -		k--;
> -	}
> -	if (num == 0) {
> -		kfree(zl);
> -		return ERR_PTR(-EINVAL);
> +	/* Check that there is something useful in this mask */
> +	k = policy_zone;
> +
> +	for_each_node_mask(nd, *nodemask) {
> +		struct zone *z = &NODE_DATA(nd)->node_zones[k];
> +		if (z->present_pages > 0)
> +			return 1;
>  	}
> -	zl->_zonerefs[num].zone = NULL;
> -	return zl;
> +
> +	return 0;
>  }
>  
>  /* Create a new policy */
> @@ -201,12 +181,11 @@ static struct mempolicy *mpol_new(int mo
>  			policy->v.preferred_node = -1;
>  		break;
>  	case MPOL_BIND:
> -		policy->v.zonelist = bind_zonelist(nodes);
> -		if (IS_ERR(policy->v.zonelist)) {
> -			void *error_code = policy->v.zonelist;
> +		if (!is_valid_nodemask(nodes)) {
>  			kmem_cache_free(policy_cache, policy);
> -			return error_code;
> +			return ERR_PTR(-EINVAL);
>  		}
> +		policy->v.nodes = *nodes;
>  		break;
>  	}
>  	policy->policy = mode;
> @@ -484,19 +463,12 @@ static long do_set_mempolicy(int mode, n
>  /* Fill a zone bitmap for a policy */
>  static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
>  {
> -	int i;
> -
>  	nodes_clear(*nodes);
>  	switch (p->policy) {
> -	case MPOL_BIND:
> -		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
> -			struct zoneref *zref;
> -			zref = &p->v.zonelist->_zonerefs[i];
> -			node_set(zonelist_node_idx(zref), *nodes);
> -		}
> -		break;
>  	case MPOL_DEFAULT:
>  		break;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		*nodes = p->v.nodes;
>  		break;
> @@ -1131,6 +1103,18 @@ static struct mempolicy * get_vma_policy
>  	return pol;
>  }
>  
> +/* Return a nodemask representing a mempolicy */
> +static inline nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
> +{
> +	/* Lower zones don't get a nodemask applied for MPOL_BIND */
> +	if (unlikely(policy->policy == MPOL_BIND &&
> +			gfp_zone(gfp) >= policy_zone &&
> +			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)))
> +		return &policy->v.nodes;
> +
> +	return NULL;
> +}
> +
>  /* Return a zonelist representing a mempolicy */
>  static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
>  {
> @@ -1143,11 +1127,6 @@ static struct zonelist *zonelist_policy(
>  			nd = numa_node_id();
>  		break;
>  	case MPOL_BIND:
> -		/* Lower zones don't get a policy applied */
> -		/* Careful: current->mems_allowed might have moved */
> -		if (gfp_zone(gfp) >= policy_zone)
> -			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
> -				return policy->v.zonelist;
>  		/*FALL THROUGH*/
>  	case MPOL_INTERLEAVE: /* should not happen */
>  	case MPOL_DEFAULT:
> @@ -1191,7 +1170,13 @@ unsigned slab_node(struct mempolicy *pol
>  		 * Follow bind policy behavior and start allocation at the
>  		 * first node.
>  		 */
> -		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
> +		struct zonelist *zonelist;
> +		struct zoneref *z;
> +		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
> +		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
> +		z = first_zones_zonelist(zonelist, &policy->v.nodes,
> +							highest_zoneidx);
> +		return zonelist_node_idx(z);
>  	}
>  
>  	case MPOL_PREFERRED:
> @@ -1349,7 +1334,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area
>  	/*
>  	 * fast path:  default or task policy
>  	 */
> -	return __alloc_pages(gfp, 0, zl);
> +	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
>  }
>  
>  /**
> @@ -1406,14 +1391,6 @@ struct mempolicy *__mpol_copy(struct mem
>  	}
>  	*new = *old;
>  	atomic_set(&new->refcnt, 1);
> -	if (new->policy == MPOL_BIND) {
> -		int sz = ksize(old->v.zonelist);
> -		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
> -		if (!new->v.zonelist) {
> -			kmem_cache_free(policy_cache, new);
> -			return ERR_PTR(-ENOMEM);
> -		}
> -	}
>  	return new;
>  }
>  
> @@ -1427,21 +1404,12 @@ int __mpol_equal(struct mempolicy *a, st
>  	switch (a->policy) {
>  	case MPOL_DEFAULT:
>  		return 1;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		return nodes_equal(a->v.nodes, b->v.nodes);
>  	case MPOL_PREFERRED:
>  		return a->v.preferred_node == b->v.preferred_node;
> -	case MPOL_BIND: {
> -		int i;
> -		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
> -			struct zone *za, *zb;
> -			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
> -			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
> -			if (za != zb)
> -				return 0;
> -		}
> -		return b->v.zonelist->_zonerefs[i].zone == NULL;
> -	}
>  	default:
>  		BUG();
>  		return 0;
> @@ -1453,8 +1421,6 @@ void __mpol_free(struct mempolicy *p)
>  {
>  	if (!atomic_dec_and_test(&p->refcnt))
>  		return;
> -	if (p->policy == MPOL_BIND)
> -		kfree(p->v.zonelist);
>  	p->policy = MPOL_DEFAULT;
>  	kmem_cache_free(policy_cache, p);
>  }
> @@ -1745,6 +1711,8 @@ static void mpol_rebind_policy(struct me
>  	switch (pol->policy) {
>  	case MPOL_DEFAULT:
>  		break;
> +	case MPOL_BIND:
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
>  		pol->v.nodes = tmp;
> @@ -1757,32 +1725,6 @@ static void mpol_rebind_policy(struct me
>  						*mpolmask, *newmask);
>  		*mpolmask = *newmask;
>  		break;
> -	case MPOL_BIND: {
> -		nodemask_t nodes;
> -		struct zoneref *z;
> -		struct zonelist *zonelist;
> -
> -		nodes_clear(nodes);
> -		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
> -			node_set(zonelist_node_idx(z), nodes);
> -		nodes_remap(tmp, nodes, *mpolmask, *newmask);
> -		nodes = tmp;
> -
> -		zonelist = bind_zonelist(&nodes);
> -
> -		/* If no mem, then zonelist is NULL and we keep old zonelist.
> -		 * If that old zonelist has no remaining mems_allowed nodes,
> -		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
> -		 */
> -
> -		if (!IS_ERR(zonelist)) {
> -			/* Good - got mem - substitute new zonelist */
> -			kfree(pol->v.zonelist);
> -			pol->v.zonelist = zonelist;
> -		}
> -		*mpolmask = *newmask;
> -		break;
> -	}
>  	default:
>  		BUG();
>  		break;
> @@ -1845,9 +1787,7 @@ static inline int mpol_to_str(char *buff
>  		break;
>  
>  	case MPOL_BIND:
> -		get_zonemask(pol, &nodes);
> -		break;
> -
> +		/* Fall through */
>  	case MPOL_INTERLEAVE:
>  		nodes = pol->v.nodes;
>  		break;
> diff -rup -X /usr/src/patchset-0.6/bin//dontdiff linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c
> --- linux-2.6.23-rc8-mm2-020_zoneid_zonelist/mm/page_alloc.c	2007-09-28 15:49:39.000000000 +0100
> +++ linux-2.6.23-rc8-mm2-030_filter_nodemask/mm/page_alloc.c	2007-09-28 15:49:57.000000000 +0100
> @@ -1420,7 +1420,7 @@ static void zlc_mark_zone_full(struct zo
>   * a page.
>   */
>  static struct page *
> -get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
> +get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
>  		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
>  {
>  	struct zoneref *z;
> @@ -1431,7 +1431,7 @@ get_page_from_freelist(gfp_t gfp_mask, u
>  	int zlc_active = 0;		/* set if using zonelist_cache */
>  	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
>  
> -	z = first_zones_zonelist(zonelist, high_zoneidx);
> +	z = first_zones_zonelist(zonelist, nodemask, high_zoneidx);
>  	classzone_idx = zonelist_zone_idx(z);
>  
>  zonelist_scan:
> @@ -1439,7 +1439,8 @@ zonelist_scan:
>  	 * Scan zonelist, looking for a zone with enough free.
>  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
>  	 */
> -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> +	for_each_zone_zonelist_nodemask(zone, z, zonelist,
> +						high_zoneidx, nodemask) {
>  		if (NUMA_BUILD && zlc_active &&
>  			!zlc_zone_worth_trying(zonelist, z, allowednodes))
>  				continue;
> @@ -1545,9 +1546,9 @@ static void set_page_owner(struct page *
>  /*
>   * This is the 'heart' of the zoned buddy allocator.
>   */
> -struct page * fastcall
> -__alloc_pages(gfp_t gfp_mask, unsigned int order,
> -		struct zonelist *zonelist)
> +static struct page *
> +__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
> +			struct zonelist *zonelist, nodemask_t *nodemask)
>  {
>  	const gfp_t wait = gfp_mask & __GFP_WAIT;
>  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
> @@ -1576,7 +1577,7 @@ restart:
>  		return NULL;
>  	}
>  
> -	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
> +	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
>  			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
>  	if (page)
>  		goto got_pg;
> @@ -1621,7 +1622,7 @@ restart:
>  	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
>  	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
>  	 */
> -	page = get_page_from_freelist(gfp_mask, order, zonelist,
> +	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
>  						high_zoneidx, alloc_flags);
>  	if (page)
>  		goto got_pg;
> @@ -1634,7 +1635,7 @@ rebalance:
>  		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
>  nofail_alloc:
>  			/* go through the zonelist yet again, ignoring mins */
> -			page = get_page_from_freelist(gfp_mask, order,
> +			page = get_page_from_freelist(gfp_mask, nodemask, order,
>  				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
>  			if (page)
>  				goto got_pg;
> @@ -1669,7 +1670,7 @@ nofail_alloc:
>  		drain_all_local_pages();
>  
>  	if (likely(did_some_progress)) {
> -		page = get_page_from_freelist(gfp_mask, order,
> +		page = get_page_from_freelist(gfp_mask, nodemask, order,
>  					zonelist, high_zoneidx, alloc_flags);
>  		if (page)
>  			goto got_pg;
> @@ -1685,8 +1686,9 @@ nofail_alloc:
>  		 * a parallel oom killing, we must fail if we're still
>  		 * under heavy pressure.
>  		 */
> -		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
> -			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
> +		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
> +			order, zonelist, high_zoneidx,
> +			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
>  		if (page) {
>  			clear_zonelist_oom(zonelist, gfp_mask);
>  			goto got_pg;
> @@ -1739,6 +1741,20 @@ got_pg:
>  	return page;
>  }
>  
> +struct page * fastcall
> +__alloc_pages(gfp_t gfp_mask, unsigned int order,
> +		struct zonelist *zonelist)
> +{
> +	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
> +}
> +
> +struct page * fastcall
> +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
> +		struct zonelist *zonelist, nodemask_t *nodemask)
> +{
> +	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
> +}
> +
>  EXPORT_SYMBOL(__alloc_pages);
>  
>  /*

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  reply	other threads:[~2007-09-28 15:38 UTC|newest]

Thread overview: 70+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-09-28 14:23 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v8 Mel Gorman
2007-09-28 14:23 ` Mel Gorman
2007-09-28 14:23 ` [PATCH 1/6] Use zonelists instead of zones when direct reclaiming pages Mel Gorman
2007-09-28 14:23   ` Mel Gorman
2007-09-28 14:24 ` [PATCH 2/6] Introduce node_zonelist() for accessing the zonelist for a GFP mask Mel Gorman
2007-09-28 14:24   ` Mel Gorman
2007-09-28 14:24 ` [PATCH 3/6] Use two zonelist that are filtered by " Mel Gorman
2007-09-28 14:24   ` Mel Gorman
2007-09-28 14:24 ` [PATCH 4/6] Have zonelist contains structs with both a zone pointer and zone_idx Mel Gorman
2007-09-28 14:24   ` Mel Gorman
2007-10-17  3:22   ` David Rientjes
2007-10-17  3:22     ` David Rientjes
2007-09-28 14:25 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-09-28 14:25   ` Mel Gorman
2007-09-28 15:37   ` Lee Schermerhorn [this message]
2007-09-28 15:37     ` Lee Schermerhorn
2007-09-28 18:28     ` Mel Gorman
2007-09-28 18:28       ` Mel Gorman
2007-09-28 18:38       ` Paul Jackson
2007-09-28 18:38         ` Paul Jackson
2007-09-28 21:03       ` Lee Schermerhorn
2007-09-28 21:03         ` Lee Schermerhorn
2007-09-28 14:25 ` [PATCH 6/6] Use one zonelist that is filtered by nodemask Mel Gorman
2007-09-28 14:25   ` Mel Gorman
2007-10-09  1:11   ` Nishanth Aravamudan
2007-10-09  1:11     ` Nishanth Aravamudan
2007-10-09  1:56     ` Christoph Lameter
2007-10-09  1:56       ` Christoph Lameter
2007-10-09  3:17       ` Nishanth Aravamudan
2007-10-09  3:17         ` Nishanth Aravamudan
2007-10-09 15:40     ` Mel Gorman
2007-10-09 15:40       ` Mel Gorman
2007-10-09 16:25       ` Nishanth Aravamudan
2007-10-09 16:25         ` Nishanth Aravamudan
2007-10-09 18:47         ` Christoph Lameter
2007-10-09 18:47           ` Christoph Lameter
2007-10-09 18:12       ` Nishanth Aravamudan
2007-10-09 18:12         ` Nishanth Aravamudan
2007-10-10 15:53       ` Lee Schermerhorn
2007-10-10 15:53         ` Lee Schermerhorn
2007-10-10 16:05         ` Nishanth Aravamudan
2007-10-10 16:05           ` Nishanth Aravamudan
2007-10-10 16:09         ` Mel Gorman
2007-10-10 16:09           ` Mel Gorman
  -- strict thread matches above, loose matches on Subject: below --
2007-11-09 14:32 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v9 Mel Gorman
2007-11-09 14:34 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-11-09 14:34   ` Mel Gorman
2008-02-29  5:01   ` Paul Jackson
2008-02-29  5:01     ` Paul Jackson
2008-02-29 14:49     ` Lee Schermerhorn
2008-02-29 14:49       ` Lee Schermerhorn
2007-09-13 17:52 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v7 Mel Gorman
2007-09-13 17:53 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-09-13 17:53   ` Mel Gorman
2007-09-12 21:04 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v6 Mel Gorman
2007-09-12 21:06 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-09-12 21:06   ` Mel Gorman
2007-09-12 21:23   ` Christoph Lameter
2007-09-12 21:23     ` Christoph Lameter
2007-09-13 10:25     ` Mel Gorman
2007-09-13 10:25       ` Mel Gorman
2007-09-13 15:49   ` Lee Schermerhorn
2007-09-13 15:49     ` Lee Schermerhorn
2007-09-11 21:30 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v5 (resend) Mel Gorman
2007-09-11 21:31 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-09-11 21:31   ` Mel Gorman
2007-09-11 15:19 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v5 Mel Gorman
2007-09-11 15:21 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-09-11 15:21   ` Mel Gorman
2007-08-17 20:16 [PATCH 0/6] Use one zonelist per node instead of multiple zonelists v4 Mel Gorman
2007-08-17 20:18 ` [PATCH 5/6] Filter based on a nodemask as well as a gfp_mask Mel Gorman
2007-08-17 20:18   ` Mel Gorman
2007-08-17 21:29   ` Christoph Lameter
2007-08-17 21:29     ` Christoph Lameter
2007-08-21  9:12     ` Mel Gorman
2007-08-21  9:12       ` Mel Gorman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1190993823.5513.10.camel@localhost \
    --to=lee.schermerhorn@hp.com \
    --cc=akpm@linux-foundation.org \
    --cc=clameter@sgi.com \
    --cc=kamezawa.hiroyu@jp.fujitsu.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mel@csn.ul.ie \
    --cc=rientjes@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.