public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] String conversions for memory policy
@ 2005-07-29 18:38 Christoph Lameter
  2005-07-29 22:20 ` Paul Jackson
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-07-29 18:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, pj

This patch adds two new functions to mm/mempolicy.c that allow the conversion
of a memorypolicy to a textual representation and vice versa.

The patch provides necessary functions for the merging of numa_maps into 
smap.

Signed-off-by: Christoph Lameter <christoph@lameter.com>

Index: linux-2.6.13-rc3-mm3/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc3-mm3.orig/mm/mempolicy.c	2005-07-29 10:37:26.000000000 -0700
+++ linux-2.6.13-rc3-mm3/mm/mempolicy.c	2005-07-29 11:08:13.000000000 -0700
@@ -1170,3 +1170,214 @@
 {
 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	char *e = buffer + maxlen;
+	int first = 1;
+	int node;
+	struct zone **z;
+
+	if (!pol || pol->policy == MPOL_DEFAULT) {
+		strcpy(buffer,"default");
+		return 7;
+	}
+
+	if (pol->policy == MPOL_PREFERRED) {
+		if (e < p + 8 /* fixed string size */ + 4 /* max len of node number */)
+			return -ENOSPC;
+
+		sprintf(p, "prefer=%d", pol->v.preferred_node);
+		return strlen(buffer);
+
+	} else if (pol->policy == MPOL_BIND) {
+
+		if (e < p + 9 + 4)
+			return -ENOSPC;
+
+		p+= sprintf(p, "bind={");
+
+		for (z = pol->v.zonelist->zones; *z ; *z++) {
+			if (!first)
+				*p++ = ',';
+			else
+				first = 0;
+			if (e < p + 2 + 4 + strlen((*z)->name))
+				return -ENOSPC;
+			p += sprintf(p, "%d/%s", (*z)->zone_pgdat->node_id, (*z)->name);
+		}
+
+		*p++ = '}';
+		*p++ = 0;
+		return p-buffer;
+
+	} else if (pol->policy == MPOL_INTERLEAVE) {
+
+		if (e < p + 14 + 4)
+			return -ENOSPC;
+
+		p += sprintf(p, "interleave={");
+
+		for_each_node(node)
+			if (test_bit(node, pol->v.nodes)) {
+				if (!first)
+					*p++ = ',';
+				else
+					first = 0;
+				if (e < p + 2 /* min bytes that follow */ + 4 /* node number */)
+					return -ENOSPC;
+				p += sprintf(p, "%d", node);
+			}
+
+		*p++ = '}';
+		*p++ = 0;
+		return p-buffer;
+	}
+	BUG();
+	return -EFAULT;
+}
+
+/*
+ * Convert a representation of a memory policy from text
+ * form to binary.
+ *
+ * Returns either a memory policy or NULL for error.
+ */
+struct mempolicy *str_to_mpol(char *buffer, char **end)
+{
+	char *p;
+	struct mempolicy *pol;
+	int node;
+	size_t size;
+
+	if (strnicmp(buffer, "default", 7) == 0) {
+
+		*end = buffer + 7;
+		return &default_policy;
+
+	}
+
+	pol = __mpol_copy(&default_policy);
+	if (IS_ERR(pol))
+		return NULL;
+
+	if (strnicmp(buffer, "prefer=", 7) == 0) {
+
+		node = simple_strtoul(buffer + 7, &p, 10);
+		if (node >= MAX_NUMNODES || !node_online(node))
+			goto out;
+
+		pol->policy = MPOL_PREFERRED;
+		pol->v.preferred_node = node;
+
+	} else if (strnicmp(buffer, "interleave={", 12) == 0) {
+
+		pol->policy = MPOL_INTERLEAVE;
+		p = buffer + 12;
+		bitmap_zero(pol->v.nodes, MAX_NUMNODES);
+
+		do {
+			node = simple_strtoul(p, &p, 10);
+
+			/* Check here for cpuset restrictions on nodes */
+			if (node >= MAX_NUMNODES || !node_online(node))
+				goto out;
+			set_bit(node, pol->v.nodes);
+
+		} while (*p++ == ',');
+
+		if (p[-1] != '}' || bitmap_empty(pol->v.nodes, MAX_NUMNODES))
+			goto out;
+
+	} else if (strnicmp(buffer, "bind={", 6) == 0) {
+
+		struct zonelist *zonelist = kmalloc(sizeof(struct zonelist), GFP_KERNEL);
+		struct zone **z = zonelist->zones;
+		struct zonelist *new;
+
+		pol->policy = MPOL_BIND;
+		p = buffer + 6;
+
+		do {
+			pg_data_t *pgdat;
+			struct zone *zone = NULL;
+
+			node = simple_strtoul(p, &p, 10);
+
+			/* Try to find the pgdat for the specified node */
+			for_each_pgdat(pgdat) {
+				if (pgdat->node_id == node) {
+					zone = pgdat->node_zones;
+					break;
+				}
+			}
+			if (!zone || node >= MAX_NUMNODES || !node_online(node))
+				goto bind_out;
+
+			/*
+			 * If there is no zone specified then take the first
+			 * zone. Otherwise we need to look for a matching name
+			 */
+			if (*p == '/') {
+				char *start = ++p;
+				struct zone *q;
+				struct zone *found = NULL;
+
+				/* Find end of the zone name */
+				while (*p && *p != ',' && *p != '}')
+					p++;
+
+				if (start == p)
+					goto bind_out;
+				/*
+				 * Go through the zones in this node and check
+				 * if any have the name we are looking for
+				 */
+				for(q = zone; q < zone + MAX_NR_ZONES; q++) {
+					if (strnicmp(q->name, start, p-start) == 0) {
+						found = q;
+						break;
+					}
+				}
+				zone = found;
+			}
+
+			if (!zone || z > zonelist->zones + MAX_NUMNODES * MAX_NR_ZONES)
+				goto bind_out;
+			*z++ = zone;
+
+		} while (*p++ == ',');
+
+		if (p[-1] != '}') {
+bind_out:
+			kfree(zonelist);
+			goto out;
+		}
+
+		/* Allocate only the necessary elements */
+		*z++ = NULL;
+		size = (z - zonelist->zones) * sizeof(struct zonelist *);
+		new = kmalloc(size, GFP_KERNEL);
+		if (!new)
+			goto out;
+		memcpy(new, zonelist, size);
+		kfree(zonelist);
+
+		pol->v.zonelist = new;
+
+	} else {
+out:
+		__mpol_free(pol);
+		return NULL;
+	}
+
+	*end = p;
+	return pol;
+}
+
Index: linux-2.6.13-rc3-mm3/include/linux/mempolicy.h
===================================================================
--- linux-2.6.13-rc3-mm3.orig/include/linux/mempolicy.h	2005-07-29 10:37:26.000000000 -0700
+++ linux-2.6.13-rc3-mm3/include/linux/mempolicy.h	2005-07-29 11:08:13.000000000 -0700
@@ -157,6 +157,10 @@
 extern void numa_policy_init(void);
 extern struct mempolicy default_policy;
 
+/* Conversion functions */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+struct mempolicy *str_to_mpol(char *buffer, char **end);
+
 #else
 
 struct mempolicy {};

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-29 18:38 [PATCH] String conversions for memory policy Christoph Lameter
@ 2005-07-29 22:20 ` Paul Jackson
  2005-07-30  0:53   ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-07-29 22:20 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Hmmm ... my previous reaction to this conversion code, when you posted
an ancestor of this patch on linux-mm, got buried in the crypically
terse term 'complex API', and that thread went on to discuss matters
of greater substance, such as whether one thread could or should
manipulate the memory policies of another.

I guess it's time I dealt directly with this code ...

My first suspicion on reading it is that it might partially duplicate
some string conversion code and syntax that is already present in
the kernel for other purposes.  For example the nodelists might
replicate the lists of numbers supported by bitmap_scnlistprintf
and bitmap_parselist.

However, since no documentation is presented to describe this new
micro-micro-language for describing memory policies, it is difficult to
know what the syntax is, without reverse engineering it from the code.

So ... how about documenting this string format, then we can discuss
both the syntax and its implementation.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-29 22:20 ` Paul Jackson
@ 2005-07-30  0:53   ` Christoph Lameter
  2005-07-30  5:54     ` Paul Jackson
  2005-07-30  6:00     ` Paul Jackson
  0 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-07-30  0:53 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Fri, 29 Jul 2005, Paul Jackson wrote:

> My first suspicion on reading it is that it might partially duplicate
> some string conversion code and syntax that is already present in
> the kernel for other purposes.  For example the nodelists might
> replicate the lists of numbers supported by bitmap_scnlistprintf
> and bitmap_parselist.

Ahh. Yes these allow to simplify things somewhat and allow more 
flexiblity when specifying nodelists for interleave. Wish there was 
something similar for zonelists. Thanks.

> micro-micro-language for describing memory policies, it is difficult to
> know what the syntax is, without reverse engineering it from the code.

Umm.. that is what we do all day but maybe you could have a look at my 
initial patch which actually included a description of the syntax which 
is a straighforward representation of the possible variations on a memory 
policy.

Diff to use bitmap_scnlistprintf and bitmap_parselist.

Index: linux-2.6.13-rc3-mm3/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc3-mm3.orig/mm/mempolicy.c	2005-07-29 17:49:41.000000000 -0700
+++ linux-2.6.13-rc3-mm3/mm/mempolicy.c	2005-07-29 17:52:13.000000000 -0700
@@ -1181,7 +1181,6 @@
 	char *p = buffer;
 	char *e = buffer + maxlen;
 	int first = 1;
-	int node;
 	struct zone **z;
 
 	if (!pol || pol->policy == MPOL_DEFAULT) {
@@ -1223,18 +1222,7 @@
 			return -ENOSPC;
 
 		p += sprintf(p, "interleave={");
-
-		for_each_node(node)
-			if (test_bit(node, pol->v.nodes)) {
-				if (!first)
-					*p++ = ',';
-				else
-					first = 0;
-				if (e < p + 2 /* min bytes that follow */ + 4 /* node number */)
-					return -ENOSPC;
-				p += sprintf(p, "%d", node);
-			}
-
+		p += bitmap_scnprintf(p, e-p-2, pol->v.nodes, MAX_NUMNODES);
 		*p++ = '}';
 		*p++ = 0;
 		return p-buffer;
@@ -1279,20 +1267,8 @@
 	} else if (strnicmp(buffer, "interleave={", 12) == 0) {
 
 		pol->policy = MPOL_INTERLEAVE;
-		p = buffer + 12;
-		bitmap_zero(pol->v.nodes, MAX_NUMNODES);
-
-		do {
-			node = simple_strtoul(p, &p, 10);
-
-			/* Check here for cpuset restrictions on nodes */
-			if (node >= MAX_NUMNODES || !node_online(node))
-				goto out;
-			set_bit(node, pol->v.nodes);
-
-		} while (*p++ == ',');
-
-		if (p[-1] != '}' || bitmap_empty(pol->v.nodes, MAX_NUMNODES))
+		if (bitmap_parselist(buffer + 12, pol->v.nodes, MAX_NUMNODES) ||
+		    bitmap_empty(pol->v.nodes, MAX_NUMNODES))
 			goto out;
 
 	} else if (strnicmp(buffer, "bind={", 6) == 0) {

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30  0:53   ` Christoph Lameter
@ 2005-07-30  5:54     ` Paul Jackson
  2005-07-30 15:48       ` Christoph Lameter
  2005-07-30  6:00     ` Paul Jackson
  1 sibling, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-07-30  5:54 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Christoph wrote:
> you could have a look at my initial patch which actually
> included a description of the syntax 

Yes - I could (I did actually).

Something like that should be part of the record here.  Not everyone
has your earlier linux-mm email thread right at hand.


> Diff to use bitmap_scnlistprintf and bitmap_parselist.

That was quick - thanks.

Once we have a clear description of this syntax in the record,
I anticipate raising as an issue that this syntax does not have a
single integer or string token value per file (or at most, an array
or list of comparable integer values).

But first things first.  Until any lurkers can easily see what is being
proposed, it is inconvenient to carry on discussions of alternatives
and their relative merits.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30  0:53   ` Christoph Lameter
  2005-07-30  5:54     ` Paul Jackson
@ 2005-07-30  6:00     ` Paul Jackson
  2005-07-30 17:56       ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-07-30  6:00 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Oh - I should have mentioned this before - if you are displaying and
parsing node lists (nodemask_t) then there are wrappers for these
bitmap routines in linux/nodemask.h:

 * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
 * int nodemask_parse(ubuf, ulen, mask)   Parse ascii string as nodemask
 * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
 * int nodelist_parse(buf, map)           Parse ascii string as nodelist

See nodemask.h for other such useful routines.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30  5:54     ` Paul Jackson
@ 2005-07-30 15:48       ` Christoph Lameter
  2005-07-31  1:45         ` Paul Jackson
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-07-30 15:48 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Fri, 29 Jul 2005, Paul Jackson wrote:

> Once we have a clear description of this syntax in the record,
> I anticipate raising as an issue that this syntax does not have a
> single integer or string token value per file (or at most, an array
> or list of comparable integer values).

The current patch only outputs the memory policy via
smaps/emaps. However, this could be construet as a single string
describing the policy.

>From my earlier post:

default                 -> Reset allocation policy to default
prefer=<node>           -> Prefer allocation on specified node
interleave={nodelist}   -> Interleaved allocation on the given nodes
bind={zonelist}         -> Restrict allocation to the specified zones.

Zones are specified by either only providing the node number or using the
notation zone/name. I.e. 3/normal 1/high 0/dma etc.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30  6:00     ` Paul Jackson
@ 2005-07-30 17:56       ` Christoph Lameter
  2005-07-31  1:14         ` Paul Jackson
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-07-30 17:56 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Fri, 29 Jul 2005, Paul Jackson wrote:

> Oh - I should have mentioned this before - if you are displaying and
> parsing node lists (nodemask_t) then there are wrappers for these
> bitmap routines in linux/nodemask.h:
> 
>  * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing
>  * int nodemask_parse(ubuf, ulen, mask)   Parse ascii string as nodemask
>  * int nodelist_scnprintf(buf, len, mask) Format nodemask as list for printing
>  * int nodelist_parse(buf, map)           Parse ascii string as nodelist
> 
> See nodemask.h for other such useful routines.

Hmmm. Would be great if we had a nodelist in the policy structure but it 
is a zonelist instead and my conversion functions deal with zonelists and 
not nodelists. Strangely mpol_new converts a node bit mask into a 
list of zones.

As a consequence the zonelist always begins with zones of the lowest node. 
Page allocation will therefore always fill up the lower nodes before going
to the higher nodes. Is this what we intended? I would have expected that 
BIND would restrict to a set of nodes and then allocate locally on the 
node the process is running on if possible. That is at least what I 
gather from reading the documentation.

Maybe I do not fully understand what is going on here but this kind of 
implementation for bind must be hurting performance of applications by 
not allocating pages locally. It also puts extremely high 
memory pressure on lower numbered nodes.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30 17:56       ` Christoph Lameter
@ 2005-07-31  1:14         ` Paul Jackson
  2005-07-31  1:21           ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-07-31  1:14 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Christoph wrote:
> Strangely mpol_new converts a node bit mask into a 
> list of zones.

It's not strange.  The numa interface calls mbind, set_mempolicy
and get_mempolicy, between kernel and user, don't deal in zones.
They deal in nodes.

I suspect you should do the same.  I doubt we want to expose
zones to user space here.

I will leave it to Andi to address your questions about memory
pressure on lower numbered nodes when using MPOL_BIND.  I suspect
that for some of the uses you consider here, such as binding to a
set of more than one node, that cpusets will provide capabilities
closer to what you have in mind.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  1:14         ` Paul Jackson
@ 2005-07-31  1:21           ` Christoph Lameter
  2005-07-31  1:50             ` Paul Jackson
  2005-07-31  2:01             ` Paul Jackson
  0 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-07-31  1:21 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Sat, 30 Jul 2005, Paul Jackson wrote:

> I suspect you should do the same.  I doubt we want to expose
> zones to user space here.

The problem is how to convert them back for display. Match the zones in 
groups of three to the zones in a node and then print out the node?
 
> I will leave it to Andi to address your questions about memory
> pressure on lower numbered nodes when using MPOL_BIND.  I suspect
> that for some of the uses you consider here, such as binding to a
> set of more than one node, that cpusets will provide capabilities
> closer to what you have in mind.

This has nothing to do with what is in my mind. The implementation is not 
conforming to the documentation. This is from "man numa_set_bind":

numa_bind binds the current thread and its children to the
       nodes  specified  in  nodemask.  They will only run on the
       CPUs of the specified nodes  and  only  able  to  allocate
       memory  from them.  This function is equivalent to calling
       numa_run_on_node_mask and numa_set_membind with  the  same
       argument.

numa_set_membind sets the  memory  allocation  mask.   The
       thread  will  only  allocate  memory from the nodes set in
       nodemask.   Passing  an  argument  of   numa_no_nodes   or
       numa_all_nodes turns off memory binding to specific nodes.


I do not get from this that memory is allocated always beginning with the 
lowest node first but that the nodes from which allocations can be done is 
restricted.

Who else would know details about memory policies?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-30 15:48       ` Christoph Lameter
@ 2005-07-31  1:45         ` Paul Jackson
  0 siblings, 0 replies; 44+ messages in thread
From: Paul Jackson @ 2005-07-31  1:45 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Christoph wrote:
> this could be construet as a single string

I said "single token."  I meant as in a single word, not a string of
multiple tokens having its own internal syntax.  Heck - if by 'string'
you just mean a sequence of bytes, then I could construe any data
representation as a 'string'.

I don't know if it is a good idea in this case, but each time we add
another pseudo file system interface to the kernel, we should make
an effort to keep the contents of each such pseudo file to a number,
a list of comparable numbers, or a single word.

I'm pretty sure this convention (single number or word per file,
or at most a list of numbers) has been encouraged elsewhere, though
I don't see where offhand.

I do think it is a good idea to be reluctant to introduce new magic
syntax, such as seen in "bind={3/normal 1/high 0/dma}" in kernel-user
interfaces.  Though, as I noted in my previous post a few minutes ago,
I sure hope we don't need zones here anyway - just nodes.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  1:21           ` Christoph Lameter
@ 2005-07-31  1:50             ` Paul Jackson
  2005-07-31  2:01             ` Paul Jackson
  1 sibling, 0 replies; 44+ messages in thread
From: Paul Jackson @ 2005-07-31  1:50 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

> Who else would know details about memory policies?

Andi Kleen.  I do not know of anyone else who can
respond to the points you observe.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  1:21           ` Christoph Lameter
  2005-07-31  1:50             ` Paul Jackson
@ 2005-07-31  2:01             ` Paul Jackson
  2005-07-31  2:05               ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-07-31  2:01 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Christoph wrote:
> The problem is how to convert them back for display. Match the zones in 
> groups of three to the zones in a node and then print out the node?

What does get_mempolicy(2) do?  I doubt it assumes groups of three.
See further the case for MPOL_BIND, in mm/mempolicy.c:get_zonemask().

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  2:01             ` Paul Jackson
@ 2005-07-31  2:05               ` Christoph Lameter
  2005-07-31  2:12                 ` Paul Jackson
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-07-31  2:05 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Sat, 30 Jul 2005, Paul Jackson wrote:

> Christoph wrote:
> > The problem is how to convert them back for display. Match the zones in 
> > groups of three to the zones in a node and then print out the node?
> 
> What does get_mempolicy(2) do?  I doubt it assumes groups of three.
> See further the case for MPOL_BIND, in mm/mempolicy.c:get_zonemask().

It sets all the node bits by looping over all zones using 
zone->zone_pgdat->node_id.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  2:05               ` Christoph Lameter
@ 2005-07-31  2:12                 ` Paul Jackson
  2005-07-31  2:15                   ` Christoph Lameter
  2005-08-01 18:48                   ` Christoph Lameter
  0 siblings, 2 replies; 44+ messages in thread
From: Paul Jackson @ 2005-07-31  2:12 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm

Christoph wrote:
> It sets all the node bits by looping over all zones using 
> zone->zone_pgdat->node_id.

Yes, for 'all zones' in the mempolicy.

Would it make sense for you to do the same thing, when displaying
mempolicies in /proc/<pid>/numa_policy?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  2:12                 ` Paul Jackson
@ 2005-07-31  2:15                   ` Christoph Lameter
  2005-08-01 18:48                   ` Christoph Lameter
  1 sibling, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-07-31  2:15 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm

On Sat, 30 Jul 2005, Paul Jackson wrote:

> Christoph wrote:
> > It sets all the node bits by looping over all zones using 
> > zone->zone_pgdat->node_id.
> 
> Yes, for 'all zones' in the mempolicy.
> 
> Would it make sense for you to do the same thing, when displaying
> mempolicies in /proc/<pid>/numa_policy?

Of course. New rev is already in the works. This will simplify things 
quite a bit. Great.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-07-31  2:12                 ` Paul Jackson
  2005-07-31  2:15                   ` Christoph Lameter
@ 2005-08-01 18:48                   ` Christoph Lameter
  2005-08-01 18:54                     ` Christoph Lameter
                                       ` (2 more replies)
  1 sibling, 3 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-01 18:48 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

On Sat, 30 Jul 2005, Paul Jackson wrote:

> Would it make sense for you to do the same thing, when displaying
> mempolicies in /proc/<pid>/numa_policy?

Here is a new rev of the conversions patch. Thanks for all the feedback:

---

String conversions for memory policy

This patch adds two new functions to mm/mempolicy.c that allow the conversion
of a memory policy to a textual representation and vice versa.

Syntax of textual representation:

default
preferred=<nodenumber>
bind=<nodelist>
interleave=<nodelist>

Signed-off-by: Christoph Lameter <christoph@lameter.com>

Index: linux-2.6.13-rc4/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc4.orig/mm/mempolicy.c	2005-08-01 10:49:36.000000000 -0700
+++ linux-2.6.13-rc4/mm/mempolicy.c	2005-08-01 11:27:33.000000000 -0700
@@ -1170,3 +1170,100 @@
 {
 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	char *e = buffer + maxlen;
+
+	if (!pol || pol->policy == MPOL_DEFAULT) {
+
+		if (e < p + 8)
+			return -ENOSPC;
+
+		strcpy(buffer,"default");
+		p += 7;
+
+	} else if (pol->policy == MPOL_PREFERRED) {
+
+		if (e < p + 8 /* fixed string size */ + 4 /* max len of node number */)
+			return -ENOSPC;
+
+		p += sprintf(p, "prefer=%d", pol->v.preferred_node);
+
+	} else if (pol->policy == MPOL_BIND) {
+		nodemask_t nodes;
+
+		get_zonemask(pol, nodes.bits);
+
+		if (e < p + 8)
+			return -ENOSPC;
+
+		p += sprintf(p, "bind=");
+	 	p += nodelist_scnprintf(p, e - p, nodes);
+
+	} else if (pol->policy == MPOL_INTERLEAVE) {
+
+		if (e < p + 14)
+			return -ENOSPC;
+
+		p += sprintf(p, "interleave=");
+	 	p += nodelist_scnprintf(p, e - p, * (nodemask_t *)pol->v.nodes);
+
+	} else {
+
+		BUG();
+		return -EFAULT;
+
+	}
+	return p - buffer;
+}
+
+/*
+ * Convert a representation of a memory policy from text
+ * form to binary.
+ *
+ * Returns either a memory policy or NULL for error.
+ */
+struct mempolicy *str_to_mpol(char *buffer)
+{
+	nodemask_t nodes;
+	int mode;
+
+	if (strnicmp(buffer, "default", 7) == 0)
+		return &default_policy;
+
+	if (strnicmp(buffer, "prefer=", 7) == 0) {
+		int node;
+
+		mode = MPOL_PREFERRED;
+		node = simple_strtoul(buffer + 7, NULL, 10);
+		if (node >= MAX_NUMNODES || !node_online(node))
+			return NULL;
+
+		nodes_clear(nodes);
+		node_set(node, nodes);
+
+	} else if (strnicmp(buffer, "interleave=", 11) == 0) {
+
+		mode = MPOL_INTERLEAVE;
+		if (nodelist_parse(buffer + 11, nodes) ||
+		    nodes_empty(nodes))
+			return NULL;
+
+	} else if (strnicmp(buffer, "bind=", 5) == 0) {
+
+		mode = MPOL_BIND;
+		if (nodelist_parse(buffer+5, nodes))
+			return NULL;
+
+	}
+
+	return mpol_new(MPOL_BIND, nodes.bits);
+}
+
Index: linux-2.6.13-rc4/include/linux/mempolicy.h
===================================================================
--- linux-2.6.13-rc4.orig/include/linux/mempolicy.h	2005-08-01 10:49:35.000000000 -0700
+++ linux-2.6.13-rc4/include/linux/mempolicy.h	2005-08-01 10:49:41.000000000 -0700
@@ -157,6 +157,10 @@
 extern void numa_policy_init(void);
 extern struct mempolicy default_policy;
 
+/* Conversion functions */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+struct mempolicy *str_to_mpol(char *buffer);
+
 #else
 
 struct mempolicy {};

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 18:48                   ` Christoph Lameter
@ 2005-08-01 18:54                     ` Christoph Lameter
  2005-08-01 23:03                     ` Paul Jackson
  2005-08-03  8:48                     ` Andi Kleen
  2 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-01 18:54 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

The full patchset that also includes the modifications to 
/proc/<pid>/ smaps to support outputting numa information can be found at

ftp://ftp.kernel.org/pub/linux/kernel/people/christoph/numa/2.6.13-rc4-mm1.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 18:48                   ` Christoph Lameter
  2005-08-01 18:54                     ` Christoph Lameter
@ 2005-08-01 23:03                     ` Paul Jackson
  2005-08-01 23:24                       ` Christoph Lameter
  2005-08-03  8:48                     ` Andi Kleen
  2 siblings, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-08-01 23:03 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm, ak

Ok - its definitely getting shorter and sweeter.  Good.

There are quite a few small magic numbers - is there someway to code
these out?  See also my comment below, mentioning kernel/power/main.c.
Changing one of the strings should not break the code until the
corresponding magic string length numbers are changed to match.

+               p += 7;
+               if (e < p + 8 /* fixed string size */ + 4 /* max len of node number */)
+               if (e < p + 8)
+               if (e < p + 14)
+       if (strnicmp(buffer, "default", 7) == 0)
+       if (strnicmp(buffer, "prefer=", 7) == 0) {
+               node = simple_strtoul(buffer + 7, NULL, 10);
+       } else if (strnicmp(buffer, "interleave=", 11) == 0) {
+               if (nodelist_parse(buffer + 11, nodes) ||
+       } else if (strnicmp(buffer, "bind=", 5) == 0) {
+               if (nodelist_parse(buffer+5, nodes))

And the code should not break, silently overwriting kernel stack, if
and when someone tries this on a 10,000 node system (the '4' max len of
node number).  There are snprintf style routines to avoid such risks.

I'd be tempted to code for just lower case matches, not case
insensitive, which seems like gratuitous flexibility to me.

The existing numa code has a routine get_nodes(), in mm/mempolicy.c,
that obtains the nodelist from the arguments to the mbind or
set_mempolicy system call, and does a fair bit of checking on the
nodelist, in the context of the current task (the task whose mempolicy
is to be changed).  Only after this checking can the resulting policy
and nodelist be applied.  In the other parts of your intended changes,
where you expect to use the results of this code to convert a string
to a numa policy, where to you expect to duplicate the portion
of get_nodes() that checks the nodelist and policy, in the target
tasks context?  This might involve refactoring get_nodes() into two
routines - one to obtain a nodemask_t representation of the nodelist
passed in via the set_mempolicy/mbind system calls, and the other to
perform the checking.

In your related patch, you show how to merge this display of mempolicy
into the new /proc/<pid>/smap (rss size of each memory area) file.
But this smap file (or, as you renamed it, emap file) is read-only,
so far as I can tell.  It enables display of information, but not
changing it.  How do you propose to support changing a memory policy?
How will the other half of this patch, that converts strings to
memory policies, be used?  If I knew that (you've probably said,
and I've probably forgotten - sorry) then I might then ask whether
that other half should not also be used to display memory policies,
instead of adapting smap aka emap.

There is an alternative representation of this information which
I have in mind, but cannot yet evaluate the usefulness of, due to
the above aspects that I don't understand yet.  Instead of one file
(or one line in a file) having one of the following formats:

    default
    preferred=<nodenumber>
    bind=<nodelist>
    interleave=<nodelist>

Would it be better to have two files, the first of which has one of
the strings:

    default
    preferred
    bind
    interleave

and the second of which has a nodelist, the significance of which
depends on the policy specified in the first.  These files might be
called "numa_policy" and "numa_nodelist", or some such.  Perhaps they
would be both read and write (display and parse), which would remove
the need for a separate read-only display in the smap/emap file.
Since I am still missing some "big picture" stuff here, I don't know
if this option is worth considering or not.  But it would honor the
ideal of a single token, number or list of numbers per file.

Consider as an example the display and parsing of pm_states by the
state_show() and state_store() routines of kernel/power/main.c, for
ways to code this that separate the data (list of strings parsed
and the mapping between constants and strings) from the code logic,
and that use strlen() to avoid magic length numbers.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 23:03                     ` Paul Jackson
@ 2005-08-01 23:24                       ` Christoph Lameter
  2005-08-01 23:59                         ` Paul Jackson
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-01 23:24 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

On Mon, 1 Aug 2005, Paul Jackson wrote:

> In your related patch, you show how to merge this display of mempolicy
> into the new /proc/<pid>/smap (rss size of each memory area) file.
> But this smap file (or, as you renamed it, emap file) is read-only,
> so far as I can tell.  It enables display of information, but not
> changing it.  How do you propose to support changing a memory policy?

That is not clear yet. We need to discuss that. First I thought of just
having a patch that creates /proc/<pid>/numa_policy which allows
to read and write the process policy (write via notifier not directly).

> that other half should not also be used to display memory policies,
> instead of adapting smap aka emap.

e/smap is a generic way to get information about storage use of a vma. 
Therefore displaying numa node information etc there makes a lot of 
sense.

> Would it be better to have two files, the first of which has one of
> the strings:

Yes. I initially had something like that in mind and have a partial 
implementation but such an approach gets extremely complicated and 
difficult to handle since you need to create multiple 
levels of directories in /proc/<pid>/xx. It is easier to obtain the 
complete memory information from a single file. We already have that in 
/proc/<pid>/maps.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 23:24                       ` Christoph Lameter
@ 2005-08-01 23:59                         ` Paul Jackson
  2005-08-02  0:03                           ` Christoph Lameter
  2005-08-02  0:15                           ` Christoph Lameter
  0 siblings, 2 replies; 44+ messages in thread
From: Paul Jackson @ 2005-08-01 23:59 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm, ak

Christoph wrote:
> you need to create multiple 
> levels of directories in /proc/<pid>/xx

You do?

Where's the new multiple directory levels in the two files:

	/proc/<pid>/numa_policy		# contains one word
	/proc/<pid>/numa_nodelist	# contains one nodelist

There are some obvious negatives to this idea, but having to
"create multiple levels of directories" doesn't seem to be
one of them.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 23:59                         ` Paul Jackson
@ 2005-08-02  0:03                           ` Christoph Lameter
  2005-08-02  0:15                           ` Christoph Lameter
  1 sibling, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-02  0:03 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

On Mon, 1 Aug 2005, Paul Jackson wrote:

> Christoph wrote:
> > you need to create multiple 
> > levels of directories in /proc/<pid>/xx
> 
> You do?
> 
> Where's the new multiple directory levels in the two files:
> 
> 	/proc/<pid>/numa_policy		# contains one word
> 	/proc/<pid>/numa_nodelist	# contains one nodelist
> 
> There are some obvious negatives to this idea, but having to
> "create multiple levels of directories" doesn't seem to be
> one of them.

One would need /proc/<pid>/<vma>/policy and /proc/<pid>/<vma>/nodelist 
to control the allocation in the vmas.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 23:59                         ` Paul Jackson
  2005-08-02  0:03                           ` Christoph Lameter
@ 2005-08-02  0:15                           ` Christoph Lameter
  2005-08-02  5:33                             ` Paul Jackson
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-02  0:15 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

New version without the magic numbers ...

---

String conversions for memory policy

This patch adds two new functions to mm/mempolicy.c that allow the conversion
of a memory policy to a textual representation and vice versa.

Syntax of textual representation:

default
preferred=<nodenumber>
bind=<nodelist>
interleave=<nodelist>

Signed-off-by: Christoph Lameter <christoph@lameter.com>

Index: linux-2.6.13-rc4-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc4-mm1.orig/mm/mempolicy.c	2005-08-01 12:59:49.000000000 -0700
+++ linux-2.6.13-rc4-mm1/mm/mempolicy.c	2005-08-01 17:14:08.000000000 -0700
@@ -1170,3 +1170,95 @@
 {
 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
+
+static const char *policy_types[] = { "default", "prefer", "bind", "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	int l;
+	nodemask_t nodes;
+	mode = pol ? pol->policy : MPOL_DEFAULT;
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		nodes_clear(nodes);
+		break;
+
+	case MPOL_PREFERRED:
+		nodes_clear(nodes);
+		node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		get_zonemask(pol, nodes.bits);
+		break;
+
+	case MPOL_INTERLEAVE:
+		nodes = * (nodemask_t *)pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+		return -EFAULT;
+	}
+
+	l = strlen(policy_types[mode]);
+	if (buffer + maxlen > p + l + 1)
+		return -ENOSPC;
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+
+		if (buffer + maxlen > p + 2)
+			return -ENOSPC;
+
+		*p++ = '=';
+	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}
+	return p - buffer;
+}
+
+/*
+ * Convert a representation of a memory policy from text
+ * form to binary.
+ *
+ * Returns either a memory policy or NULL for error.
+ */
+struct mempolicy *str_to_mpol(char *buffer)
+{
+	nodemask_t nodes;
+	int mode;
+	int l;
+
+	for(mode = 0; mode <= MPOL_MAX; mode++) {
+		l = strlen(policy_types[mode]);
+		if (strnicmp(policy_types[mode], buffer, l)
+			&& (mode == MPOL_DEFAULT || buffer[l] == '='))
+				break;
+	}
+
+	if (mode > MPOL_MAX)
+		return NULL;
+
+	if (mode == MPOL_DEFAULT)
+		return &default_policy;
+
+	if (nodelist_parse(buffer + l + 1, nodes) || nodes_empty(nodes))
+		return NULL;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes.bits);
+
+	if (!mpol_check_policy(mode, nodes.bits))
+		return NULL;
+	return mpol_new(mode, nodes.bits);
+}
Index: linux-2.6.13-rc4-mm1/include/linux/mempolicy.h
===================================================================
--- linux-2.6.13-rc4-mm1.orig/include/linux/mempolicy.h	2005-08-01 12:59:45.000000000 -0700
+++ linux-2.6.13-rc4-mm1/include/linux/mempolicy.h	2005-08-01 13:00:01.000000000 -0700
@@ -157,6 +157,10 @@
 extern void numa_policy_init(void);
 extern struct mempolicy default_policy;
 
+/* Conversion functions */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+struct mempolicy *str_to_mpol(char *buffer);
+
 #else
 
 struct mempolicy {};

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-02  0:15                           ` Christoph Lameter
@ 2005-08-02  5:33                             ` Paul Jackson
  2005-08-02  6:11                               ` Christoph Lameter
  2005-08-02 17:49                               ` Christoph Lameter
  0 siblings, 2 replies; 44+ messages in thread
From: Paul Jackson @ 2005-08-02  5:33 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm, ak

Christoph wrote:
> New version without the magic numbers ...

Good.

===

How about replacing:

  static const char *policy_types[] = { "default", "prefer", "bind", "interleave" };

with:

  static const char *policy_types[] = {
	[MPOL_DEFAULT]    = "default",
	[MPOL_PREFERRED]  = "prefer",
	[MPOL_BIND]       = "bind",
	[MPOL_INTERLEAVE] = "interleave"
  };

so that the mapping of the MPOL_* define constants to strings is
explicit, not implicit.

===

Maybe I need more caffeine, but the following tests seem backwards:

+	if (buffer + maxlen > p + l + 1)
+		return -ENOSPC;

and

+		if (buffer + maxlen > p + 2)
+			return -ENOSPC;

===

Can the following:

+	int l;
+	...
+
+	l = strlen(policy_types[mode]);
+	if (buffer + maxlen > p + l + 1)
+		return -ENOSPC;
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+
+		if (buffer + maxlen > p + 2)
+			return -ENOSPC;
+
+		*p++ = '=';
+	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}

be rewritten to:

	char *bufend = buffer + maxlen;
	...

	p += scnprintf(p, bufend - p, "%s", policy_types[mode]);
	if (!nodes_empty(nodes)) {
		p += scnprintf(p, bufend - p, "=");
		p += nodelist_scnprintf(p, bufend - p, nodes);
	}
	if (p >= bufend - 1)
		return -ENOSPC;

Less code, more consistent code for each buffer addition, fails with
ENOSPC in the case that the nodelist only partially fits rather than
truncating it without notice, and fixes any possible problem with the
above tests being backwards - by removing the tests ;).

This suggested replacement code does have one weakness, in that it
cannot distinguish the case that the buffer was exactly the right
size from the case it was too small, and errors with -ENOSPC in
either case.  I don't think that this case is worth adding extra
logic to distinguish, in this situation.

===

+	for(mode = 0; mode <= MPOL_MAX; mode++) {

Space after 'for'

===

There should probably be comments that these routines must
execute in the context of the task whose mempolicies are
being displayed or modified.

==

There should probably be a doc style comment introducing
the mpol_to_str() and str_to_mpol() routines, as described
in Documentation/kernel-doc-nano-HOWTO.txt.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-02  5:33                             ` Paul Jackson
@ 2005-08-02  6:11                               ` Christoph Lameter
  2005-08-03  5:44                                 ` Paul Jackson
  2005-08-02 17:49                               ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-02  6:11 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

Could you rework the patch to all of that? Seems that you have a clear 
vision as to where to go.

On Mon, 1 Aug 2005, Paul Jackson wrote:

> Christoph wrote:
> > New version without the magic numbers ...
> 
> Good.
> 
> ===
> 
> How about replacing:
> 
>   static const char *policy_types[] = { "default", "prefer", "bind", "interleave" };
> 
> with:
> 
>   static const char *policy_types[] = {
> 	[MPOL_DEFAULT]    = "default",
> 	[MPOL_PREFERRED]  = "prefer",
> 	[MPOL_BIND]       = "bind",
> 	[MPOL_INTERLEAVE] = "interleave"
>   };
> 
> so that the mapping of the MPOL_* define constants to strings is
> explicit, not implicit.
> 
> ===
> 
> Maybe I need more caffeine, but the following tests seem backwards:
> 
> +	if (buffer + maxlen > p + l + 1)
> +		return -ENOSPC;
> 
> and
> 
> +		if (buffer + maxlen > p + 2)
> +			return -ENOSPC;
> 
> ===
> 
> Can the following:
> 
> +	int l;
> +	...
> +
> +	l = strlen(policy_types[mode]);
> +	if (buffer + maxlen > p + l + 1)
> +		return -ENOSPC;
> +	strcpy(p, policy_types[mode]);
> +	p += l;
> +
> +	if (!nodes_empty(nodes)) {
> +
> +		if (buffer + maxlen > p + 2)
> +			return -ENOSPC;
> +
> +		*p++ = '=';
> +	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
> +	}
> 
> be rewritten to:
> 
> 	char *bufend = buffer + maxlen;
> 	...
> 
> 	p += scnprintf(p, bufend - p, "%s", policy_types[mode]);
> 	if (!nodes_empty(nodes)) {
> 		p += scnprintf(p, bufend - p, "=");
> 		p += nodelist_scnprintf(p, bufend - p, nodes);
> 	}
> 	if (p >= bufend - 1)
> 		return -ENOSPC;
> 
> Less code, more consistent code for each buffer addition, fails with
> ENOSPC in the case that the nodelist only partially fits rather than
> truncating it without notice, and fixes any possible problem with the
> above tests being backwards - by removing the tests ;).
> 
> This suggested replacement code does have one weakness, in that it
> cannot distinguish the case that the buffer was exactly the right
> size from the case it was too small, and errors with -ENOSPC in
> either case.  I don't think that this case is worth adding extra
> logic to distinguish, in this situation.
> 
> ===
> 
> +	for(mode = 0; mode <= MPOL_MAX; mode++) {
> 
> Space after 'for'
> 
> ===
> 
> There should probably be comments that these routines must
> execute in the context of the task whose mempolicies are
> being displayed or modified.
> 
> ==
> 
> There should probably be a doc style comment introducing
> the mpol_to_str() and str_to_mpol() routines, as described
> in Documentation/kernel-doc-nano-HOWTO.txt.
> 
> -- 
>                   I won't rest till it's the best ...
>                   Programmer, Linux Scalability
>                   Paul Jackson <pj@sgi.com> 1.925.600.0401
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-02  5:33                             ` Paul Jackson
  2005-08-02  6:11                               ` Christoph Lameter
@ 2005-08-02 17:49                               ` Christoph Lameter
  2005-08-03  5:42                                 ` Paul Jackson
  1 sibling, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-02 17:49 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

On Mon, 1 Aug 2005, Paul Jackson wrote:

> There should probably be comments that these routines must
> execute in the context of the task whose mempolicies are
> being displayed or modified.

Ok. The str_to_mpol function needs to be executed from the task whose 
memory policies are to be displayed but why would mpol_to_str need that?

I am quite concerned about policy layer due to :

1. No separation between sys_ and do_ functions. Its not easy to 
   use any of the policy functions from the kernel itself.
2. The bind strangeness. Does not preserve locality of allocation and 
   simply uses lower nodes first. This is quite
   surprising and not mentioned in the documentation.
3. Mixing nodemask_t and bitmaps (probably due to user space variable
   bitmap sizes). May not be clean due to not doing 1.
4. Unexpected side effects of some functions (like get_nodes updating the
   current mems allowed, the filtering by nodes allowed).

All of this probably makes the policy layer difficult to maintain. Thus 
only a restricted set of people that are competent on it.

New patch:

---

String conversions for memory policy

This patch adds two new functions to mm/mempolicy.c that allow the conversion
of a memory policy to a textual representation and vice versa.

Syntax of textual representation:

default
preferred=<nodenumber>
bind=<nodelist>
interleave=<nodelist>

Signed-off-by: Christoph Lameter <christoph@lameter.com>

Index: linux-2.6.13-rc4-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.13-rc4-mm1.orig/mm/mempolicy.c	2005-08-01 12:59:49.000000000 -0700
+++ linux-2.6.13-rc4-mm1/mm/mempolicy.c	2005-08-02 10:38:50.000000000 -0700
@@ -1170,3 +1170,101 @@
 {
 	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
 }
+
+static const char *policy_types[] = { "default", "prefer", "bind", "interleave" };
+
+/*
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
+ *
+ * May be called from outside of the context of a task.
+ */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	int l;
+	nodemask_t nodes;
+	int mode = pol ? pol->policy : MPOL_DEFAULT;
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+		nodes_clear(nodes);
+		break;
+
+	case MPOL_PREFERRED:
+		nodes_clear(nodes);
+		node_set(pol->v.preferred_node, nodes);
+		break;
+
+	case MPOL_BIND:
+		get_zonemask(pol, nodes.bits);
+		break;
+
+	case MPOL_INTERLEAVE:
+		nodes = * (nodemask_t *)pol->v.nodes;
+		break;
+
+	default:
+		BUG();
+		return -EFAULT;
+	}
+
+	l = strlen(policy_types[mode]);
+	if (buffer + maxlen < p + l + 1)
+		return -ENOSPC;
+
+	strcpy(p, policy_types[mode]);
+	p += l;
+
+	if (!nodes_empty(nodes)) {
+
+		if (buffer + maxlen < p + 2)
+			return -ENOSPC;
+
+		*p++ = '=';
+	 	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
+	}
+	return p - buffer;
+}
+
+/*
+ * Convert a representation of a memory policy from text
+ * form to binary. This can only be done from the tasks that
+ * is using the memory policy since the nodes used by a policy
+ * may be restricted by the cpuset.
+ *
+ * Returns either a memory policy or NULL for error.
+ */
+struct mempolicy *str_to_mpol(char *buffer)
+{
+	nodemask_t nodes;
+	int mode;
+	int l;
+
+	for (mode = 0; mode <= MPOL_MAX; mode++) {
+		l = strlen(policy_types[mode]);
+		if (strnicmp(policy_types[mode], buffer, l)
+			&& (mode == MPOL_DEFAULT || buffer[l] == '='))
+				break;
+	}
+
+	if (mode > MPOL_MAX)
+		return NULL;
+
+	if (mode == MPOL_DEFAULT)
+		return &default_policy;
+
+	if (nodelist_parse(buffer + l + 1, nodes) || nodes_empty(nodes))
+		return NULL;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes.bits);
+
+	if (!mpol_check_policy(mode, nodes.bits))
+		return NULL;
+
+	return mpol_new(mode, nodes.bits);
+}
Index: linux-2.6.13-rc4-mm1/include/linux/mempolicy.h
===================================================================
--- linux-2.6.13-rc4-mm1.orig/include/linux/mempolicy.h	2005-08-01 12:59:45.000000000 -0700
+++ linux-2.6.13-rc4-mm1/include/linux/mempolicy.h	2005-08-01 13:00:01.000000000 -0700
@@ -157,6 +157,10 @@
 extern void numa_policy_init(void);
 extern struct mempolicy default_policy;
 
+/* Conversion functions */
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
+struct mempolicy *str_to_mpol(char *buffer);
+
 #else
 
 struct mempolicy {};

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-02 17:49                               ` Christoph Lameter
@ 2005-08-03  5:42                                 ` Paul Jackson
  0 siblings, 0 replies; 44+ messages in thread
From: Paul Jackson @ 2005-08-03  5:42 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm, ak

Christoph wrote:
> Ok. The str_to_mpol function needs to be executed from the task whose 
> memory policies are to be displayed but why would mpol_to_str need that?

You are probably right.  I doubt mpol_to_str needs that.  I was
probably painting with too wide a brush.


> I am quite concerned about policy layer due to :

I will leave this portion of the discussion to others.

I attempted to tackle some related concerns in a patch I sent lkml on
Aug 2, 2004 (exactly 1 year ago ;) under the Subject:

  subset zonelists and big numa friendly mempolicy MPOL_MBIND
  http://www.ussg.iu.edu/hypermail/linux/kernel/0408.0/0398.html

However this patch was unsuccessful, and I have not had any further
great insights to present here (if ever I had any ;).

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-02  6:11                               ` Christoph Lameter
@ 2005-08-03  5:44                                 ` Paul Jackson
  2005-08-03 16:19                                   ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Paul Jackson @ 2005-08-03  5:44 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: linux-kernel, akpm, ak

Christoph wrote:
> Could you rework the patch to all of that?

Next week - I am on vacation at the moment.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-01 18:48                   ` Christoph Lameter
  2005-08-01 18:54                     ` Christoph Lameter
  2005-08-01 23:03                     ` Paul Jackson
@ 2005-08-03  8:48                     ` Andi Kleen
  2005-08-04 14:11                       ` Christoph Lameter
  2 siblings, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-03  8:48 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Paul Jackson, linux-kernel, akpm, ak

On Mon, Aug 01, 2005 at 11:48:55AM -0700, Christoph Lameter wrote:
> On Sat, 30 Jul 2005, Paul Jackson wrote:
> 
> > Would it make sense for you to do the same thing, when displaying
> > mempolicies in /proc/<pid>/numa_policy?
> 
> Here is a new rev of the conversions patch. Thanks for all the feedback:

I really hate this whole /proc/<pid>/numa_policy thing. /proc/<pid>/maps
was imho always a desaster (hard to parse, slow etc.). Also external
access of NUMA policies has interesting locking issues. I intentionally
didn't add something like that when I designed the original
NUMA API. Please don't add it.

-Andi

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-03  5:44                                 ` Paul Jackson
@ 2005-08-03 16:19                                   ` Christoph Lameter
  0 siblings, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-03 16:19 UTC (permalink / raw)
  To: Paul Jackson; +Cc: linux-kernel, akpm, ak

On Tue, 2 Aug 2005, Paul Jackson wrote:

> Christoph wrote:
> > Could you rework the patch to all of that?
> 
> Next week - I am on vacation at the moment.

I have done all the fixups necessary ..... Just the candy that you wanted 
is missing.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-03  8:48                     ` Andi Kleen
@ 2005-08-04 14:11                       ` Christoph Lameter
  2005-08-04 14:29                         ` Andi Kleen
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 14:11 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, akpm

On Wed, 3 Aug 2005, Andi Kleen wrote:

> I really hate this whole /proc/<pid>/numa_policy thing. /proc/<pid>/maps
> was imho always a desaster (hard to parse, slow etc.). Also external
> access of NUMA policies has interesting locking issues. I intentionally
> didn't add something like that when I designed the original
> NUMA API. Please don't add it.

You designed a NUMA API to control a process memory access patterns 
without the ability to view or modify the policies in use?

The locking issues for the policy information in the task_struct could be 
solved by having a thread execute a function that either sets or gets the 
memory policy. The vma policies already have a locking mechanism.
 
This piece here only does conversion to a string representation so it 
should not be affected by locking issues. Processes need to do proper 
locking when using the conversion functions.


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-04 14:11                       ` Christoph Lameter
@ 2005-08-04 14:29                         ` Andi Kleen
  2005-08-04 16:31                           ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-04 14:29 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, akpm

On Thu, Aug 04, 2005 at 07:11:36AM -0700, Christoph Lameter wrote:
> On Wed, 3 Aug 2005, Andi Kleen wrote:
> 
> > I really hate this whole /proc/<pid>/numa_policy thing. /proc/<pid>/maps
> > was imho always a desaster (hard to parse, slow etc.). Also external
> > access of NUMA policies has interesting locking issues. I intentionally
> > didn't add something like that when I designed the original
> > NUMA API. Please don't add it.
> 
> You designed a NUMA API to control a process memory access patterns 
> without the ability to view or modify the policies in use?

Processes internally can get the information if they want.
Externally I didn't expose it intentionally to avoid locking problems

> The locking issues for the policy information in the task_struct could be 
> solved by having a thread execute a function that either sets or gets the 
> memory policy. The vma policies already have a locking mechanism.

But why? It all only adds complexity. Keep it simple please.

>  
> This piece here only does conversion to a string representation so it 
> should not be affected by locking issues. Processes need to do proper 
> locking when using the conversion functions.

It's useless.

-Andi

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-04 14:29                         ` Andi Kleen
@ 2005-08-04 16:31                           ` Christoph Lameter
  2005-08-04 17:08                             ` Andi Kleen
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 16:31 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, akpm

On Thu, 4 Aug 2005, Andi Kleen wrote:

> > You designed a NUMA API to control a process memory access patterns 
> > without the ability to view or modify the policies in use?
> 
> Processes internally can get the information if they want.
> Externally I didn't expose it intentionally to avoid locking problems

The fundamental purpose of a memory policy layer is to control memory 
allocation  on a system. If there no way to obtain that policy 
information and no way to control the policies then what is the purpose of 
the memory policies? Would it not be better to implement most what is 
currently in the kernel in user space since its restricted to a process 
anyways?

> > The locking issues for the policy information in the task_struct could be 
> > solved by having a thread execute a function that either sets or gets the 
> > memory policy. The vma policies already have a locking mechanism.
> 
> But why? It all only adds complexity. Keep it simple please.

Because its not possible to view and control memory policies otherwise.

> > This piece here only does conversion to a string representation so it 
> > should not be affected by locking issues. Processes need to do proper 
> > locking when using the conversion functions.
> 
> It's useless.

So your point of view is that there will be no control and monitoring of 
the memory usage and policies?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] String conversions for memory policy
  2005-08-04 16:31                           ` Christoph Lameter
@ 2005-08-04 17:08                             ` Andi Kleen
  2005-08-04 17:34                               ` NUMA policy interface Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-04 17:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, akpm

> > > This piece here only does conversion to a string representation so it 
> > > should not be affected by locking issues. Processes need to do proper 
> > > locking when using the conversion functions.
> > 
> > It's useless.
> 
> So your point of view is that there will be no control and monitoring of 
> the memory usage and policies?

External control is implemented for named objects and for process policy.
A process can also monitor its own policies if it wants.

I think the payoff for external monitoring of policies vs complexity 
and cleanliness of interface and long term code impact is too bad to make 
it an attractive option.


-Andi


^ permalink raw reply	[flat|nested] 44+ messages in thread

* NUMA policy interface
  2005-08-04 17:08                             ` Andi Kleen
@ 2005-08-04 17:34                               ` Christoph Lameter
  2005-08-04 21:14                                 ` Andi Kleen
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 17:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Thu, 4 Aug 2005, Andi Kleen wrote:

> > So your point of view is that there will be no control and monitoring of 
> > the memory usage and policies?
> 
> External control is implemented for named objects and for process policy.
> A process can also monitor its own policies if it wants.

Named objects like files and not processes and/or threads? But then these 
named objects do not have memory allocated to them.

> I think the payoff for external monitoring of policies vs complexity 
> and cleanliness of interface and long term code impact is too bad to make 
> it an attractive option.

Well the implementation has the following issues right now:

1. BIND policy implemented in a way that fills up nodes from the lowest 
   to the higest instead of allocating memory on the local node.

2. No separation between sys_ and do_ functions. Therefore difficult
   to use from kernel context.

3. Functions have weird side effect (f.e. get_nodes updating 
   and using cpuset policies). Code is therefore difficult 
   to maintain.

4. Uses bitmaps instead of nodemask_t.

5. No means to figure out where the memory was allocated although
   mempoliy.c implements scans over ptes that would allow that 
   determination.
 
6. Needs hook into page migration layer to move pages to either conform
   to policy or to move them menually.

The long term impact of this missing functionality is already showing 
in the numbers of workarounds that I have seen at a various sites, 

The code is currently complex and difficult to handle because some of the 
issues mentioned above. We need to fix this in order to have clean code 
and in order to control future complexity.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 17:34                               ` NUMA policy interface Christoph Lameter
@ 2005-08-04 21:14                                 ` Andi Kleen
  2005-08-04 21:21                                   ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-04 21:14 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Paul Jackson, linux-kernel, linux-mm

> 1. BIND policy implemented in a way that fills up nodes from the lowest 
>    to the higest instead of allocating memory on the local node.

Hmm, there was a patch from PJ for that at some point. Not sure why it 
was not merged. iirc the first implementation was too complex, but
there was a second reasonable one.

> 
> 2. No separation between sys_ and do_ functions. Therefore difficult
>    to use from kernel context.

set_fs(KERNEL_DS)
Some policies can be even set without that.

There are already kernel users BTW that prove you wrong.


> 3. Functions have weird side effect (f.e. get_nodes updating 
>    and using cpuset policies). Code is therefore difficult 
>    to maintain.

Agreed that should be cleaned up.

> 4. Uses bitmaps instead of nodemask_t.

Should be easy to fix if someone is motivated.  When I wrote the code
nodemask_t didn't exist yet, and when it was merged it wasn't 
converted over. Not a big deal.

> 
> 5. No means to figure out where the memory was allocated although
>    mempoliy.c implements scans over ptes that would allow that 
>    determination.

You lost me here.

>  
> 6. Needs hook into page migration layer to move pages to either conform
>    to policy or to move them menually.

Does it really? So far my feedback from all users I talked to is that they only
use a small subset of the functionality, even what is there is too complex.
Nobody with a real app so far has asked me for page migration.

There was one implementation of simple page migration in Steve L.'s patches,
but that was just because it was too hard to handle one corner case
otherwise.


> The long term impact of this missing functionality is already showing 
> in the numbers of workarounds that I have seen at a various sites, 

Examples? 

-Andi


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 21:14                                 ` Andi Kleen
@ 2005-08-04 21:21                                   ` Christoph Lameter
  2005-08-04 21:41                                     ` Andi Kleen
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 21:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Thu, 4 Aug 2005, Andi Kleen wrote:

> > 1. BIND policy implemented in a way that fills up nodes from the lowest 
> >    to the higest instead of allocating memory on the local node.
> 
> Hmm, there was a patch from PJ for that at some point. Not sure why it 
> was not merged. iirc the first implementation was too complex, but
> there was a second reasonable one.

Yes he mentioned that patch earlier in this thread.

> > 5. No means to figure out where the memory was allocated although
> >    mempoliy.c implements scans over ptes that would allow that 
> >    determination.
> 
> You lost me here.

There is this scan over the page table that verifies if all nodes are 
allocated according to the policy. That scan could easily be used to 
provide a map to the application (and to /proc/<pid>/smap) of where the
memory was allocated.
 
> > 6. Needs hook into page migration layer to move pages to either conform
> >    to policy or to move them menually.
> 
> Does it really? So far my feedback from all users I talked to is that they only
> use a small subset of the functionality, even what is there is too complex.
> Nobody with a real app so far has asked me for page migration.

Maybe we have different customers. My feedback is consistently that this 
is a very urgently feature needed.
 
> There was one implementation of simple page migration in Steve L.'s patches,
> but that was just because it was too hard to handle one corner case
> otherwise.

There is a page migration implementation in the hotplug patchset.

> > The long term impact of this missing functionality is already showing 
> > in the numbers of workarounds that I have seen at a various sites, 
> 
> Examples? 

Two of the high profile ones are NASA and APA. One person from the APA 
posted in one of our earlier discussions.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 21:21                                   ` Christoph Lameter
@ 2005-08-04 21:41                                     ` Andi Kleen
  2005-08-04 22:19                                       ` Christoph Lameter
  0 siblings, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-04 21:41 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, linux-mm

On Thu, Aug 04, 2005 at 02:21:09PM -0700, Christoph Lameter wrote:
> Yes he mentioned that patch earlier in this thread.
> 
> > > 5. No means to figure out where the memory was allocated although
> > >    mempoliy.c implements scans over ptes that would allow that 
> > >    determination.
> > 
> > You lost me here.
> 
> There is this scan over the page table that verifies if all nodes are 
> allocated according to the policy. That scan could easily be used to 
> provide a map to the application (and to /proc/<pid>/smap) of where the

The application can already get it. But it's an ugly feature
that I only used for debugging and I was actually considering
to remove it.

Doing it for external users is a completely different thing though.
I still think those have business in messing with other people's
virtual addresses. In addition I expect it will cause problems
longer term
(did you ever look why mmap on /proc/*/mem is not allowed - it used
to be long ago, but it was impossible to make it work race free and
before that was always a gapping security hole) 

> > > The long term impact of this missing functionality is already showing 
> > > in the numbers of workarounds that I have seen at a various sites, 
> > 
> > Examples? 
> 
> Two of the high profile ones are NASA and APA. One person from the APA 
> posted in one of our earlier discussions.

Ok. I think for those the swapoff per process is the right because
simplest and easiest solution. No complex patch sets needed,
just some changes to an existing code path.

If they cannot afford enough disk space it might be possible
to do the page migration in swap cache like Hugh proposed.

-Andi


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 21:41                                     ` Andi Kleen
@ 2005-08-04 22:19                                       ` Christoph Lameter
  2005-08-04 22:44                                         ` Mike Kravetz
  2005-08-04 23:40                                         ` Andi Kleen
  0 siblings, 2 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 22:19 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Thu, 4 Aug 2005, Andi Kleen wrote:

> > There is this scan over the page table that verifies if all nodes are 
> > allocated according to the policy. That scan could easily be used to 
> > provide a map to the application (and to /proc/<pid>/smap) of where the
> 
> The application can already get it. But it's an ugly feature
> that I only used for debugging and I was actually considering
> to remove it.
> 
> Doing it for external users is a completely different thing though.
> I still think those have business in messing with other people's
> virtual addresses. In addition I expect it will cause problems
> longer term
> (did you ever look why mmap on /proc/*/mem is not allowed - it used
> to be long ago, but it was impossible to make it work race free and
> before that was always a gapping security hole) 

The proc stuff is fake anyways. I would not worry about that. The biggest 
worry is the locking mechanism to make this clean.

There are three possibilites:

1. do what cpusets is doing by versioning.

2. Have the task notifier access the task_struct information.
See http://lwn.net/Articles/145232/ "A new path to the refrigerator"

3. Maybe the easiest: Require mmap_sem to be taken for all policy 
accesses. Currently its only require for vma policies. Then we need
to make a copy of the policy at some point so that alloc_pages can
access policy information lock free. This may also allow us to fix
the bind issue if we would f.e. keep a bitmap in the taskstruct or (ab)use 
the cpusets map.

> If they cannot afford enough disk space it might be possible
> to do the page migration in swap cache like Hugh proposed.

This code already exist in the memory hotplug code base and Ray already 
had a working implementation for page migration. The migration code will 
also be necessary in order to relocate pages with ECC single bit failures 
that Russ is working on (of course that will only work for some pages) and
for Mel Gorman's defragmentation approach (if we ever get the split into 
differnet types of memory chunks in).

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 22:19                                       ` Christoph Lameter
@ 2005-08-04 22:44                                         ` Mike Kravetz
  2005-08-04 23:40                                         ` Andi Kleen
  1 sibling, 0 replies; 44+ messages in thread
From: Mike Kravetz @ 2005-08-04 22:44 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, linux-mm

On Thu, Aug 04, 2005 at 03:19:52PM -0700, Christoph Lameter wrote:
> This code already exist in the memory hotplug code base and Ray already 
> had a working implementation for page migration. The migration code will 
> also be necessary in order to relocate pages with ECC single bit failures 
> that Russ is working on (of course that will only work for some pages) and
> for Mel Gorman's defragmentation approach (if we ever get the split into 
> differnet types of memory chunks in).

Yup, we need page migration for memory hotplug.  However, for hotplug
we are not too concerned about where the pages are migrated to.  Our
primary concern is to move them out of the block/section that we want
to offline.  Suspect this is the same for pages with ECC single bit
failures.  In fact, this is one possible use of the hotplug code.
Notice a failure.  Migrate all pages off the containing DIMM.  Offline
section corresponding to DIMM.  Replace the DIMM.  Online section
corresponding to DIMM.  Of course, your hardware needs to be able to
do this.

-- 
Mike

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 22:19                                       ` Christoph Lameter
  2005-08-04 22:44                                         ` Mike Kravetz
@ 2005-08-04 23:40                                         ` Andi Kleen
  2005-08-04 23:49                                           ` Christoph Lameter
  1 sibling, 1 reply; 44+ messages in thread
From: Andi Kleen @ 2005-08-04 23:40 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, linux-mm

On Thu, Aug 04, 2005 at 03:19:52PM -0700, Christoph Lameter wrote:
> There are three possibilites:
> 
> 1. do what cpusets is doing by versioning.
> 
> 2. Have the task notifier access the task_struct information.
> See http://lwn.net/Articles/145232/ "A new path to the refrigerator"
> 
> 3. Maybe the easiest: Require mmap_sem to be taken for all policy 
> accesses. Currently its only require for vma policies. Then we need
> to make a copy of the policy at some point so that alloc_pages can
> access policy information lock free. This may also allow us to fix
> the bind issue if we would f.e. keep a bitmap in the taskstruct or (ab)use 
> the cpusets map.

None of them seem very attractive to me.  I would prefer to just
not support external accesses keeping things lean and fast.


> > If they cannot afford enough disk space it might be possible
> > to do the page migration in swap cache like Hugh proposed.
> 
> This code already exist in the memory hotplug code base and Ray already 
> had a working implementation for page migration. The migration code will 
> also be necessary in order to relocate pages with ECC single bit failures 
> that Russ is working on (of course that will only work for some pages) and
> for Mel Gorman's defragmentation approach (if we ever get the split into 
> differnet types of memory chunks in).

Individual physical page migration is quite different from
address space migration.

-Andi

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 23:40                                         ` Andi Kleen
@ 2005-08-04 23:49                                           ` Christoph Lameter
  2005-08-05  9:16                                             ` Andi Kleen
  0 siblings, 1 reply; 44+ messages in thread
From: Christoph Lameter @ 2005-08-04 23:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Fri, 5 Aug 2005, Andi Kleen wrote:

> None of them seem very attractive to me.  I would prefer to just
> not support external accesses keeping things lean and fast.

That is a surprising statement given what we just discussed. Things 
are not lean and fast but weirdly screwed up. The policy layer is 
significantly impacted by historical contingencies rather than designed in 
a clean way. It cannot even deliver the functionality it was designed to 
deliver (see BIND).

> Individual physical page migration is quite different from
> address space migration.

Address space migration? That is something new in this discussion. So 
could you explain what you mean by that? I have looked at page migration 
in a variety of contexts and could not see much difference.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-04 23:49                                           ` Christoph Lameter
@ 2005-08-05  9:16                                             ` Andi Kleen
  2005-08-05 14:52                                               ` Christoph Lameter
  2005-08-05 14:58                                               ` Christoph Lameter
  0 siblings, 2 replies; 44+ messages in thread
From: Andi Kleen @ 2005-08-05  9:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Andi Kleen, Paul Jackson, linux-kernel, linux-mm

On Thu, Aug 04, 2005 at 04:49:33PM -0700, Christoph Lameter wrote:
> On Fri, 5 Aug 2005, Andi Kleen wrote:
> 
> > None of them seem very attractive to me.  I would prefer to just
> > not support external accesses keeping things lean and fast.
> 
> That is a surprising statement given what we just discussed. Things 
> are not lean and fast but weirdly screwed up. The policy layer is 
> significantly impacted by historical contingencies rather than designed in 
> a clean way. It cannot even deliver the functionality it was designed to 
> deliver (see BIND).

That seems like a unfair description to me. While things are not
perfect they are definitely not as bad as you're trying to paint them.

> 
> > Individual physical page migration is quite different from
> > address space migration.
> 
> Address space migration? That is something new in this discussion. So 
> could you explain what you mean by that? I have looked at page migration 
> in a variety of contexts and could not see much difference.

MCE page migration just puts a physical page to somewhere else.
memory hotplug migration does the same for multiple pages from
different processes.

Page migration like you're asking for migrates whole processes.

-Andi


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-05  9:16                                             ` Andi Kleen
@ 2005-08-05 14:52                                               ` Christoph Lameter
  2005-08-05 14:58                                               ` Christoph Lameter
  1 sibling, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-05 14:52 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Fri, 5 Aug 2005, Andi Kleen wrote:

> > Address space migration? That is something new in this discussion. So 
> > could you explain what you mean by that? I have looked at page migration 
> > in a variety of contexts and could not see much difference.
> 
> MCE page migration just puts a physical page to somewhere else.
> memory hotplug migration does the same for multiple pages from
> different processes.
> 
> Page migration like you're asking for migrates whole processes.

No I am asking for the migration of parts of a process. Hotplug migration 
and MCE page migration do the same.



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: NUMA policy interface
  2005-08-05  9:16                                             ` Andi Kleen
  2005-08-05 14:52                                               ` Christoph Lameter
@ 2005-08-05 14:58                                               ` Christoph Lameter
  1 sibling, 0 replies; 44+ messages in thread
From: Christoph Lameter @ 2005-08-05 14:58 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Paul Jackson, linux-kernel, linux-mm

On Fri, 5 Aug 2005, Andi Kleen wrote:

> > a clean way. It cannot even deliver the functionality it was designed to 
> > deliver (see BIND).
> 
> That seems like a unfair description to me. While things are not
> perfect they are definitely not as bad as you're trying to paint them.

Sorry this went to far in the heat of the discussion. But 
the BIND functionality is truly not where its supposed to be.

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2005-08-05 15:03 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-07-29 18:38 [PATCH] String conversions for memory policy Christoph Lameter
2005-07-29 22:20 ` Paul Jackson
2005-07-30  0:53   ` Christoph Lameter
2005-07-30  5:54     ` Paul Jackson
2005-07-30 15:48       ` Christoph Lameter
2005-07-31  1:45         ` Paul Jackson
2005-07-30  6:00     ` Paul Jackson
2005-07-30 17:56       ` Christoph Lameter
2005-07-31  1:14         ` Paul Jackson
2005-07-31  1:21           ` Christoph Lameter
2005-07-31  1:50             ` Paul Jackson
2005-07-31  2:01             ` Paul Jackson
2005-07-31  2:05               ` Christoph Lameter
2005-07-31  2:12                 ` Paul Jackson
2005-07-31  2:15                   ` Christoph Lameter
2005-08-01 18:48                   ` Christoph Lameter
2005-08-01 18:54                     ` Christoph Lameter
2005-08-01 23:03                     ` Paul Jackson
2005-08-01 23:24                       ` Christoph Lameter
2005-08-01 23:59                         ` Paul Jackson
2005-08-02  0:03                           ` Christoph Lameter
2005-08-02  0:15                           ` Christoph Lameter
2005-08-02  5:33                             ` Paul Jackson
2005-08-02  6:11                               ` Christoph Lameter
2005-08-03  5:44                                 ` Paul Jackson
2005-08-03 16:19                                   ` Christoph Lameter
2005-08-02 17:49                               ` Christoph Lameter
2005-08-03  5:42                                 ` Paul Jackson
2005-08-03  8:48                     ` Andi Kleen
2005-08-04 14:11                       ` Christoph Lameter
2005-08-04 14:29                         ` Andi Kleen
2005-08-04 16:31                           ` Christoph Lameter
2005-08-04 17:08                             ` Andi Kleen
2005-08-04 17:34                               ` NUMA policy interface Christoph Lameter
2005-08-04 21:14                                 ` Andi Kleen
2005-08-04 21:21                                   ` Christoph Lameter
2005-08-04 21:41                                     ` Andi Kleen
2005-08-04 22:19                                       ` Christoph Lameter
2005-08-04 22:44                                         ` Mike Kravetz
2005-08-04 23:40                                         ` Andi Kleen
2005-08-04 23:49                                           ` Christoph Lameter
2005-08-05  9:16                                             ` Andi Kleen
2005-08-05 14:52                                               ` Christoph Lameter
2005-08-05 14:58                                               ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox