public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 1/2] cpusets: extract mmarray loading from update_nodemask
@ 2007-10-25 22:54 David Rientjes
  2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-25 22:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
	linux-kernel

Extract a helper function from update_nodemask() to load an array of
mm_struct pointers with references to each task's mm_struct that is
currently attached to a given cpuset.

This will be used later for other purposes where memory policies need to
be rebound for each task attached to a cpuset.

Cc: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 kernel/cpuset.c |  130 ++++++++++++++++++++++++++++++++++---------------------
 1 files changed, 81 insertions(+), 49 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -702,6 +702,79 @@ done:
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
 
+/*
+ * Loads mmarray with pointers to all the mm_struct's of tasks attached to
+ * cpuset cs.
+ *
+ * The reference count to each mm is incremented before loading it into the
+ * array, so put_cpuset_mm_array() must be called after this function to
+ * decrement each reference count and free the memory allocated for mmarray
+ * via this function.
+ */
+static struct mm_struct **get_cpuset_mm_array(const struct cpuset *cs,
+					      int *ntasks)
+{
+	struct mm_struct **mmarray;
+	struct task_struct *p;
+	struct cgroup_iter it;
+	int count;
+	int fudge;
+
+	*ntasks = 0;
+	fudge = 10;				/* spare mmarray[] slots */
+	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
+	/*
+	 * Allocate mmarray[] to hold mm reference for each task in cpuset cs.
+	 * Can't kmalloc GFP_KERNEL while holding tasklist_lock.  We could use
+	 * GFP_ATOMIC, but with a few more lines of code, we can retry until
+	 * we get a big enough mmarray[] w/o using GFP_ATOMIC.
+	 */
+	while (1) {
+		count = cgroup_task_count(cs->css.cgroup);  /* guess */
+		count += fudge;
+		mmarray = kmalloc(count * sizeof(*mmarray), GFP_KERNEL);
+		if (!mmarray)
+			return NULL;
+		read_lock(&tasklist_lock);		/* block fork */
+		if (cgroup_task_count(cs->css.cgroup) <= count)
+			break;				/* got enough */
+		read_unlock(&tasklist_lock);		/* try again */
+		kfree(mmarray);
+	}
+
+	/* Load up mmarray[] with mm reference for each task in cpuset. */
+	cgroup_iter_start(cs->css.cgroup, &it);
+	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+		struct mm_struct *mm;
+
+		if (*ntasks >= count) {
+			printk(KERN_WARNING
+				"Cpuset mempolicy rebind incomplete.\n");
+			break;
+		}
+		mm = get_task_mm(p);
+		if (!mm)
+			continue;
+		mmarray[(*ntasks)++] = mm;
+	}
+	cgroup_iter_end(cs->css.cgroup, &it);
+	read_unlock(&tasklist_lock);
+	return mmarray;
+}
+
+/*
+ * Decrements the reference count to each mm in mmarray and frees the memory
+ * allocated for mmarray.
+ *
+ * To be used in conjunction with get_cpuset_mm_array().
+ */
+static void put_cpuset_mm_array(struct mm_struct **mmarray, int ntasks)
+{
+	while (ntasks-- > 0)
+		mmput(mmarray[ntasks]);
+	kfree(mmarray);
+}
+
 static inline int started_after_time(struct task_struct *t1,
 				     struct timespec *time,
 				     struct task_struct *t2)
@@ -915,13 +988,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
 	nodemask_t oldmem;
-	struct task_struct *p;
 	struct mm_struct **mmarray;
-	int i, n, ntasks;
+	int i, n;
 	int migrate;
-	int fudge;
 	int retval;
-	struct cgroup_iter it;
 
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -963,50 +1033,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	mutex_unlock(&callback_mutex);
 
 	cpuset_being_rebound = cs;		/* causes mpol_copy() rebind */
-
-	fudge = 10;				/* spare mmarray[] slots */
-	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
 	retval = -ENOMEM;
-
-	/*
-	 * Allocate mmarray[] to hold mm reference for each task
-	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
-	 * few more lines of code, we can retry until we get a big
-	 * enough mmarray[] w/o using GFP_ATOMIC.
-	 */
-	while (1) {
-		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-		if (!mmarray)
-			goto done;
-		read_lock(&tasklist_lock);		/* block fork */
-		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-			break;				/* got enough */
-		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
-	}
-
-	n = 0;
-
-	/* Load up mmarray[] with mm reference for each task in cpuset. */
-	cgroup_iter_start(cs->css.cgroup, &it);
-	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-		struct mm_struct *mm;
-
-		if (n >= ntasks) {
-			printk(KERN_WARNING
-				"Cpuset mempolicy rebind incomplete.\n");
-			break;
-		}
-		mm = get_task_mm(p);
-		if (!mm)
-			continue;
-		mmarray[n++] = mm;
-	}
-	cgroup_iter_end(cs->css.cgroup, &it);
-	read_unlock(&tasklist_lock);
+	mmarray = get_cpuset_mm_array(cs, &n);
+	if (!mmarray)
+		goto done;
+	if (!n)
+		goto done_success;
 
 	/*
 	 * Now that we've dropped the tasklist spinlock, we can
@@ -1028,12 +1060,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
-		mmput(mm);
 	}
 
 	/* We're done rebinding vma's to this cpusets new mems_allowed. */
-	kfree(mmarray);
 	cpuset_being_rebound = NULL;
+done_success:
+	put_cpuset_mm_array(mmarray, n);
 	retval = 0;
 done:
 	return retval;

^ permalink raw reply	[flat|nested] 98+ messages in thread

* [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-25 22:54 [patch 1/2] cpusets: extract mmarray loading from update_nodemask David Rientjes
@ 2007-10-25 22:54 ` David Rientjes
  2007-10-25 23:37   ` Christoph Lameter
  2007-10-26  1:13   ` Paul Jackson
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-25 22:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
	linux-kernel

Adds a new 'interleave_over_allowed' option to cpusets.

When a task with an MPOL_INTERLEAVE memory policy is attached to a cpuset
with this option set, the interleaved nodemask becomes the cpuset's
mems_allowed.  When the cpuset's mems_allowed changes, the interleaved
nodemask for all tasks with MPOL_INTERLEAVE memory policies is also
updated to be the new mems_allowed nodemask.

This allows applications to specify that they want to interleave over all
nodes that they are allowed to access.  This set of nodes can be changed
at any time via the cpuset interface and each individual memory policy is
updated to reflect the changes for all attached tasks when this option is
set.

Cc: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
 Documentation/cpusets.txt |   30 +++++++++++++++++++-
 include/linux/cpuset.h    |    6 ++++
 kernel/cpuset.c           |   64 +++++++++++++++++++++++++++++++++++++++++++++
 mm/mempolicy.c            |    6 ++++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -20,7 +20,8 @@ CONTENTS:
   1.5 What is memory_pressure ?
   1.6 What is memory spread ?
   1.7 What is sched_load_balance ?
-  1.8 How do I use cpusets ?
+  1.8 What is interleave_over_allowed ?
+  1.9 How do I use cpusets ?
 2. Usage Examples and Syntax
   2.1 Basic Usage
   2.2 Adding/removing cpus
@@ -497,7 +498,32 @@ the cpuset code to update these sched domains, it compares the new
 partition requested with the current, and updates its sched domains,
 removing the old and adding the new, for each change.
 
-1.8 How do I use cpusets ?
+1.8 What is interleave_over_allowed ?
+-------------------------------------
+
+Tasks may specify a memory policy of MPOL_INTERLEAVE with the desired
+result of interleaving memory allocations over their set of allowed
+nodes.
+
+Since the set of allowed nodes may change via cpusets (through the
+'mems' file) without knowledge to the application, a mechanism needs
+to exist such that applications can specify that they desire to
+interleave over all nodes to which they have access.  This avoids a
+constant get_mempolicy() and set_mempolicy() loop to update an
+interleaved memory policy that respects both its cpuset's mems_allowed
+and the intent of the application.
+
+When interleave_over_allowed is set, all attached tasks with
+MPOL_INTERLEAVE memory policies automatically interleave over all
+available cpuset nodes regardless of what nodemask was passed to
+set_mempolicy().  When the cpuset's mems change, all attached tasks
+with interleaved policies automatically get updated with the new
+nodemask.
+
+The value of 'interleave_over_allowed' is inherited from a cpuset's
+parent upon creation.
+
+1.9 How do I use cpusets ?
 --------------------------
 
 In order to minimize the impact of cpusets on critical kernel
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -77,6 +77,7 @@ static inline int cpuset_do_slab_mem_spread(void)
 extern void cpuset_track_online_nodes(void);
 
 extern int current_cpuset_is_being_rebound(void);
+extern nodemask_t current_cpuset_interleaved_mems(void);
 
 #else /* !CONFIG_CPUSETS */
 
@@ -157,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void)
 	return 0;
 }
 
+static inline nodemask_t current_cpuset_interleaved_mems(void)
+{
+	return NODE_MASK_NONE;
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -121,6 +121,7 @@ typedef enum {
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
 	CS_SPREAD_SLAB,
+	CS_INTERLEAVE,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -154,6 +155,11 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
+static inline int is_interleave_over_allowed(const struct cpuset *cs)
+{
+	return test_bit(CS_INTERLEAVE, &cs->flags);
+}
+
 /*
  * Increment this integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -1089,6 +1095,46 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 	return 0;
 }
 
+/* Rebinds the memory policies of all tasks attached to cs.
+ *
+ * Call with cgroup_mutex held.
+ */
+static int update_interleave(struct cpuset *cs, char *buf)
+{
+	struct mm_struct **mmarray;
+	int ntasks;
+	int i;
+
+	if (!simple_strtoul(buf, NULL, 10)) {
+		clear_bit(CS_INTERLEAVE, &cs->flags);
+		return 0;
+	}
+
+	mmarray = get_cpuset_mm_array(cs, &ntasks);
+	if (!mmarray)
+		return -ENOMEM;
+	if (!ntasks)
+		goto done;
+
+	for (i = 0; i < ntasks; i++)
+		mpol_rebind_mm(mmarray[i], &cs->mems_allowed);
+done:
+	put_cpuset_mm_array(mmarray, ntasks);
+	set_bit(CS_INTERLEAVE, &cs->flags);
+	return 0;
+}
+
+nodemask_t current_cpuset_interleaved_mems(void)
+{
+	nodemask_t mask = NODE_MASK_NONE;
+
+	mutex_lock(&callback_mutex);
+	if (is_interleave_over_allowed(task_cs(current)))
+		mask = task_cs(current)->mems_allowed;
+	mutex_unlock(&callback_mutex);
+	return mask;
+}
+
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
@@ -1283,6 +1329,7 @@ typedef enum {
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
 	FILE_SPREAD_SLAB,
+	FILE_INTERLEAVE_OVER_ALLOWED,
 } cpuset_filetype_t;
 
 static ssize_t cpuset_common_file_write(struct cgroup *cont,
@@ -1350,6 +1397,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
 		cs->mems_generation = cpuset_mems_generation++;
 		break;
+	case FILE_INTERLEAVE_OVER_ALLOWED:
+		retval = update_interleave(cs, buffer);
+		break;
 	default:
 		retval = -EINVAL;
 		goto out2;
@@ -1446,6 +1496,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
 	case FILE_SPREAD_SLAB:
 		*s++ = is_spread_slab(cs) ? '1' : '0';
 		break;
+	case FILE_INTERLEAVE_OVER_ALLOWED:
+		*s++ = is_interleave_over_allowed(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1536,6 +1589,13 @@ static struct cftype cft_spread_slab = {
 	.private = FILE_SPREAD_SLAB,
 };
 
+static struct cftype cft_interleave_over_allowed = {
+	.name = "interleave_over_allowed",
+	.read = cpuset_common_file_read,
+	.write = cpuset_common_file_write,
+	.private = FILE_INTERLEAVE_OVER_ALLOWED,
+};
+
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	int err;
@@ -1558,6 +1618,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0)
 		return err;
+	if ((err = cgroup_add_file(cont, ss, &cft_interleave_over_allowed)) < 0)
+		return err;
 	/* memory_pressure_enabled is in root cpuset only */
 	if (err == 0 && !cont->parent)
 		err = cgroup_add_file(cont, ss,
@@ -1633,6 +1695,8 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
 	if (is_spread_slab(parent))
 		set_bit(CS_SPREAD_SLAB, &cs->flags);
+	if (is_interleave_over_allowed(parent))
+		set_bit(CS_INTERLEAVE, &cs->flags);
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1739,6 +1739,12 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
+		tmp = current_cpuset_interleaved_mems();
+		if (!nodes_empty(tmp)) {
+			pol->v.nodes = tmp;
+			*mpolmask = tmp;
+			break;
+		}
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
 		*mpolmask = *newmask;

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes
@ 2007-10-25 23:37   ` Christoph Lameter
  2007-10-25 23:56     ` David Rientjes
  2007-10-26  1:13   ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-25 23:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn,
	linux-kernel

On Thu, 25 Oct 2007, David Rientjes wrote:

> Adds a new 'interleave_over_allowed' option to cpusets.
> 
> When a task with an MPOL_INTERLEAVE memory policy is attached to a cpuset
> with this option set, the interleaved nodemask becomes the cpuset's
> mems_allowed.  When the cpuset's mems_allowed changes, the interleaved
> nodemask for all tasks with MPOL_INTERLEAVE memory policies is also
> updated to be the new mems_allowed nodemask.
> 
> This allows applications to specify that they want to interleave over all
> nodes that they are allowed to access.  This set of nodes can be changed
> at any time via the cpuset interface and each individual memory policy is
> updated to reflect the changes for all attached tasks when this option is
> set.

More interactions between cpusets and memory policies. We have to be 
careful here to keep clean semantics.

Isn't it a bit surprising for an application that has set up a custom 
MPOL_INTERLEAVE policy if the nodes suddenly change because of a cpuset or 
mems_allowed change?



^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-25 23:37   ` Christoph Lameter
@ 2007-10-25 23:56     ` David Rientjes
  2007-10-26  0:28       ` Christoph Lameter
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-25 23:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn,
	linux-kernel

On Thu, 25 Oct 2007, Christoph Lameter wrote:

> More interactions between cpusets and memory policies. We have to be 
> careful here to keep clean semantics.
> 

I agree.

> Isn't it a bit surprising for an application that has set up a custom 
> MPOL_INTERLEAVE policy if the nodes suddenly change because of a cpuset or 
> mems_allowed change?
> 

Every MPOL_INTERLEAVE policy is a custom policy that the application has 
setup.  If you don't use cpusets at all, the nodemask you pass to 
set_mempolicy() with MPOL_INTERLEAVE is static and won't change without 
the application's knowledge.  It has full control over the nodemask that 
it desires to interleave over.

The problem occurs when you add cpusets into the mix and permit the 
allowed nodes to change without knowledge to the application.  Right now, 
a simple remap is done so if the cardinality of the set of nodes 
decreases, you're interleaving over a smaller number of nodes.  If the 
cardinality increases, your interleaved nodemask isn't expanded.  That's 
the problem that we're facing.  The remap itself is troublesome because it 
doesn't take into account the user's desire for a custom nodemask to be 
used anyway; it could remap an interleaved policy over several nodes that 
will already be contended with one another.

Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the 
throughput of the application.  If you remap the number of nodes to 
interleave over, which is currently how it's done when mems_allowed 
changes, you could actually be increasing latency because you're 
interleaving over the same bus.

This isn't a memory policy problem because all it does is effect a 
specific policy over a set of nodes.  With my change, cpusets are required 
to update the interleaved nodemask if the user specified that they desire 
the feature with interleave_over_allowed.  Cpusets are, after all, the 
ones that changed the mems_allowed in the first place and invalidated our 
custom interleave policy.  We simply can't make inferences about what we 
should do, so we allow the creator of the cpuset to specify it for us.  So 
the proper place to modify an interleaved policy is in cpusets and not 
mempolicy itself.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-25 23:56     ` David Rientjes
@ 2007-10-26  0:28       ` Christoph Lameter
  2007-10-26  1:55         ` Paul Jackson
  2007-10-26 15:18         ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26  0:28 UTC (permalink / raw)
  To: David Rientjes
  Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn,
	linux-kernel

On Thu, 25 Oct 2007, David Rientjes wrote:

> The problem occurs when you add cpusets into the mix and permit the 
> allowed nodes to change without knowledge to the application.  Right now, 
> a simple remap is done so if the cardinality of the set of nodes 
> decreases, you're interleaving over a smaller number of nodes.  If the 
> cardinality increases, your interleaved nodemask isn't expanded.  That's 
> the problem that we're facing.  The remap itself is troublesome because it 
> doesn't take into account the user's desire for a custom nodemask to be 
> used anyway; it could remap an interleaved policy over several nodes that 
> will already be contended with one another.

Right. So I think we are fine if the application cannot setup boundaries 
for interleave.


> Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the 
> throughput of the application.  If you remap the number of nodes to 
> interleave over, which is currently how it's done when mems_allowed 
> changes, you could actually be increasing latency because you're 
> interleaving over the same bus.

Well you may hit some nodes more than others so a slight performance 
degradation.

> This isn't a memory policy problem because all it does is effect a 
> specific policy over a set of nodes.  With my change, cpusets are required 
> to update the interleaved nodemask if the user specified that they desire 
> the feature with interleave_over_allowed.  Cpusets are, after all, the 
> ones that changed the mems_allowed in the first place and invalidated our 
> custom interleave policy.  We simply can't make inferences about what we 
> should do, so we allow the creator of the cpuset to specify it for us.  So 
> the proper place to modify an interleaved policy is in cpusets and not 
> mempolicy itself.

With that MPOL_INTERLEAVE would be context dependent and no longer 
needs translation. Lee had similar ideas. Lee: Could we make 
MPOL_INTERLEAVE generally cpuset context dependent?


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes
  2007-10-25 23:37   ` Christoph Lameter
@ 2007-10-26  1:13   ` Paul Jackson
  2007-10-26  1:30     ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26  1:13 UTC (permalink / raw)
  To: David Rientjes; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel

I'm probably going to be ok with this ... after a bit.

1) First concern - my primary issue:

    One thing I really want to change, the name of the per-cpuset file
    that controls this option.  You call it "interleave_over_allowed".

    Take a look at the existing per-cpuset file names:

	    $ grep 'name = "' kernel/cpuset.c
	    .name = "cpuset",
	    .name = "cpus",
	    .name = "mems",
	    .name = "cpu_exclusive",
	    .name = "mem_exclusive",
	    .name = "sched_load_balance",
	    .name = "memory_migrate",
	    .name = "memory_pressure_enabled",
	    .name = "memory_pressure",
	    .name = "memory_spread_page",
	    .name = "memory_spread_slab",
	    .name = "cpuset",

    The name of every memory related option starts with "mem" or "memory",
    and the name of every memory interleave related option starts with
    "memory_spread_*".

    Can we call this "memory_spread_user" instead, or something else
    matching "memory_spread_*" ?

    The names of things in the public API's are a big issue of mine.

2) Second concern - lessor code clarity issue:

    The logic surrounding current_cpuset_interleaved_mems() seems a tad
    opaque to me.  It appears on the surface as if the memory policy code,
    in mm/mempolicy.c, is getting a nodemask from the cpuset code by
    calling this routine, as if there were an independent per-cpuset
    nodemask stating over what nodes to interleave for MPOL_INTERLEAVE.

    But all that is returned is either (1) an empty node mask or (2) the
    current tasks allowed cpu mask.  If an empty mask is returned, this
    tells the MPOL_INTERLEAVE code to use the mask the user specified in
    an earlier set_mempolicy MPOL_INTERLEAVE call.  If a non-empty mask
    is returned, then the previous user specified mask is ignored and
    that non-empty mask (just all the current cpusets allowed nodes) is
    used instead.

    Restating this in pseudo code, from your patch, the mempolicy.c
    MPOL_INTERLEAVE code to rebind memory policies after a cpuset
    changes reads:
	tmp = current_cpuset_interleaved_mems();
	if tmp not empty:
		rebind over tmp (all the cpusets allowed nodes)
		break;
	rebind over the set_mempolicy MPOL_INTERLEAVE specified mask
	break;

    The above code is asymmetric, and the returning of a nodemask is
    an illusion, suggesting that cpusets might have an interleaved
    nodemask separate from the allowed memory nodemask.

    How about instead of your current_cpuset_interleaved_mems() routine
    that returns a nodemask, rather have a routine that returns a Boolean,
    indicating whether this new flag is set, used as in:
	if (cpuset_is_memory_spread_user())
		tmp = cpuset_current_mems_allowed();
	else
		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
	pol->v.nodes = tmp;

    I'll wager this saves a few bytes of kernel text space as well.

3) Maybe I haven't had enough caffeine yet third issue:

    The existing kernel code for mm/mempolicy.c:mpol_rebind_policy()
    looks buggy to me.  The node_remap() call for the MPOL_INTERLEAVE
    case seems like it should come before, not after, updating mpolmask
    to the newmask.  Fixing that, and consolidating the multiple lines
    doing "*mpolmask = *newmask" for each case, into a single such line
    at the end of the switch(){} statement, results in the following
    patch.  Could you confirm my suspicions and push this one too.
    It should be a part of your patch set, so we don't waste Andrew's
    time resolving the inevitable patch collisions we'll see otherwise.

--- 2.6.23-mm1.orig/mm/mempolicy.c	2007-10-16 18:55:34.745039423 -0700
+++ 2.6.23-mm1/mm/mempolicy.c	2007-10-25 18:06:08.474742762 -0700
@@ -1741,14 +1741,12 @@ static void mpol_rebind_policy(struct me
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
-		*mpolmask = *newmask;
 		current->il_next = node_remap(current->il_next,
 						*mpolmask, *newmask);
 		break;
 	case MPOL_PREFERRED:
 		pol->v.preferred_node = node_remap(pol->v.preferred_node,
 						*mpolmask, *newmask);
-		*mpolmask = *newmask;
 		break;
 	case MPOL_BIND: {
 		nodemask_t nodes;
@@ -1773,13 +1771,14 @@ static void mpol_rebind_policy(struct me
 			kfree(pol->v.zonelist);
 			pol->v.zonelist = zonelist;
 		}
-		*mpolmask = *newmask;
 		break;
 	}
 	default:
 		BUG();
 		break;
 	}
+
+	*mpolmask = *newmask;
 }
 
 /*


Thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  1:13   ` Paul Jackson
@ 2007-10-26  1:30     ` David Rientjes
  0 siblings, 0 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26  1:30 UTC (permalink / raw)
  To: Paul Jackson; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel

On Thu, 25 Oct 2007, Paul Jackson wrote:

>     Can we call this "memory_spread_user" instead, or something else
>     matching "memory_spread_*" ?
> 

Sounds better.  I was hoping somebody was going to come forward with an 
alternative that sounded better than interleave_over_allowed.

>     How about instead of your current_cpuset_interleaved_mems() routine
>     that returns a nodemask, rather have a routine that returns a Boolean,
>     indicating whether this new flag is set, used as in:
> 	if (cpuset_is_memory_spread_user())
> 		tmp = cpuset_current_mems_allowed();
> 	else
> 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
> 	pol->v.nodes = tmp;
> 

That sounds reasonable, it will simply be a wrapper around 
is_interleave_over_allowed() or what we're now calling is_spread_user().

>     The existing kernel code for mm/mempolicy.c:mpol_rebind_policy()
>     looks buggy to me.  The node_remap() call for the MPOL_INTERLEAVE
>     case seems like it should come before, not after, updating mpolmask
>     to the newmask.  Fixing that, and consolidating the multiple lines
>     doing "*mpolmask = *newmask" for each case, into a single such line
>     at the end of the switch(){} statement, results in the following
>     patch.  Could you confirm my suspicions and push this one too.
>     It should be a part of your patch set, so we don't waste Andrew's
>     time resolving the inevitable patch collisions we'll see otherwise.
> 

For setting current->il_next, both cases work but yours will be better 
balanced for the next interleaved allocation.  I'll apply it to my 
patchset.

Thanks for the review.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  0:28       ` Christoph Lameter
@ 2007-10-26  1:55         ` Paul Jackson
  2007-10-26  2:11           ` David Rientjes
  2007-10-26 15:18         ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26  1:55 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: rientjes, akpm, ak, Lee.Schermerhorn, linux-kernel

Christoph wrote:
> With that MPOL_INTERLEAVE would be context dependent and no longer 
> needs translation. Lee had similar ideas. Lee: Could we make 
> MPOL_INTERLEAVE generally cpuset context dependent?

Well ... MPOL_INTERLEAVE already is essentially cpuset relative.

So long as the cpuset size (number of allowed memory nodes) doesn't
change, whatever MPOL_INTERLEAVE you set is remapped whenever the
cpusets 'mems' changes, preserving the cpuset relative interleaving.

The problem, as David explains, comes when cpusets change sizes.
When the cpuset gets smaller, one can still do a pretty good job,
scrunching down the interleave nodes in proportion.  But when the
cpuset gets larger, it's not clear how to convert a subset of a
smaller set, to an equivalent subset of a larger set.

The existing code handled this last case by saying screw it -- don't
expand the set of interleave nodes when the cpuset 'mems' grows.

David's new code handles this last case by adding a new per-cpuset
Boolean that adds a new alternative, forcing all the tasks using
MPOL_INTERLEAVE in that cpuset, anytime thereafter that the cpusets
'mems' changes, to get interleaved over the entire cpuset.

Now that I spell it out that way, I am having second thoughts about
this one.  It's another special case palliative, given that we can't
give the user what they really want.

David - could you describe the real world situation in which you
are finding that this new 'interleave_over_allowed' option, aka
'memory_spread_user', is useful?  I'm not always opposed to special
case solutions; but they do usually require special case needs to
justify them ;).

I suspect that the general case solution would require having the user
pass in two nodemasks, call them ALL and SUBSET, requesting that
relative to the ALL nodes, interleave be done on the SUBSET nodes.
That way, even if say the task happened to be running in a cpuset with
a -single- allowed memory node at the moment, it could express its user
memory interleave memory needs for the general case of any number of
nodes.  Then for whatever nodes were currently allowed by the cpuset
to that task at any point, the nodes_remap() logic could be done to
derive from the ALL and SUBSET masks, and the current allowed mask,
what nodes to interleave that tasks user allocations over.

This would require a new set_mempolicy API, and might not be worth it.

If David has a compelling use case, it is simple enough that it
might well be worth doing, even though it's not the general case
solution.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  1:55         ` Paul Jackson
@ 2007-10-26  2:11           ` David Rientjes
  2007-10-26  2:29             ` Paul Jackson
  2007-10-26 15:30             ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26  2:11 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Christoph Lameter, akpm, ak, Lee.Schermerhorn, linux-kernel

On Thu, 25 Oct 2007, Paul Jackson wrote:

> David - could you describe the real world situation in which you
> are finding that this new 'interleave_over_allowed' option, aka
> 'memory_spread_user', is useful?  I'm not always opposed to special
> case solutions; but they do usually require special case needs to
> justify them ;).
> 

Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded 
to include more memory.  The task itself can't access all that memory with 
the memory policy of its choice.

Since the cpuset has changed the mems_allowed of the task without its 
knowledge, it would require a constant get_mempolicy() and set_mempolicy() 
loop in the application to catch these changes.  That's obviously not in 
the best interest of anyone.

So my change allows those tasks that have already expressed the desire to 
interleave their memory with MPOL_INTERLEAVE to always use the full range 
of memory available that is dynamically changing beneath them as a result 
of cpusets.  Keep in mind that it is still possible to request an 
interleave only over a subset of allowed mems: but you must do it when you 
create the interleaved mempolicy after it has been attached to the cpuset.
set_mempolicy() changes are always honored.

The only other way to support such a feature is through a modification to 
mempolicies themselves, which Lee has already proposed.  The problem with 
that is it requires mempolicy support for cpuset cases and modification to 
the set_mempolicy() API.  My solution presents a cpuset fix for a cpuset 
problem.

> I suspect that the general case solution would require having the user
> pass in two nodemasks, call them ALL and SUBSET, requesting that
> relative to the ALL nodes, interleave be done on the SUBSET nodes.
> That way, even if say the task happened to be running in a cpuset with
> a -single- allowed memory node at the moment, it could express its user
> memory interleave memory needs for the general case of any number of
> nodes.  Then for whatever nodes were currently allowed by the cpuset
> to that task at any point, the nodes_remap() logic could be done to
> derive from the ALL and SUBSET masks, and the current allowed mask,
> what nodes to interleave that tasks user allocations over.
> 

I find it hard to believe that a single cpuset with a single 
memory_spread_user boolean is going to include multiple tasks that request 
interleaved mempolicies over differing nodes within the cpuset's 
mems_allowed.  That, to me, is the special case.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  2:11           ` David Rientjes
@ 2007-10-26  2:29             ` Paul Jackson
  2007-10-26  2:45               ` David Rientjes
  2007-10-26 15:30             ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26  2:29 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel

> Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded 
> to include more memory.  The task itself can't access all that memory with 
> the memory policy of its choice.

That much I could have guessed (did guess, actually.)

Are you seeing this in a real world situation?  Can you describe the
situation?  I don't mean just describing how it looks to this kernel
code, but what is going on in the system, what sort of job mix or
applications, what kind of users, ...  In short, a "use case", or brief
approximation thereto.  See further:

  http://en.wikipedia.org/wiki/Use_case

I have no need of a full blown use case; just a three sentence
mini-story should suffice.  But it should (if you can, without
revealing proprietary knowledge) describe a situation you have
actual need of addressing.

> So my change allows those tasks that have already expressed the
> desire to interleave their memory with MPOL_INTERLEAVE to always
> use the full range of memory available that is dynamically changing
> beneath them as a result of cpusets.

Yup, that it does.  Note that it is a special case -- "the full range",
not any application controlled specific subset thereof, short of
reissuing set_mempolicy() calls anytime that the applications cpuset
'mems' changes.

> The only other way to support such a feature is through a modification to 
> mempolicies themselves, which Lee has already proposed.  The problem with 
> that is it requires mempolicy support for cpuset cases and modification to 
> the set_mempolicy() API.

Do you have a link to what Lee proposed?  I agree that a full general
solution would seem to require a new or changed set_mempolicy API,
which may well be more than we want to do, absent a more compelling
"use case" requiring it than we have now.

> I find it hard to believe that a single cpuset with a single
> memory_spread_user boolean is going to include multiple tasks that
> request interleaved mempolicies over differing nodes within the
> cpuset's mems_allowed.  That, to me, is the special case.

That may well be, to you.  To me, pretty much -all- uses of
set_mempolicy() are special cases ;).  I have no way of telling
whether or not there are users who would require multiple tasks
in the same cpuset to have different interleave masks, but since
the API clearly supports that (except when changing cpuset 'mems'
settings mess things up), I have been presuming that somewhere in
the universe, such users exist or might come to exist.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  2:29             ` Paul Jackson
@ 2007-10-26  2:45               ` David Rientjes
  2007-10-26  3:14                 ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-26  2:45 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel

On Thu, 25 Oct 2007, Paul Jackson wrote:

> Are you seeing this in a real world situation?  Can you describe the
> situation?  I don't mean just describing how it looks to this kernel
> code, but what is going on in the system, what sort of job mix or
> applications, what kind of users, ...  In short, a "use case", or brief
> approximation thereto.  See further:
> 

Yes, when using cpusets for resource control.  If memory pressure is being 
felt for that cpuset and additional mems are added to alleviate possible 
OOM conditions, it is insufficient to allow tasks within that cpuset to 
continue using memory policies that prohibit them from taking advantage of 
the extra memory.

The best remedy for that situation is to give the cpuset owner the option 
of allowing tasks with MPOL_INTERLEAVE policies to always interleave over 
the entire set of available mems so they can be dynamically expanded and 
contracted at will without triggering OOM conditions.

> Do you have a link to what Lee proposed?  I agree that a full general
> solution would seem to require a new or changed set_mempolicy API,
> which may well be more than we want to do, absent a more compelling
> "use case" requiring it than we have now.
> 

http://marc.info/?l=linux-mm&m=118849999128086

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  2:45               ` David Rientjes
@ 2007-10-26  3:14                 ` Paul Jackson
  2007-10-26  3:58                   ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26  3:14 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel

David wrote:
> Yes, when using cpusets for resource control.  If memory pressure is being 
> felt for that cpuset and additional mems are added to alleviate possible 
> OOM conditions, it is insufficient to allow tasks within that cpuset to 
> continue using memory policies that prohibit them from taking advantage of 
> the extra memory.

Well ... "resource control" is a tad thin for a decent "use case".

But ok ... that's a little more compelling.

The user space man pages for set_mempolicy(2) are now even more
behind the curve, by not mentioning that MPOL_INTERLEAVE's mask
might mean nothing, if (1) in a cpuset marked memory_spread_user,
(2) after the cpuset has changed 'mems'.

I wonder if there is any way to fix that.  Who does the man pages
for Linux system calls?

Hmmm ... that reminds me ... the period of time between when the
task issues the set_mempolicy(2) MPOL_INTERLEAVE call and when some
cpuset 'mems' change subsequently moves its memory placement is an
anomaly here. During that period of time, the MPOL_INTERLEAVE mask
-does- apply, even if a subset of the 'mems' in the tasks cpuset.
This could result in test cases missing some failures.  If they
test with a particular, carefully crafted MPOL_INTERLEAVE mask
that is a proper (strictly less than) subset of the nodes allowed
in the cpuset, they might not notice that their code is broken if
they happen to be in a memory_spread_user cpuset after a 'mems'
change has jammed the entire cpusets 'mems' into their interleave
mask.

Perhaps we should make it so that doing a set_mempolicy(2) call
to set MPOL_INTERLEAVE immediately changes the memory policy to
the cpusets mems_allowed.

A key advantage in doing this would be that the set_mempolicy user
documentation could simply state that the MPOL_INTERLEAVE mask is
ignored when in a cpuset marked memory_spread_user, instead interleaving
over all the memory nodes in the cpuset.  This would be quite a bit
simpler and clearer than saying that the cpusets nodes are used only
after subsequent cpuset 'mems' changes.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  3:14                 ` Paul Jackson
@ 2007-10-26  3:58                   ` David Rientjes
  2007-10-26  4:34                     ` Paul Jackson
  2007-10-26 15:37                     ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26  3:58 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel

On Thu, 25 Oct 2007, Paul Jackson wrote:

> The user space man pages for set_mempolicy(2) are now even more
> behind the curve, by not mentioning that MPOL_INTERLEAVE's mask
> might mean nothing, if (1) in a cpuset marked memory_spread_user,
> (2) after the cpuset has changed 'mems'.
> 

Yeah.  They were already outdated in the sense that they did not specify 
that the interleave nodemask could change as a result of a cpuset mems 
change.

> I wonder if there is any way to fix that.  Who does the man pages
> for Linux system calls?
> 

Good question.

> Hmmm ... that reminds me ... the period of time between when the
> task issues the set_mempolicy(2) MPOL_INTERLEAVE call and when some
> cpuset 'mems' change subsequently moves its memory placement is an
> anomaly here. During that period of time, the MPOL_INTERLEAVE mask
> -does- apply, even if a subset of the 'mems' in the tasks cpuset.
> This could result in test cases missing some failures.  If they
> test with a particular, carefully crafted MPOL_INTERLEAVE mask
> that is a proper (strictly less than) subset of the nodes allowed
> in the cpuset, they might not notice that their code is broken if
> they happen to be in a memory_spread_user cpuset after a 'mems'
> change has jammed the entire cpusets 'mems' into their interleave
> mask.
> 

Well, sure, but mempolicy's already get overridden by cpusets anyway.  For 
example, if you were to attach a task with an MPOL_BIND mempolicy to a 
cpuset with a disjoint set of allowed mems.

The important distinction is that you can still interleave over a subset 
of the mems_allowed if you set your memory policy after being attached to 
the cpuset.

> Perhaps we should make it so that doing a set_mempolicy(2) call
> to set MPOL_INTERLEAVE immediately changes the memory policy to
> the cpusets mems_allowed.
> 

No, because that would negate the above.  We still want to be able to 
restrict interleaved memory policies to a subset of allowed mems.  This 
option gives the most power to applications.

> A key advantage in doing this would be that the set_mempolicy user
> documentation could simply state that the MPOL_INTERLEAVE mask is
> ignored when in a cpuset marked memory_spread_user, instead interleaving
> over all the memory nodes in the cpuset.  This would be quite a bit
> simpler and clearer than saying that the cpusets nodes are used only
> after subsequent cpuset 'mems' changes.
> 

I think that documenting the change in the man page as saying that "the 
nodemask will include all allowed nodes if the mems_allowed of a 
memory_spread_user cpuset is expanded" is better.

I've got a few fixes for my patchset queued so I'll resend it later; it's 
mostly style changes but there is a subtle bug where the task changing the 
value of a cpuset's memory_spread_page is not in the same cpuset.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  3:58                   ` David Rientjes
@ 2007-10-26  4:34                     ` Paul Jackson
  2007-10-26 15:37                     ` Lee Schermerhorn
  1 sibling, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-26  4:34 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel

David wrote:
> I think that documenting the change in the man page as saying that
> "the nodemask will include all allowed nodes if the mems_allowed
> of a memory_spread_user cpuset is expanded" is better.

Ok.  I'm inclined the other way, but not certain enough of my
position to push the point any further.

Good enough.

> I've got a few fixes for my patchset queued so I'll resend it later

Ok.  Good work - thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  0:28       ` Christoph Lameter
  2007-10-26  1:55         ` Paul Jackson
@ 2007-10-26 15:18         ` Lee Schermerhorn
  2007-10-26 17:36           ` Christoph Lameter
  2007-10-26 18:45           ` David Rientjes
  1 sibling, 2 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 15:18 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Rientjes, Andrew Morton, Andi Kleen, Paul Jackson,
	linux-kernel

On Thu, 2007-10-25 at 17:28 -0700, Christoph Lameter wrote:
> On Thu, 25 Oct 2007, David Rientjes wrote:
> 
> > The problem occurs when you add cpusets into the mix and permit the 
> > allowed nodes to change without knowledge to the application.  Right now, 
> > a simple remap is done so if the cardinality of the set of nodes 
> > decreases, you're interleaving over a smaller number of nodes.  If the 
> > cardinality increases, your interleaved nodemask isn't expanded.  That's 
> > the problem that we're facing.  The remap itself is troublesome because it 
> > doesn't take into account the user's desire for a custom nodemask to be 
> > used anyway; it could remap an interleaved policy over several nodes that 
> > will already be contended with one another.
> 
> Right. So I think we are fine if the application cannot setup boundaries 
> for interleave.
> 
> 
> > Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the 
> > throughput of the application.  If you remap the number of nodes to 
> > interleave over, which is currently how it's done when mems_allowed 
> > changes, you could actually be increasing latency because you're 
> > interleaving over the same bus.
> 
Well you may hit some nodes more than others so a slight performance 
degradation.
> 
> > This isn't a memory policy problem because all it does is effect a 
> > specific policy over a set of nodes.  With my change, cpusets are required 
> > to update the interleaved nodemask if the user specified that they desire 
> > the feature with interleave_over_allowed.  Cpusets are, after all, the 
> > ones that changed the mems_allowed in the first place and invalidated our 
> > custom interleave policy.  We simply can't make inferences about what we 
> > should do, so we allow the creator of the cpuset to specify it for us.  So 
> > the proper place to modify an interleaved policy is in cpusets and not 
> > mempolicy itself.
> 
> With that MPOL_INTERLEAVE would be context dependent and no longer 
> needs translation. Lee had similar ideas. Lee: Could we make 
> MPOL_INTERLEAVE generally cpuset context dependent?
> 

That's what my "cpuset-independent interleave" patch does.  David
doesn't like the "null node mask" interface because it doesn't work with
libnuma.  I plan to fix that, but I'm chasing other issues.  I should
get back to the mempol work after today.

What I like about the cpuset independent interleave is that the "policy
remap" when cpusets are changed is a NO-OP--no need to change the
policy.  Just as "preferred local" policy chooses the node where the
allocation occurs, my cpuset independent interleave patch interleaves
across the set of nodes available at the time of the allocation.  The
application has to specifically ask for this behavior by the null/empty
nodemask or the TBD libnuma API.  IMO, this is the only reasonable
interleave policy for apps running in dynamic cpusets.

An aside:  if David et al [at google] are using cpusets on fake numa for
resource management [I don't know this is the case, but saw some
discussions way back that indicate it might be?], then maybe this
becomes less of an issue when control groups [a.k.a. containers] and
memory resource controls come to fruition?

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  2:11           ` David Rientjes
  2007-10-26  2:29             ` Paul Jackson
@ 2007-10-26 15:30             ` Lee Schermerhorn
  2007-10-26 18:46               ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 15:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel

On Thu, 2007-10-25 at 19:11 -0700, David Rientjes wrote:
> On Thu, 25 Oct 2007, Paul Jackson wrote:
> 
> > David - could you describe the real world situation in which you
> > are finding that this new 'interleave_over_allowed' option, aka
> > 'memory_spread_user', is useful?  I'm not always opposed to special
> > case solutions; but they do usually require special case needs to
> > justify them ;).
> > 
> 
> Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded 
> to include more memory.  The task itself can't access all that memory with 
> the memory policy of its choice.
> 
> Since the cpuset has changed the mems_allowed of the task without its 
> knowledge, it would require a constant get_mempolicy() and set_mempolicy() 
> loop in the application to catch these changes.  That's obviously not in 
> the best interest of anyone.
> 
> So my change allows those tasks that have already expressed the desire to 
> interleave their memory with MPOL_INTERLEAVE to always use the full range 
> of memory available that is dynamically changing beneath them as a result 
> of cpusets.  Keep in mind that it is still possible to request an 
> interleave only over a subset of allowed mems: but you must do it when you 
> create the interleaved mempolicy after it has been attached to the cpuset.
> set_mempolicy() changes are always honored.
> 
> The only other way to support such a feature is through a modification to 
> mempolicies themselves, which Lee has already proposed.  The problem with 
> that is it requires mempolicy support for cpuset cases and modification to 
> the set_mempolicy() API.  My solution presents a cpuset fix for a cpuset 
> problem.

Actually, my patch doesn't change the set_mempolicy() API at all, it
just co-opts a currently unused/illegal value for the nodemask to
indicate "all allowed nodes".  Again, I need to provide a libnuma API to
request this.   Soon come, mon...

Here's a link the last posting of my patch, as Paul requested:

http://marc.info/?l=linux-mm&m=118849999128086&w=4

A bit out of date, but I'll fix that maybe next week.

Lee
<snip>


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26  3:58                   ` David Rientjes
  2007-10-26  4:34                     ` Paul Jackson
@ 2007-10-26 15:37                     ` Lee Schermerhorn
  2007-10-26 17:04                       ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 15:37 UTC (permalink / raw)
  To: David Rientjes
  Cc: Paul Jackson, clameter, akpm, ak, linux-kernel, Michael Kerrisk

On Thu, 2007-10-25 at 20:58 -0700, David Rientjes wrote:
> On Thu, 25 Oct 2007, Paul Jackson wrote:
> 
> > The user space man pages for set_mempolicy(2) are now even more
> > behind the curve, by not mentioning that MPOL_INTERLEAVE's mask
> > might mean nothing, if (1) in a cpuset marked memory_spread_user,
> > (2) after the cpuset has changed 'mems'.
> > 
> 
> Yeah.  They were already outdated in the sense that they did not specify 
> that the interleave nodemask could change as a result of a cpuset mems 
> change.
> 
> > I wonder if there is any way to fix that.  Who does the man pages
> > for Linux system calls?
> > 

Michael Kerrisk, whom I've copied, does.  I recently sent in an update
to all of the mempolicy man pages that describe the behavior as it
currently exists.  [I need to send in an update for
MPOL_F_MEMS_ALLOWED].

One of the things that has bothered me is that there are no cpuset man
pages to reference from the mempolicy man pages.  [I know, we can and do
refer to the kernel source Documentation, but that might not be
available to everyone w/o some digging.  "See Also" refs typically point
at other man pages...].  To get around this, I had to talk about "nodes
allowed in the current context" or some such weasel-wording in my
updates.

Paul:  what do you think about subsetting the cpuset.txt into a man page
or 2 that can be referenced by other man pages' See Also sections?

> 
<snip>
> 
> 		David

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 15:37                     ` Lee Schermerhorn
@ 2007-10-26 17:04                       ` Paul Jackson
  2007-10-26 17:28                         ` Lee Schermerhorn
  2007-10-26 20:21                         ` Michael Kerrisk
  0 siblings, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-26 17:04 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages

Lee wrote:
> Paul:  what do you think about subsetting the cpuset.txt into a man page
> or 2 that can be referenced by other man pages' See Also sections?

Oh dear --- looking back in my work queue I have with my employer, I
see I have a task that is now over a year old, still unfinished, to
provide man pages for cpusets to Michael Kerrisk <mtk-manpages@gmx.net>

So, yes, I agree this would be a "good thing".  I just haven't gotten a
round to it (http://www.quantumenterprises.co.uk/roundtuit/index.htm)
yet.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 17:04                       ` Paul Jackson
@ 2007-10-26 17:28                         ` Lee Schermerhorn
  2007-10-26 20:21                         ` Michael Kerrisk
  1 sibling, 0 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 17:28 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages

On Fri, 2007-10-26 at 10:04 -0700, Paul Jackson wrote:
> Lee wrote:
> > Paul:  what do you think about subsetting the cpuset.txt into a man page
> > or 2 that can be referenced by other man pages' See Also sections?
> 
> Oh dear --- looking back in my work queue I have with my employer, I
> see I have a task that is now over a year old, still unfinished, to
> provide man pages for cpusets to Michael Kerrisk" <mtk-manpages@gmx.net>
> 
> So, yes, I agree this would be a "good thing".  I just haven't gotten a
> round to it (http://www.quantumenterprises.co.uk/roundtuit/index.htm)
> yet.


I'm a little backed up myself, right now, or I'd offer to take a cut for
you to review.  Once I get some free time [Hah!], I'll check with you
again.  If you get started before then, I'd be happy to review.

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 15:18         ` Lee Schermerhorn
@ 2007-10-26 17:36           ` Christoph Lameter
  2007-10-26 18:45           ` David Rientjes
  1 sibling, 0 replies; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26 17:36 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: David Rientjes, Andrew Morton, Andi Kleen, Paul Jackson,
	linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> > With that MPOL_INTERLEAVE would be context dependent and no longer 
> > needs translation. Lee had similar ideas. Lee: Could we make 
> > MPOL_INTERLEAVE generally cpuset context dependent?
> > 
> 
> That's what my "cpuset-independent interleave" patch does.  David
> doesn't like the "null node mask" interface because it doesn't work with
> libnuma.  I plan to fix that, but I'm chasing other issues.  I should
> get back to the mempol work after today.

But this makes it cpuset dependent. The set of nodes is dependent on the 
cpuset. If it would be independent then interleave could allow any nodes 
outside of the cpuset.
 
> What I like about the cpuset independent interleave is that the "policy
> remap" when cpusets are changed is a NO-OP--no need to change the
> policy.  Just as "preferred local" policy chooses the node where the
> allocation occurs, my cpuset independent interleave patch interleaves
> across the set of nodes available at the time of the allocation.  The
> application has to specifically ask for this behavior by the null/empty
> nodemask or the TBD libnuma API.  IMO, this is the only reasonable
> interleave policy for apps running in dynamic cpusets.

Hmmm.. But its an API change and requires more special casing.

> An aside:  if David et al [at google] are using cpusets on fake numa for
> resource management [I don't know this is the case, but saw some
> discussions way back that indicate it might be?], then maybe this
> becomes less of an issue when control groups [a.k.a. containers] and
> memory resource controls come to fruition?

Yes very likely.


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 15:18         ` Lee Schermerhorn
  2007-10-26 17:36           ` Christoph Lameter
@ 2007-10-26 18:45           ` David Rientjes
  2007-10-26 19:02             ` Paul Jackson
  2007-10-27 19:16             ` David Rientjes
  1 sibling, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26 18:45 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson,
	linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> That's what my "cpuset-independent interleave" patch does.  David
> doesn't like the "null node mask" interface because it doesn't work with
> libnuma.  I plan to fix that, but I'm chasing other issues.  I should
> get back to the mempol work after today.
> 

Hacking and requiring an updated version of libnuma to allow empty 
nodemasks to be passed is a poor solution; if mempolicy's are supposed to 
be independent from cpusets, then what semantics does an empty nodemask 
actually imply when using MPOL_INTERLEAVE?  To me, it means the entire 
set_mempolicy() should be a no-op, and that's exactly how mainline 
currently treats it _as_well_ as libnuma.  So justifying this change in 
the man page is respectable, but passing an empty nodemask just doesn't 
make sense.

> What I like about the cpuset independent interleave is that the "policy
> remap" when cpusets are changed is a NO-OP--no need to change the
> policy.  Just as "preferred local" policy chooses the node where the
> allocation occurs, my cpuset independent interleave patch interleaves
> across the set of nodes available at the time of the allocation.  The
> application has to specifically ask for this behavior by the null/empty
> nodemask or the TBD libnuma API.  IMO, this is the only reasonable
> interleave policy for apps running in dynamic cpusets.
> 

Passing empty nodemasks with MPOL_INTERLEAVE to set_mempolicy() is the 
only reasonable way of specifying you want, at all times, to interleave 
over all available nodes?  I doubt it.

I personally prefer an approach where cpusets take the responsibility for 
determining how policies change (they use set_mempolicy() anyway to effect 
their mems boundaries) because it's cpusets that has changed the available 
nodemask out from beneath the application.  So instead of trying to create 
a solution where cpusets impact mempolicies and mempolicies impact 
cpusets, it should only be in a single direction.  Cpusets change the 
set of available nodes and should update the attached tasks' mempolicies 
at the same time.  That's the same as saying that cpusets should be built 
on top of mempolicies, which they are, and shouldn't have any reverse 
dependency.

> An aside:  if David et al [at google] are using cpusets on fake numa for
> resource management [I don't know this is the case, but saw some
> discussions way back that indicate it might be?], then maybe this
> becomes less of an issue when control groups [a.k.a. containers] and
> memory resource controls come to fruition?
> 

Completely irrelevant; I care about the interaction between cpusets and 
mempolicies in mainline Linux.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 15:30             ` Lee Schermerhorn
@ 2007-10-26 18:46               ` David Rientjes
  2007-10-26 19:00                 ` Paul Jackson
  2007-10-26 20:43                 ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26 18:46 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> Actually, my patch doesn't change the set_mempolicy() API at all, it
> just co-opts a currently unused/illegal value for the nodemask to
> indicate "all allowed nodes".  Again, I need to provide a libnuma API to
> request this.   Soon come, mon...
> 

If something that was previously unaccepted is now allowed with a 
newly-introduced semantic, that's an API change.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 18:46               ` David Rientjes
@ 2007-10-26 19:00                 ` Paul Jackson
  2007-10-26 20:45                   ` David Rientjes
  2007-10-26 20:43                 ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26 19:00 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

David wrote:
> If something that was previously unaccepted is now allowed with a 
> newly-introduced semantic, that's an API change.

Agreed, as I wrote earlier:
> It should work with libnuma and be
> fully upward compatible with current code (except perhaps code that
> depends on getting an error from requesting MPOL_INTERLEAVE on a node
> not allowed.)

Without at least this sort of change to MPOL_INTERLEAVE nodemasks,
allowing either empty nodemasks (Lee's proposal) or extending them
outside the current cpuset (what I'm cooking up now), there is no way
for a task that is currently confined to a single node cpuset to say
anything about how it wants to be interleaved in the event that it is
subsequently moved to a larger cpuset.  Currently, such a task is only
allowed to pass exactly one particular nodemask to set_mempolicy
MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its
current node.  No useful information can be passed via an API that only
allows a single legal value.

But you knew that ...

You were just correcting my erroneously unqualified statement.  Good.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 18:45           ` David Rientjes
@ 2007-10-26 19:02             ` Paul Jackson
  2007-10-27 19:16             ` David Rientjes
  1 sibling, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-26 19:02 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

David wrote:
> I personally prefer an approach where cpusets take the responsibility for 
> determining how policies change (they use set_mempolicy() anyway to effect 
> their mems boundaries) because it's cpusets that has changed the available 
> nodemask out from beneath the application.

Agreed.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 17:04                       ` Paul Jackson
  2007-10-26 17:28                         ` Lee Schermerhorn
@ 2007-10-26 20:21                         ` Michael Kerrisk
  2007-10-26 20:25                           ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: Michael Kerrisk @ 2007-10-26 20:21 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Lee Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel,
	mtk-manpages

On 10/26/07, Paul Jackson <pj@sgi.com> wrote:
> Lee wrote:
> > Paul:  what do you think about subsetting the cpuset.txt into a man page
> > or 2 that can be referenced by other man pages' See Also sections?
>
> Oh dear --- looking back in my work queue I have with my employer, I
> see I have a task that is now over a year old, still unfinished, to
provide man pages for cpusets to Michael Kerrisk <mtk-manpages@gmx.net>

Yes, it would be great to have those pages.  Is there anything I can
do to assist?

Cheers,

Michael

PS Note my new address for man-pages: mtk.manpages@gmail.com

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 20:21                         ` Michael Kerrisk
@ 2007-10-26 20:25                           ` Paul Jackson
  2007-10-26 20:33                             ` Michael Kerrisk
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-26 20:25 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Lee.Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel,
	mtk-manpages

Michael wrote:
> PS Note my new address for man-pages: mtk.manpages@gmail.com

Noted.

> Is there anything I can do to assist?

Got any spare round tuit's ;)?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 20:25                           ` Paul Jackson
@ 2007-10-26 20:33                             ` Michael Kerrisk
  0 siblings, 0 replies; 98+ messages in thread
From: Michael Kerrisk @ 2007-10-26 20:33 UTC (permalink / raw)
  To: Paul Jackson
  Cc: Lee.Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel,
	mtk-manpages

On 10/26/07, Paul Jackson <pj@sgi.com> wrote:
> Michael wrote:
> > PS Note my new address for man-pages: mtk.manpages@gmail.com
>
> Noted.
>
> > Is there anything I can do to assist?
>
> Got any spare round tuit's ;)?

I ran out quite some time ago unfortunately.

Cheers,

Michael

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 18:46               ` David Rientjes
  2007-10-26 19:00                 ` Paul Jackson
@ 2007-10-26 20:43                 ` Lee Schermerhorn
  1 sibling, 0 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 20:43 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel

On Fri, 2007-10-26 at 11:46 -0700, David Rientjes wrote:
> On Fri, 26 Oct 2007, Lee Schermerhorn wrote:
> 
> > Actually, my patch doesn't change the set_mempolicy() API at all, it
> > just co-opts a currently unused/illegal value for the nodemask to
> > indicate "all allowed nodes".  Again, I need to provide a libnuma API to
> > request this.   Soon come, mon...
> > 
> 
> If something that was previously unaccepted is now allowed with a 
> newly-introduced semantic, that's an API change.

Well, it's an extension for sure, but a backward compatible one.  It
should not affect any correct existing application--i.e., one that
checks its return status--except maybe the odd test program that needs
to be updated to handle the new semantics.  We're allowed to extend APIs
as long as we don't break correct applications, right?

I mean, it's not like it's a new argument or such.

Lee



^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 19:00                 ` Paul Jackson
@ 2007-10-26 20:45                   ` David Rientjes
  2007-10-26 21:05                     ` Christoph Lameter
  2007-10-26 21:13                     ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26 20:45 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> Without at least this sort of change to MPOL_INTERLEAVE nodemasks,
> allowing either empty nodemasks (Lee's proposal) or extending them
> outside the current cpuset (what I'm cooking up now), there is no way
> for a task that is currently confined to a single node cpuset to say
> anything about how it wants to be interleaved in the event that it is
> subsequently moved to a larger cpuset.  Currently, such a task is only
> allowed to pass exactly one particular nodemask to set_mempolicy
> MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its
> current node.  No useful information can be passed via an API that only
> allows a single legal value.
> 

Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't 
make a whole lot of sense in the first place.  I prefer your solution of 
allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave 
me over everything I'm allowed to access."  NODE_MASK_ALL would be stored 
in the struct mempolicy and used later on mpol_rebind_policy().

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 20:45                   ` David Rientjes
@ 2007-10-26 21:05                     ` Christoph Lameter
  2007-10-26 21:08                       ` David Rientjes
  2007-10-26 21:13                     ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26 21:05 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, David Rientjes wrote:

> Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't 
> make a whole lot of sense in the first place.  I prefer your solution of 
> allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave 
> me over everything I'm allowed to access."  NODE_MASK_ALL would be stored 
> in the struct mempolicy and used later on mpol_rebind_policy().

So instead of an empty nodemask we would pass a nodemask where all bits 
are set? And they would stay set but the cpuset restrictions would 
effectively limit the interleaving to the allowed set?

rebind could ignore rebinds if all bits are set.



^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:05                     ` Christoph Lameter
@ 2007-10-26 21:08                       ` David Rientjes
  2007-10-26 21:12                         ` Christoph Lameter
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-26 21:08 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Christoph Lameter wrote:

> > Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't 
> > make a whole lot of sense in the first place.  I prefer your solution of 
> > allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave 
> > me over everything I'm allowed to access."  NODE_MASK_ALL would be stored 
> > in the struct mempolicy and used later on mpol_rebind_policy().
> 
> So instead of an empty nodemask we would pass a nodemask where all bits 
> are set? And they would stay set but the cpuset restrictions would 
> effectively limit the interleaving to the allowed set?
> 

You would pass NODE_MASK_ALL if your intent was to interleave over 
everything you have access to, yes.  Otherwise you can pass whatever you 
want access to and your interleaved nodemask becomes 
mpol_rebind_policy()'s newmask formal (the cpuset's new mems_allowed) 
AND'd with pol->passed_nodemask.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:08                       ` David Rientjes
@ 2007-10-26 21:12                         ` Christoph Lameter
  2007-10-26 21:15                           ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26 21:12 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, David Rientjes wrote:

> You would pass NODE_MASK_ALL if your intent was to interleave over 
> everything you have access to, yes.  Otherwise you can pass whatever you 
> want access to and your interleaved nodemask becomes 
> mpol_rebind_policy()'s newmask formal (the cpuset's new mems_allowed) 
> AND'd with pol->passed_nodemask.

We would need two fields in the policy structure

1. The specified nodemask (generally ignored)

2. The effective nodemask (specified & cpuset_mems_allowed)

If we have these two then its easy to get a bit further by making
the first nodemask a relative nodemask. The calculation of the effective
nodemask changes somewhat but the logic is then applicable to MPOL_BIND as 
well.


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 20:45                   ` David Rientjes
  2007-10-26 21:05                     ` Christoph Lameter
@ 2007-10-26 21:13                     ` Lee Schermerhorn
  2007-10-26 21:17                       ` Christoph Lameter
  2007-10-26 21:18                       ` David Rientjes
  1 sibling, 2 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 21:13 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Fri, 2007-10-26 at 13:45 -0700, David Rientjes wrote:
> On Fri, 26 Oct 2007, Paul Jackson wrote:
> 
> > Without at least this sort of change to MPOL_INTERLEAVE nodemasks,
> > allowing either empty nodemasks (Lee's proposal) or extending them
> > outside the current cpuset (what I'm cooking up now), there is no way
> > for a task that is currently confined to a single node cpuset to say
> > anything about how it wants to be interleaved in the event that it is
> > subsequently moved to a larger cpuset.  Currently, such a task is only
> > allowed to pass exactly one particular nodemask to set_mempolicy
> > MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its
> > current node.  No useful information can be passed via an API that only
> > allows a single legal value.
> > 
> 
> Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't 
> make a whole lot of sense in the first place.  I prefer your solution of 
> allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave 
> me over everything I'm allowed to access."  NODE_MASK_ALL would be stored 
> in the struct mempolicy and used later on mpol_rebind_policy().

You don't need to save the entire mask--just note that NODE_MASK_ALL was
passed--like with my internal MPOL_CONTEXT flag.  This would involve
special casing NODE_MASK_ALL in the error checking, as currently
set_mempolicy() complains loudly if you pass non-allowed nodes--see
"contextualize_policy()".  [mbind() on the other hand, appears to allow
any nodemask, even outside the cpuset.  guess we catch this during
allocation.]  This is pretty much the spirit of my patch w/o the API
change/extension [/improvement :)]

For some systems [not mine], the nodemasks can get quite large.  I have
a patch, that I've tested  atop Mel Gorman's "onezonelist" patches that
replaces the nodemasks embedded in struct mempolicy with pointers to
dynamically allocated ones.  However, it's probably not much of a win,
memorywise, if most of the uses are for interleave and bind
policies--both of which would always need the nodemasks in addition to
the pointers.

Now, if we could replace the 'cpuset_mems_allowed' nodemask with a
pointer to something stable, it might be a win.

Lee



^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:12                         ` Christoph Lameter
@ 2007-10-26 21:15                           ` David Rientjes
  0 siblings, 0 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26 21:15 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Christoph Lameter wrote:

> We would need two fields in the policy structure
> 
> 1. The specified nodemask (generally ignored)
> 

What I've called pol->passed_nodemask.

> 2. The effective nodemask (specified & cpuset_mems_allowed)
> 

Which is pol->v.nodes.

> If we have these two then its easy to get a bit further by making
> the first nodemask a relative nodemask. The calculation of the effective
> nodemask changes somewhat but the logic is then applicable to MPOL_BIND as 
> well.
> 

Agreed.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:13                     ` Lee Schermerhorn
@ 2007-10-26 21:17                       ` Christoph Lameter
  2007-10-26 21:26                         ` Lee Schermerhorn
  2007-10-26 21:18                       ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26 21:17 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> For some systems [not mine], the nodemasks can get quite large.  I have
> a patch, that I've tested  atop Mel Gorman's "onezonelist" patches that
> replaces the nodemasks embedded in struct mempolicy with pointers to
> dynamically allocated ones.  However, it's probably not much of a win,
> memorywise, if most of the uses are for interleave and bind
> policies--both of which would always need the nodemasks in addition to
> the pointers.
> 
> Now, if we could replace the 'cpuset_mems_allowed' nodemask with a
> pointer to something stable, it might be a win.

The memory policies are already shared and have refcounters for that 
purpose.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:13                     ` Lee Schermerhorn
  2007-10-26 21:17                       ` Christoph Lameter
@ 2007-10-26 21:18                       ` David Rientjes
  2007-10-26 21:31                         ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-26 21:18 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> You don't need to save the entire mask--just note that NODE_MASK_ALL was
> passed--like with my internal MPOL_CONTEXT flag.  This would involve
> special casing NODE_MASK_ALL in the error checking, as currently
> set_mempolicy() complains loudly if you pass non-allowed nodes--see
> "contextualize_policy()".  [mbind() on the other hand, appears to allow
> any nodemask, even outside the cpuset.  guess we catch this during
> allocation.]  This is pretty much the spirit of my patch w/o the API
> change/extension [/improvement :)]
> 

Not really, because perhaps your application doesn't want to interleave 
over all nodes.  I suggested NODE_MASK_ALL as the way to get access to all 
the memory you are allowed, but it's certainly plausible that an 
application could request to interleave only over a subset.  That's the 
entire reason set_mempolicy(MPOL_INTERLEAVE) takes a nodemask anyway right 
now instead of just using task->mems_allowed on each allocation.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:17                       ` Christoph Lameter
@ 2007-10-26 21:26                         ` Lee Schermerhorn
  2007-10-26 21:37                           ` Christoph Lameter
  0 siblings, 1 reply; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 21:26 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel

On Fri, 2007-10-26 at 14:17 -0700, Christoph Lameter wrote:
> On Fri, 26 Oct 2007, Lee Schermerhorn wrote:
> 
> > For some systems [not mine], the nodemasks can get quite large.  I have
> > a patch, that I've tested  atop Mel Gorman's "onezonelist" patches that
> > replaces the nodemasks embedded in struct mempolicy with pointers to
> > dynamically allocated ones.  However, it's probably not much of a win,
> > memorywise, if most of the uses are for interleave and bind
> > policies--both of which would always need the nodemasks in addition to
> > the pointers.
> > 
> > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a
> > pointer to something stable, it might be a win.
> 
> The memory policies are already shared and have refcounters for that 
> purpose.

I must have missed that in the code I'm reading :)

Have a nice weekend.

Lee




^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:18                       ` David Rientjes
@ 2007-10-26 21:31                         ` Lee Schermerhorn
  2007-10-26 21:39                           ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-26 21:31 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Fri, 2007-10-26 at 14:18 -0700, David Rientjes wrote:
> On Fri, 26 Oct 2007, Lee Schermerhorn wrote:
> 
> > You don't need to save the entire mask--just note that NODE_MASK_ALL was
> > passed--like with my internal MPOL_CONTEXT flag.  This would involve
> > special casing NODE_MASK_ALL in the error checking, as currently
> > set_mempolicy() complains loudly if you pass non-allowed nodes--see
> > "contextualize_policy()".  [mbind() on the other hand, appears to allow
> > any nodemask, even outside the cpuset.  guess we catch this during
> > allocation.]  This is pretty much the spirit of my patch w/o the API
> > change/extension [/improvement :)]
> > 
> 
> Not really, because perhaps your application doesn't want to interleave 
> over all nodes.  I suggested NODE_MASK_ALL as the way to get access to all 
> the memory you are allowed, but it's certainly plausible that an 
> application could request to interleave only over a subset.  That's the 
> entire reason set_mempolicy(MPOL_INTERLEAVE) takes a nodemask anyway right 
> now instead of just using task->mems_allowed on each allocation.

So, you pass the subset, you don't set the flag to indicate you want
interleaving over all available.  You must be thinking of some other use
for saving the subset mask that I'm not seeing here.  Maybe restoring to
the exact nodes requested if they're taken away and then re-added to the
cpuset?


Later,
Lee




^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:26                         ` Lee Schermerhorn
@ 2007-10-26 21:37                           ` Christoph Lameter
  2007-10-29 15:00                             ` Lee Schermerhorn
  0 siblings, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-26 21:37 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a
> > > pointer to something stable, it might be a win.
> > 
> > The memory policies are already shared and have refcounters for that 
> > purpose.
> 
> I must have missed that in the code I'm reading :)

What is the benefit of having pointers to nodemasks? We likely would need 
to have refcounts in those nodemasks too? So we duplicate a lot of 
the characteristics of memory policies?

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:31                         ` Lee Schermerhorn
@ 2007-10-26 21:39                           ` David Rientjes
  2007-10-27  1:07                             ` Paul Jackson
  2007-10-29 15:10                             ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-26 21:39 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Lee Schermerhorn wrote:

> So, you pass the subset, you don't set the flag to indicate you want
> interleaving over all available.  You must be thinking of some other use
> for saving the subset mask that I'm not seeing here.  Maybe restoring to
> the exact nodes requested if they're taken away and then re-added to the
> cpuset?
> 

Paul's motivation for saving the passed nodemask to set_mempolicy() is so 
that the _intent_ of the application is never lost.  That's the biggest 
advantage that this method has and that I totally agree with.  So whenever 
the mems_allowed of a cpuset changes, the MPOL_INTERLEAVE nodemask of all 
attached tasks becomes their intent (pol->passed_nodemask) AND'd with the 
new mems_allowed.  That can be done on mpol_rebind_policy() and shouldn't 
be an extensive change.

So MPOL_INTERLEAVE, and possibly other, mempolicies will always try to 
accommodate the intent of the application but only as far as the task's 
cpuset restriction allows them.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:39                           ` David Rientjes
@ 2007-10-27  1:07                             ` Paul Jackson
  2007-10-27  1:26                               ` Christoph Lameter
  2007-10-27 17:45                               ` David Rientjes
  2007-10-29 15:10                             ` Lee Schermerhorn
  1 sibling, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-27  1:07 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

Issue:

    Are the nodes and nodemasks passed into set_mempolicy() to be
    presumed relative to the cpuset or not?  [Careful, this question
    doesn't mean what you might think it means.]

Let's say our system has 100 nodes, numbered 0-99, and we have a task
in a cpuset that includes the twenty nodes 10-29 at the moment.

Currently, if that task does say an MPOL_PREFERRED on node 12, we take
that to mean the 3rd node of its cpuset.  If we move that task to a
cpuset on nodes 40-59, the kernel will change that MPOL_PREFERRED to
node 42.  Similarly for the other MPOL_* policies.

Ok so far ... seems reasonable.  Node numbers passed into the
set_mempolicy call are taken to be absolute node numbers that are to
be mapped relative to the task's current cpuset, perhaps unbeknownst
to the calling task, and remapped if that cpuset changes.

But now imagine that a task happens to be in a cpuset of just two
nodes, and wants to request an MPOL_PREFERRED policy for the fourth
node of its cpuset, anytime there actually is a fourth node.  That
task can't say that using numbering relative to its current cpuset,
because that cpuset only has two nodes.  It could say it relative to
a mask of all possible nodes by asking for the fourth possible node,
likely numbered node 3.

If that task happened to be in a cpuset on nodes 10 and 11, asking
for the fourth node in the system (node 3) would still be rather
unambiguous, as node 3 can't be either of 10 or 11, so must be
relative to all possible nodes, meaning "the fourth available node,
if I'm ever fortunate enough to have that many nodes."

But if that task happened to be in a cpuset on nodes 2 and 3, then
the node number 3 could mean:

Choice A:
    as it does today, the second node in the task's cpuset or it could
    mean

Choice B:
    the fourth node in the cpuset, if available, just as
    it did in the case above involving a cpuset on nodes 10 and 11.

Let me restate this.

Either way, passing in node 3 means node 3, as numbered in the system.

But the question is whether (Choice A) node 3 is specified because
it is the second node in the tasks cpuset, or (Choice B) because it
is the fourth node in the system.

Choice A is what we do now.  But if we stay with Choice A, then a
task stuck in a small cpuset at the moment can't express non-trivial
mempolicy's for larger cpusets that it might be in later.

Choice B lets the task calculate its mempolicy mask as if it owned
the entire system, and express whatever elaborate mempolicy placement
it might need, when blessed with enough memory nodes to matter.
The system would automatically scrunch that request down to whatever
is the current size and placement of the cpuset holding that task.

Given a clean slate, I prefer Choice B.

But Choice B is incompatible.   Switching now would break tasks that
had been carefully adapting their set_mempolicy requests to whatever
nodes were in their current cpuset.  This is probably too incompatible
to be acceptable.

Therefore it must be Choice A.

However ...

If I approach this from another angle, I can show it should be
Choice B.  Fasten your seatbelt ...

Before the days of cpusets, Choice B was essentially how it was.
Tasks computing memory policies for set_mempolicy() calls computed
node numbers as if they owned the entire system.

Essentially, cpusets introduced an incompatibility, imposing Choice
A instead.  If a task wants to name the fourth node allowed to it in
a memory policy, it can no longer just say node "3", but now has to
determine its cpuset, and count off the fourth node that is currently
allowed to it.  This is an inherently racy calculation, of the sort
that some of us would find unacceptable, because it can't cope very
easily with simultaneously changing cpusets.

My hunch (unsupported by any real evidence or experience) is that
there is very little user level code that actually depends on this
incompatible change imposed by cpusets.  I'm guessing that most
codes making heavy use of memory policies are still coded as if the
task owns the system, and would be ill prepared to cope with a heavy
cpuset environment.

If that's the case, we'd break less user code by going with Choice B.

I have a little bit of code that will notice the difference (so if
we go with Choice B, there has to be a way for user level code that
cares to probe which choice applies), but I'm not a major user of
mempolicy calls.

I'll have to rely on the experience of others more involved with memory
policy aware user code as to which Choice would be less disruptive.

Recommendations?


-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  1:07                             ` Paul Jackson
@ 2007-10-27  1:26                               ` Christoph Lameter
  2007-10-27  2:41                                 ` Paul Jackson
  2007-10-27 17:45                               ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-27  1:26 UTC (permalink / raw)
  To: Paul Jackson; +Cc: David Rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> Choice B lets the task calculate its mempolicy mask as if it owned
> the entire system, and express whatever elaborate mempolicy placement
> it might need, when blessed with enough memory nodes to matter.
> The system would automatically scrunch that request down to whatever
> is the current size and placement of the cpuset holding that task.
> 
> Given a clean slate, I prefer Choice B.

Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to 
enable that functionality? A new version of numactl can then enable
that by default for newer applications.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  1:26                               ` Christoph Lameter
@ 2007-10-27  2:41                                 ` Paul Jackson
  2007-10-27  2:50                                   ` Christoph Lameter
  2007-10-27 17:50                                   ` David Rientjes
  0 siblings, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-27  2:41 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

Christoph wrote:
> Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to 
> enable that functionality? A new version of numactl can then enable
> that by default for newer applications.

I'm confused.  If B is the default, then we don't need a flag to
enable it, rather we need a flag to go back to the old choice A.

So are you saying that:
 1) Choice A remains the default for the kernel unless
    MPOL_MF_RELATIVE is added, or
 2) that the new default for the kernel is Choice B,
    unless MPOL_MF_RELATIVE is specified, asking to
    revert to the original Choice A behaviour?

Perhaps, either way, whatever compatibility flag we have should be
something that can be forced on an application from the outside,
perhaps as a per-system mode flag in /sys, or a per-cpuset mode flag,
or a per-task operation, by what mechanism is not clear.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  2:41                                 ` Paul Jackson
@ 2007-10-27  2:50                                   ` Christoph Lameter
  2007-10-27  5:16                                     ` Paul Jackson
  2007-10-27 17:50                                   ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-27  2:50 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> Christoph wrote:
> > Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to 
> > enable that functionality? A new version of numactl can then enable
> > that by default for newer applications.
> 
> I'm confused.  If B is the default, then we don't need a flag to
> enable it, rather we need a flag to go back to the old choice A.

Don't we need it for numactl to preserve backward compatibility? numactl 
can set that flag by default for newer software. We likely need a new 
major release of numactl.
 
> Perhaps, either way, whatever compatibility flag we have should be
> something that can be forced on an application from the outside,
> perhaps as a per-system mode flag in /sys, or a per-cpuset mode flag,
> or a per-task operation, by what mechanism is not clear.

libnuma can take care of that. But we need to have that flag for numactl to be 
backward compatible.


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  2:50                                   ` Christoph Lameter
@ 2007-10-27  5:16                                     ` Paul Jackson
  2007-10-27  6:07                                       ` Christoph Lameter
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-27  5:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

I'm still confused, Christoph.

Are you saying:
 1) The kernel continues to default to Choice A, unless
    the flag enables Choice B, or
 2) The kernel defaults to the new Choice B, unless the
    flag reverts to the old Choice A?

Alternative (2) breaks libnuma and hence numactl until it is changed
to use the flag, or changed to use choice B (in which case it wouldn't
need the flag.)

So I guess you mean alternative (1) above, since you seem to be taking
the position that we can't break compatibility here.

But I could quote statements from you that seem to clearly state the
exact opposite.

So I remain confused.

Actually, alternative (1) is kinda ugly.  It leaves a permanent wart
on the set_mempolicy API -- two different variants to what the node
numbers and node masks mean, depending on whether this MPOL_MF_RELATIVE
is set on each call.  We'll have to ship out an extra serving of brain
food for most folks looking at this to have much chance that they will
confidently understand the difference between the two options selected
by this flag.

I wonder if there might be some way to avoid that permanent ugly wart
on each and every set/get mempolicy system call forever afterward.

Please try to double check your next reply, Christoph.  I'm beginning
to worry that we might be failing to communicate clearly.  Thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  5:16                                     ` Paul Jackson
@ 2007-10-27  6:07                                       ` Christoph Lameter
  2007-10-27  8:36                                         ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-27  6:07 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> Are you saying:
>  1) The kernel continues to default to Choice A, unless
>     the flag enables Choice B, or
>  2) The kernel defaults to the new Choice B, unless the
>     flag reverts to the old Choice A?

If 2) is keeping the API semantics then 2.
 
> Alternative (2) breaks libnuma and hence numactl until it is changed
> to use the flag, or changed to use choice B (in which case it wouldn't
> need the flag.)

2) keeps everything in order. Let everything be as it is today unless
numactl sets the new.
 
> So I guess you mean alternative (1) above, since you seem to be taking
> the position that we can't break compatibility here.

I am getting confused as to which alternative means what.
 
> Actually, alternative (1) is kinda ugly.  It leaves a permanent wart
> on the set_mempolicy API -- two different variants to what the node
> numbers and node masks mean, depending on whether this MPOL_MF_RELATIVE
> is set on each call.  We'll have to ship out an extra serving of brain
> food for most folks looking at this to have much chance that they will
> confidently understand the difference between the two options selected
> by this flag.

Tough. The API needs to remain stable. We can only change it through an 
additional flag that enables the relativeness and the folding the way you 
want it. libnuma may set the flag on its own without the user having to do 
anything.

> I wonder if there might be some way to avoid that permanent ugly wart
> on each and every set/get mempolicy system call forever afterward.

Hmmm.. The alternative is to add new set/get mempolicy functions.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  6:07                                       ` Christoph Lameter
@ 2007-10-27  8:36                                         ` Paul Jackson
  2007-10-27 17:47                                           ` Christoph Lameter
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-27  8:36 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

> > Are you saying:
> >  1) The kernel continues to default to Choice A, unless
> >     the flag enables Choice B, or
> >  2) The kernel defaults to the new Choice B, unless the
> >     flag reverts to the old Choice A?
> 
> If 2) is keeping the API semantics then 2.

No .. (1) keeps the same API semantics.


> Let everything be as it is today unless
> numactl sets the new.
> ...
> Tough. The API needs to remain stable. 

Good - that I understand.  Your position is clear now.

You have chosen (1) above, which keeps Choice A as the default.


Before I leave this part, there is one more thing I kinda really need,
if you could, Christoph.  Could you describe in your own words what you
think Choices A and B mean?  We seem to be having trouble communicating,
and hence there is some risk right now that we don't mean the same thing
by this new "Choice B".

===

Now ... onto the matter of permanent API warts:

> > I wonder if there might be some way to avoid that permanent ugly wart
> > on each and every set/get mempolicy system call forever afterward.
> 
> Hmmm.. The alternative is to add new set/get mempolicy functions.

Other alternatives include a per-system, per-cpuset or per-process
flag, in addition to the per-system call flag you suggested earlier
(MPOL_MF_RELATIVE), or whatever you mean by "new set/get mempolicy
functions" ... could you elaborate on that one?

So ... the question becomes this:

  How do we migrate to Choice B, without leaving both Choices
  permanently supported, and an ugly mode flag selecting the
  non-default Choice, while not breaking API's too abruptly?

Thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  1:07                             ` Paul Jackson
  2007-10-27  1:26                               ` Christoph Lameter
@ 2007-10-27 17:45                               ` David Rientjes
  2007-10-27 21:22                                 ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-27 17:45 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> Choice A:
>     as it does today, the second node in the task's cpuset or it could
>     mean
> 
> Choice B:
>     the fourth node in the cpuset, if available, just as
>     it did in the case above involving a cpuset on nodes 10 and 11.
> 

Thanks for describing the situation with MPOL_PREFERRED so thoroughly.

I prefer Choice B because it does not force mempolicies to have any 
dependence on cpusets with regard to what nodemask is passed.

	[rientjes@xroads ~]$ man set_mempolicy | grep -i cpuset | wc -l
	0

It would be very good to store the passed nodemask to set_mempolicy in 
struct mempolicy, as you've already recommended for MPOL_INTERLEAVE, so 
that you can try to match the intent of the application as much as 
possible.  But since cpusets are built on top of mempolicies, I don't 
think there's any reason why we should respect any nodemask in terms of 
the current cpuset context, whether it's preferred or interleave.

So if you were to pass a nodemask with only the fourth node set for an 
MPOL_PREFERRED mempolicy, the correct behavior would be to prefer the 
fourth node on the system or, if constrained by cpusets, the fourth node 
in the cpuset.  If the cpuset has fewer than four nodes, the behavior 
should be undefined (probably implemented to just cycle the set of 
mems_allowed until you reach the fourth entry).  That's the result of 
constraining a task to a cpuset that obviously wants access to more
nodes -- it's a userspace mistake and abusing cpusets so that the task 
does not get what it expects.

That concept isn't actually new: we already restrict tasks to a certain 
amount of memory by writing to the mems file and just because it happens 
to have access to more memory when unconstrained by cpusets doesn't 
matter.  You've placed it in a cpuset that wasn't prepared to deal with 
what the task was asking for.  At least in the MPOL_PREFERRED case you 
describe above, it'll be dealt with much more pleasantly by at least 
giving it a preferred node as opposed to OOM killing it when a task has 
exhausted its available cpuset-constrained memory.

I'd prefer a solution where mempolicies can always be described and used 
without ever considering cpusets.  Then, a sane implementation will 
configure the cpuset accordingly to accommodate its tasks' mempolicies.  We 
don't want to get in a situation where we are denying a task to be 
attached to a cpuset when there are fewer nodes than the preferred node, 
for example, but we can fallback to better behavior by at least giving it 
a preferred node in the MPOL_PREFERRED case.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  8:36                                         ` Paul Jackson
@ 2007-10-27 17:47                                           ` Christoph Lameter
  2007-10-27 20:59                                             ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: Christoph Lameter @ 2007-10-27 17:47 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

On Sat, 27 Oct 2007, Paul Jackson wrote:

> > Tough. The API needs to remain stable. 
> 
> Good - that I understand.  Your position is clear now.
> 
> You have chosen (1) above, which keeps Choice A as the default.

There can be different defaults for the user space API via libnuma that 
are independent from the kernel API which needs to remain stable. The kernel 
API can be extended but not changed.
 
> > Hmmm.. The alternative is to add new set/get mempolicy functions.
> 
> Other alternatives include a per-system, per-cpuset or per-process
> flag, in addition to the per-system call flag you suggested earlier
> (MPOL_MF_RELATIVE), or whatever you mean by "new set/get mempolicy
> functions" ... could you elaborate on that one?

None of those sound appealing. Multiple processes may run in one cpuset. 
Some of those may be linked to older libnumas and therefore depend on old 
behavior.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27  2:41                                 ` Paul Jackson
  2007-10-27  2:50                                   ` Christoph Lameter
@ 2007-10-27 17:50                                   ` David Rientjes
  2007-10-27 23:19                                     ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-27 17:50 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Fri, 26 Oct 2007, Paul Jackson wrote:

> > Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to 
> > enable that functionality? A new version of numactl can then enable
> > that by default for newer applications.
> 
> I'm confused.  If B is the default, then we don't need a flag to
> enable it, rather we need a flag to go back to the old choice A.
> 

I think there's a mixup in the flag name there, but I actually would 
recommend against any flag to effect Choice A.  It's simply going to be 
too complex to describe and is going to be a headache to code and support.  

The MPOL_PREFERRED behavior when constrained by cpusets was previously, to 
my knowledge, undocumented; you're in the position to make the behavior do 
what you want it to do and then release documentation so we'll finally 
have a complete and unambiguous API for it.  Right now it should be 
considered undefined and thus you are free to implement it as you choose.  
Then all callers of set_mempolicy(MPOL_PREFERRED) will standardize on that 
and not have to worry about the machine's 
mpol_preferred_relative_to_cpuset setting.

Then, any task that is attached to a cpuset and expecting the fourth node 
in their set_mempolicy(MPOL_PREFERRED) call to mean system node 3 if 
it's in the cpuset's mems_allowed will be broken.  If you want that, 
you'll need your task to be attached to a cpuset with at least mems 0-3; 
programmers will pick that up quickly enough if it's clearly documented.
I think Choice B is correct and makes more sense in terms of the semantics 
and at least allows mempolicies and cpusets to play nicely together 
without a bidirectional dependency on one another.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 18:45           ` David Rientjes
  2007-10-26 19:02             ` Paul Jackson
@ 2007-10-27 19:16             ` David Rientjes
  2007-10-29 16:23               ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-27 19:16 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson,
	linux-kernel

On Fri, 26 Oct 2007, David Rientjes wrote:

> Hacking and requiring an updated version of libnuma to allow empty 
> nodemasks to be passed is a poor solution; if mempolicy's are supposed to 
> be independent from cpusets, then what semantics does an empty nodemask 
> actually imply when using MPOL_INTERLEAVE?  To me, it means the entire 
> set_mempolicy() should be a no-op, and that's exactly how mainline 
> currently treats it _as_well_ as libnuma.  So justifying this change in 
the man page is respectable, but passing an empty nodemask just doesn't 
> make sense.
> 

Another reason that passing an empty nodemask to set_mempolicy() doesn't 
make sense is that libnuma uses numa_set_interleave_mask(&numa_no_nodes)
to disable interleaving completely.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 17:47                                           ` Christoph Lameter
@ 2007-10-27 20:59                                             ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-27 20:59 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel

> > You have chosen (1) above, which keeps Choice A as the default.
> 
> There can be different defaults for the user space API via libnuma that 
are independent from the kernel API which needs to remain stable. The kernel 
> API can be extended but not changed.

Yes - the user level code can have different defaults too.

I was discussing what should be the default kernel API.

> None of those [alternatives] sound appealing.  Multiple processes may run
> in one cpuset.

Well, that would justify keeping this choice per-task.  I tend to
agree with that.

But that doesn't justify having to specify it on each system call.

In another reply David recommends against supporting Choice A at all.
I'm inclined to agree with him.  I'll reply there, with more thoughts.

But if we did support Choice A, as a backwards compatible alternative
to Choice B, I'd suggest a per-task mode, not per-system call mode.
This would reduce the impact on the API of the ugly, unobvious, modal
flag needed to select the optional, non kernel default, Choice B
semantics.

I still have low confidence that you (Christoph) and I have the same
understanding of what these Choice A and B are.  Hopefully you can
address that, perhaps by briefly describing these choices in your words.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 17:45                               ` David Rientjes
@ 2007-10-27 21:22                                 ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-27 21:22 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

David wrote:
> I prefer Choice B because it does not force mempolicies to have any 
> dependence on cpusets with regard to what nodemask is passed.

Yes, well said.

> It would be very good to store the passed nodemask to set_mempolicy in 
> struct mempolicy, 

Yes - that's what I'm intending to do.

> If the cpuset has fewer than four nodes, the behavior 
> should be undefined (probably implemented to just cycle the set of 
> mems_allowed until you reach the fourth entry).

I do intend to implement it as you suggest.  See the lib/bitmap.c
routines bitmap_remap() and bitmap_bitremap(), and the nodemask
wrappers for these, nodes_remap() and node_remap().  They will
define the cycling, or I sometimes call it folding.

I would have tended to make this folding a defined part of the API,
though I will grant that the possibility of being lazy and forgetting
to document it seems attractive (less to document ;).

> That [running in a cpuset with fewer nodes than used in a memory policy
> mask] is the result of constraining a task to a cpuset that obviously
> wants access to more nodes -- it's a userspace mistake and abusing
> cpusets so that the task does not get what it expects.

Nah - I wouldn't put it that way.  It's no mistake or abuse.  It's just
one more example of a kernel making too few resources look sufficient
by sharing, multiplexing and virtualizing them.  That's what kernels do.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 17:50                                   ` David Rientjes
@ 2007-10-27 23:19                                     ` Paul Jackson
  2007-10-28 18:19                                       ` David Rientjes
  2007-10-29 16:54                                       ` Lee Schermerhorn
  0 siblings, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-27 23:19 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> I think there's a mixup in the flag name [MPOL_MF_RELATIVE] there

Most likely.  The discussion involving that flag name was kinda mixed up ;).

> but I actually would recommend against any flag to effect Choice A.
> It's simply going to be too complex to describe and is going to be a
> headache to code and support. 

While I am sorely tempted to agree entirely with this, I suspect that
Christoph has a point when he cautions against breaking this kernel API.

Especially for users of the set/get mempolicy calls coming in via
libnuma, we have to be very careful not to break the current behaviour,
whether it is documented API or just an accident of the implementation.

There is a fairly deep and important stack of software, involving a
well known DBMS product whose name begins with 'O', sitting on that
libnuma software stack.  Steering that solution stack is like steering
a giant oil tanker near shore.  You take it slow and easy, and listen
closely to the advice of the ancient harbor master.  The harbor masters
in this case are or were Andi Kleen and Christoph Lameter.

> It's simply going to be too complex to describe and is going to be a
> headache to code and support.

True, which is why I am hoping we can keep this modal flag, if such be,
from having to be used on every set/get mempolicy call.  The ordinary
coder of new code using these calls directly should just see Choice B
behaviour.  However the user of libnuma should continue to see whatever
API libnuma supports, with no change whatsoever, and various versions of
libnuma, including those already shipped years ago, must continue to
behave without any changes in node numbering.

There are two decent looking ways (and some ugly ways) that I can see
to accomplish this:

 1) One could claim that no important use of Oracle over libnuma over
    these memory policy calls is happening on a system using cpusets.
    There would be a fair bit of circumstantial evidence for this
    claim, but I don't know it for a fact, and would not be the
    expert to determine this.  On systems making no use of cpusets,
    these two Choices A and B are identical, and this is a non-issue.
    Those systems will see no API changes whatsoever from any of this.

 2) We have a per-task mode flag selecting whether Choice A or B
    node numbering apply to the masks passed in to set_mempolicy.

    The kernel implementation is fairly easy.  (Yeah, I know, I
    too cringe every time I read that line ;)

    If Choice A is active, then we continue to enforce the current
    check, in mm/mempolicy.c: contextualize_policy(), that the passed
    in mask be a subset of the current cpusets allowed memory nodes,
    and we add a call to nodes_remap(), to map the passed in nodemask
    from cpuset centric to system centric (if they asked for the
    fourth node in their current cpuset, they get the fourth node in
    the entire system.)

    Similarly, for masks passed back by get_mempolicy, if Choice A
    is active, we use nodes_remap() to convert the mask from system
    centric back to cpuset centric.

    There is a subtle change in the kernel API's here:

	In current kernels, which are Choice A, if a task is moved from
	a big cpuset (many nodes) to a small cpuset and then -back-
	to a big cpuset, the nodemasks returned by get_mempolicy
	will still show the smaller masks (fewer set nodes) imposed
	by the smaller cpuset.

	In todays kernels, once scrunched or folded down, the masks
	don't recover their original size after the task is moved
	back to a large cpuset.

	With this change, even a task asking for Choice A would,
	once back on a larger cpuset, again see the larger masks from
	get_mempolicy queries.  This is a change in the kernel API's
	visible to user space; but I really do not think that there
	is sufficient use of Oracle over libnuma on systems actively
	moving tasks between differing size cpusets for this to be
	a problem.

	Indeed, if there was much such usage, I suspect they'd
	be complaining that the current kernel API was borked, and
	they'd be filing a request for enhancement -asking- for just
	this subtle change in the kernel API's here.  In other words,
	this subtle API change is a feature, not a bug ;)

    The bulk of the kernel's mempolicy code is coded for Choice B.

    If Choice B is active, we don't enforce the subset check in
    contextualize_policy(), and we don't invoke nodes_remap() in either
    of the set or get mempolicy code paths.

    A new option to get_mempolicy() would query the current state of
    this mode flag, and a new option to set_mempolicy() would set
    and clear this mode flag.  Perhaps Christoph had this in mind
    when he wrote in an earlier message "The alternative is to add
    new set/get mempolicy functions."

    The default kernel API for each task would be Choice B (!).

    However, in deference to the needs of libnuma, if the following
    call was made, this would change the mode for that task to
    Choice A:

	get_mempolicy(NULL, NULL, 0, 0, 0);

    This last detail above is an admitted hack.  *So far as I know*
    it happens that all current infield versions of libnuma issue the
    above call, as their first mempolicy query, to determine whether
    the active kernel supports mempolicy.

    The mode for each task would be inherited across fork, and reset
    to Choice (B) on exec.

If we determine that we must go with a new flag bit to be passed in
on each and every get and set mempolicy call that wants Choice B node
numbering rather than Choice A, then I will need (1) a bottle of rum,
and (2) a credible plan in place to phase out this abomination ;).

There would be just way too many coding errors and coder frustrations
introduced by requiring such a flag on each and every mempolicy system
call that wants the alternative numbering.  There must be some
international treaty that prohibits landmines capable of blowing ones
foot off that would apply here.

There are two major user level libraries sitting on top of this API,
libnuma and libcpuset.  Libnuma is well known; it was written by Andi
Kleen.  I wrote libcpuset, and while it is LGPL licensed, it has not
been publicized very well yet.  I can speak for libcpuset: it could
adapt to the above proposal, in particular to the details in way (2),
just fine.  Old versions of libcpuset running on new kernels will
have a little bit of subtle breakage, but not in areas that I expect
will cause much grief.  Someone more familiar with libnuma than I would
have to examine the above proposal in way (2) to be sure that we weren't
throwing libnuma some curveball that was unnecessarily troublesome.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 23:19                                     ` Paul Jackson
@ 2007-10-28 18:19                                       ` David Rientjes
  2007-10-28 23:46                                         ` Paul Jackson
  2007-10-29 16:54                                       ` Lee Schermerhorn
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-28 18:19 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Sat, 27 Oct 2007, Paul Jackson wrote:

> > but I actually would recommend against any flag to effect Choice A.
> > It's simply going to be too complex to describe and is going to be a
> > headache to code and support. 
> 
> While I am sorely tempted to agree entirely with this, I suspect that
> Christoph has a point when he cautions against breaking this kernel API.
> 
> Especially for users of the set/get mempolicy calls coming in via
> libnuma, we have to be very careful not to break the current behaviour,
> whether it is documented API or just an accident of the implementation.
> 

From a standpoint of the MPOL_PREFERRED memory policy itself, there is no 
documented behavior or standard that specifies its interaction with 
cpusets.  Thus, it's "undefined."  We are completely free to implement an 
undefined behavior as we choose and change it as Linux matures.

Once it is defined, however, we carry the burden of protecting 
applications that are written on that definition.  That's the point where 
we need to get it right and if we don't, we're stuck with it forever; I 
don't believe we're at that point with MPOL_PREFERRED policies under 
cpusets right now.

> There is a fairly deep and important stack of software, involving a
> well known DBMS product whose name begins with 'O', sitting on that
> libnuma software stack.  Steering that solution stack is like steering
> a giant oil tanker near shore.  You take it slow and easy, and listen
> closely to the advice of the ancient harbor master.  The harbor masters
> in this case are or were Andi Kleen and Christoph Lameter.
> 

Ok, let's take a look at some specific unproprietary examples of tasks 
that use set_mempolicy(MPOL_PREFERRED) for a specific node, intending it 
to be the actual system node offset, that is then assigned to a cpuset 
that doesn't require that offset to be allowed.

I think it's going to become pretty difficult to find an example because 
the whole scenario is pretty lame: you would need to already know which 
nodes you're going to be assigned to in the cpuset to ask for one of them 
as your preferred node.  I don't imagine any application can have that 
type of foresight and, if it does, then we certainly shouldn't support the 
preferred node_remap() when it changes mems.

You're trying to support a scheme, in Choice A, where an application knows 
it's going to be assigned to a range of nodes (for example, 1-3) and wants 
the preferred node to be included (for example, 2).  So now the 
application must have control over both its memory policy and its cpuset 
placement.  Then it must be willing to change its cpuset placement to a 
different set of nodes (with equal or greater cardinality) and have the 
preferred node offset respected.  Why can't it simply then issue another 
set_mempolicy(MPOL_PREFERRED) call for the new preferred node?

See?  The problem is that you're trying to protect applications that know 
its initial cpuset mems [the only way it could ever send a 
set_mempolicy(MPOL_PREFERRED) for the right node in that range in the 
first place] but then seemingly loses control over its cpuset and intends 
for the kernel to fix it up for it without having the burden of issuing 
another set_mempolicy() call.

And you're trying to protect this application that based this 
implementation not on a standard or documentation, but on its observed 
behavior.  My bet is that it's going to issue that subsequent 
set_mempolicy(), at least if libnuma returned a numa_preferred() value 
that it wasn't expecting.

> True, which is why I am hoping we can keep this modal flag, if such be,
> from having to be used on every set/get mempolicy call.  The ordinary
> coder of new code using these calls directly should just see Choice B
> behaviour.  However the user of libnuma should continue to see whatever
> API libnuma supports, with no change whatsoever, and various versions of
> libnuma, including those already shipped years ago, must continue to
> behave without any changes in node numbering.
> 

I don't see how you can accomplish that.  If the default behavior is 
Choice B, which is different from what is currently implemented in the 
kernel, you're going to either require a modification to the application 
to set a flag asking for Choice A again or make the default kernel 
behavior that of Choice A and set a flag implicitly via libnuma when 
future versions are released.

In the former case, just ask the application to adjust its node numbering 
scheme or check the result of numa_preferred().  In the latter case, we're 
not even talking about changing the kernel default anymore to Choice B.

>  2) We have a per-task mode flag selecting whether Choice A or B
>     node numbering apply to the masks passed in to set_mempolicy.
> 
>     The kernel implementation is fairly easy.  (Yeah, I know, I
>     too cringe everytime I read that line ;)
> 

If you add this per-task mode flag to default to Choice A for preferred 
memory policies, it'll be extremely confusing to document and support.  If 
it's already decided that we should default to Choice B, it's going to 
require an update to the application to write to /proc/pid/i_want_choice_A 
or use the new set_mempolicy() option anyway, so instead of adding that 
hack you should simply fix your node numbering.

And I suspect that if that per-task mode flag is added, it will eventually 
be the subject of a thread with the subject "is this highly specialized 
flag even used anymore?" at which point it will be marked deprecated and 
eventually obsoleted.

>     The bulk of the kernel's mempolicy code is coded for Choice B.
> 
>     If Choice B is active, we don't enforce the subset check in
>     contextualize_policy(), and we don't invoke nodes_remap() in either
>     of the set or get mempolicy code paths.
> 

Yeah, remapping the nodemask is a bad idea anyway to get a preferred node.  
Preferred nodes inherently deal with offsets from node 0 anyway.

>     A new option to get_mempolicy() would query the current state of
>     this mode flag, and a new option to set_mempolicy() would set
>     and clear this mode flag.  Perhaps Christoph had this in mind
>     when he wrote in an earlier message "The alternative is to add
>     new set/get mempolicy functions."
> 

That still requires a change to the application.  So they should simply 
rethink their node numbering instead and fix their application to follow a 
behavior that will, at that point, be documented.

Any application that doesn't respect the return value of 
set_mempolicy(MPOL_PREFERRED) node isn't worth supporting anyway.

There's two cases to think about:

 - When the cpuset assignment changes from the root cpuset to a
   user-created cpuset with a subset of system mems and then
   set_mempolicy() is called, and

 - When set_mempolicy() is called and then the cpuset mems change either
   because it was attached to a different cpuset or someone wrote to its
   'mems' file.

In the first case, the new API should return -EINVAL if you ask for a 
preferred node offset that is smaller than the cardinality of your 
mems_allowed.  That will catch some of these applications that may have 
actually been implemented based on the current undocumented behavior.

In the second case, the first node in the nodemask passed to 
set_mempolicy() was a system node offset anyway and had nothing to do with 
cpusets (it was a member of the root cpuset with access to all mems) so it 
already behaves as Choice B.

> There are two major user level libraries sitting on top of this API,
> libnuma and libcpuset.  Libnuma is well known; it was written by Andi
> Kleen.  I wrote libcpuset, and while it is LGPL licensed, it has not
> been publicized very well yet.  I can speak for libcpuset: it could
> adapt to the above proposal, in particular to the details in way (2),
> just fine.  Old versions of libcpuset running on new kernels will
> have a little bit of subtle breakage, but not in areas that I expect
> will cause much grief.  Someone more familiar with libnuma than I would
> have to examine the above proposal in way (2) to be sure that we weren't
> throwing libnuma some curveball that was unnecessarily troublesome.
> 

I think any application that gets constrained to a subset of nodes in its 
mems_allowed and then bases its preferred node number off that subset to 
create an offset that is intended to be preserved over subsequent mems 
changes without rechecking the result with numa_preferred() or issuing a 
subsequent set_mempolicy() is poorly written.  Especially since that 
behavior was undocumented.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-28 18:19                                       ` David Rientjes
@ 2007-10-28 23:46                                         ` Paul Jackson
  2007-10-29  1:04                                           ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-28 23:46 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> From a standpoint of the MPOL_PREFERRED memory policy itself, there
> is no documented behavior or standard that specifies its interaction
> with cpusets.  Thus, it's "undefined."  We are completely free
> to implement an undefined behavior as we choose and change it as
> Linux matures.

You state this point clearly, but I have to disagree.

The Linux documentation is not a legal contract.  Anytime we change the
actual behaviour of the code, we have to ask ourselves what will be the
impact of that change on existing users and usages.  The burden is on
us to minimize breaking things (by that I mean, what users would
consider breakage, even if we think it is all for the better and that
their code was the real problem.)  I didn't say no breakage, but
minimum breakage, doing our best to guide users through changes with
minimum disruption to their work.

Linux is gaining market share rapidly because we co-operate with our
users to give us both the best chance of succeeding.

We don't just play gotcha games with the documentation -- ha ha --
we didn't document that detail, so it's your fault for ever depending
on it.  And besides your code sucks.  So there!  Let's leave that game
for others.

> And you're trying to protect this application that based this 
> implementation not on a standard or documentation, but on its observed 
> behavior.

Yes, if there is such an application, I'm trying to protect it.

> My bet is that it's going to issue that subsequent 
> set_mempolicy(), at least if libnuma returned a numa_preferred() value 
> that it wasn't expecting.

Perhaps.  Perhaps not.  I don't know.

> That still requires a change to the application.

If that were so, then yes much of your subsequent reasoning would follow.

However, let me repeat the following from my previous message:

>     However, in deference to the needs of libnuma, if the following
>     call was made, this would change the mode for that task to
>     Choice A:
> 
> 	get_mempolicy(NULL, NULL, 0, 0, 0);
> 
>     This last detail above is an admitted hack.  *So far as I know*
>     it happens that all current infield versions of libnuma issue the
>     above call, as their first mempolicy query, to determine whether
>     the active kernel supports mempolicy.

The above is the hack that allows us to support existing libnuma based
applications (the most significant users of memory policy historically)
with a default of Choice A, while other code and future code defaults
to Choice B.

> See?  The problem is that you're trying to protect applications that know 
> its initial cpuset mems [the only way it could ever send a 
> set_mempolicy(MPOL_PREFERRED) for the right node in that range in the 
> first place] but then seemingly loses control over its cpuset and intends 
> for the kernel to fix it up for it without having the burden of issuing 
> another set_mempolicy() call.

That's not the only sort of application I'm trying to protect.

I'm trying to protect almost any application that uses both
set_mempolicy or mbind, while in a cpuset.

    If a task is in a cpuset on say nodes 16-23, and it wants to issue
    any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE
    mempolicy call, then under Choice A it must issue nodemasks offset
    by 16, relative to what it would issue under Choice B.

Almost any task using memory policies on a system making active use of
cpusets will be affected, even well written ones doing simple things.

I am more concerned that the above hack for libnuma isn't enough,
rather than it is unnecessary.

I think the above hack covers existing libnuma users rather well,
though I could be wrong even here, as I don't actually work with
most of the existing libnuma users.

And I can cover those using both memory policies and cpusets via
libcpuset, as there I am the expert and know that I can guide
libcpuset and its users through this change.  There will be some
breakage, but I know how to manage it.

However, if anyone is deploying product or has important (to them) but
not easy to change software using both memory policies and cpusets that
are not doing so via libnuma or libcpuset, then any change that
changes the default for their memory policy calls from Choice A to
Choice B will probably break them.

Perhaps there are no significant cases like this, using memory policies
on cpuset managed systems, but not via libnuma or libcpuset.  It might
be that the only way to smoke out such cases is to ship the change (to
Choice B default) and see what squawks.  That kinda sucks.

I'm tempted to think I need to go a bit further:
 1) Add a per-system or per-cpuset runtime mode, to enable a system
    administrator to revert to the Choice A default.
 2) Make sure that both the new libnuma and libcpuset versions dynamically
    probe the state of this default, and dynamically adapt to running
    on a kernel, or in a cpuset, of either default.  If this mode is
    per-cpuset, then so long as there are not two applications that
    must both run in the same cpuset, both using memory policies,
    neither using libnuma or libcpuset, requiring conflicting defaults
    (one Choice A, the other Choice B) then this would provide an
    administrator sufficient flexibility to adapt.

There would be one key advantage to a per-cpuset mode flag here.  It
exposes in a fairly visible way this change in the default numbering of
memory policy node masks, from cpuset relative to system relative (with
the system now automatically making them cpuset relative across any
changes in the number of nodes in the cpuset.)  This is a classic way
to guide users to understanding new alternatives and changes; expose it
as a 'button on the dashboard', a feature, not a hidden gotcha.

It's the old "it's a feature, not a bug" approach.  It can be a
useful mechanism to educate and empower customers.

Users end up seeing it, learning about it, enjoying some sense of
control, and usually being able to make it work for them, one way
or the other.  That works out better than just hitting some random
subset of users with subtle bugs (from their perspective) over which
they have little or no control and fewer clues.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-28 23:46                                         ` Paul Jackson
@ 2007-10-29  1:04                                           ` David Rientjes
  2007-10-29  4:27                                             ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-29  1:04 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Sun, 28 Oct 2007, Paul Jackson wrote:

> The Linux documentation is not a legal contract.  Anytime we change the
> actual behaviour of the code, we have to ask ourselves what will be the
> impact of that change on existing users and usages.  The burden is on
> us to minimize breaking things (by that I mean, what users would
> consider breakage, even if we think it is all for the better and that
> their code was the real problem.)  I didn't say no breakage, but
> minimum breakage, doing our best to guide users through changes with
> minimum disruption to their work.
> 

Nobody can show an example of an application that would be broken because 
of this and, given the scenario and sequence of events that it requires to 
be broken when implementing the default as Choice B, I don't think it's as 
much of an issue as you believe.

> >     However, in deference to the needs of libnuma, if the following
> >     call was made, this would change the mode for that task to
> >     Choice A:
> > 
> > 	get_mempolicy(NULL, NULL, 0, 0, 0);
> > 
> >     This last detail above is an admitted hack.  *So far as I know*
> >     it happens that all current infield versions of libnuma issue the
> >     above call, as their first mempolicy query, to determine whether
> >     the active kernel supports mempolicy.
> 
> The above is the hack that allows us to support existing libnuma based
> applications (the most significant users of memory policy historically)
> with a default of Choice A, while other code and future code defaults
> to Choice B.
> 

So all applications that use the libnuma interface and numactl will have 
different default behavior than those that simply issue 
{get,set}_mempolicy() calls.  libnuma is a collection of higher level 
functions that should be built upon {get,set}_mempolicy() like they 
currently are and not introduce new subtleties like changing the semantics 
of a preferred node argument.  This is going to quickly become a 
documentation nightmare and, in my opinion, isn't worth the time or effort 
to support because we haven't even identified any real-world examples.

Maybe Andi Kleen should weigh in on this topic because, if we go with what 
you're suggesting, we'll never get rid of the two differing behaviors and 
we'll be introducing different semantics to arguments of libnuma functions 
than the kernel API they are built upon.

> I'm trying to protect almost any application that uses both
> set_mempolicy or mbind, while in a cpuset.
> 
>     If a task is in a cpuset on say nodes 16-23, and it wants to issue
>     any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE
>     mempolicy call, then under Choice A it must issue nodemasks offset
>     by 16, relative to what it would issue under Choice B.
> 

True, but the ordering of that scenario is troublesome.  The correct way 
to implement it is to use set_mempolicy() or a higher level libnuma 
function with the same semantics and _then_ attach the task to a cpuset.  
Then the nodes_remap() takes care of the rest.

The scenario you describe above has a problem because it requires the task 
to have knowledge of the cpuset's mems in which it is attached when, for 
portability, it should have been written so that it is robust to any range 
of nodes you happen to assign it to.

> Almost any task using memory policies on a system making active use of
> cpusets will be affected, even well written ones doing simple things.
> 

No, because nodes_remap() takes care of the instances you describe above 
when the task sets its memory policy (usually done when it is started) and 
is then attached to a cpuset.

> However, if anyone is deploying product or has important (to them) but
> not easy to change software using both memory policies and cpusets that
> are not doing so via libnuma or libcpuset, then any change that
> changes the default for their memory policy calls from Choice A to
> Choice B will probably break them.
> 

Supporting two different behaviors is going to be more problematic than 
simply selecting one and going with it and its associated documentation in 
future versions of the kernel.

> I'm tempted to think I need to go a bit further:
>  1) Add a per-system or per-cpuset runtime mode, to enable a system
>     administrator to revert to the Choice A default.

Paul, the changes required to an application that is currently using 
{get,set}_mempolicy() calls to setup the memory policy or the higher level 
functions through libnuma is so easy to use Choice B as a default instead 
of Choice A that it would be ridiculous to support configuring it on a 
per-system or per-cpuset basis.

>  2) Make sure that both the new libnuma and libcpuset versions dynamically
>     probe the state of this default, and dynamically adapt to running
>     on a kernel, or in a cpuset, of either default.  If this mode is
>     per-cpuset, then so long as there are not two applications that
>     must both run in the same cpuset, both using memory policies,
>     neither using libnuma or libcpuset, requiring conflicting defaults
>     (one Choice A, the other Choice B) then this would provide an
>     administrator sufficient flexibility to adapt.
> 

Choosing only one behavior for the kernel (Choice B) is by far the 
superior selection because then any task can share a cpuset with any other 
task and implement its memory policy preferences in terms of low level 
system calls, numactl, or libnuma.  That's the power that we should be 
giving users, not the addition of hacks or more configuration knobs that 
is going to clutter and confuse anybody who wants to simply pick a 
preferred node.

> There would be one key advantage to a per-cpuset mode flag here.  It
> exposes in a fairly visible way this change in the default numbering of
> memory policy node masks, from cpuset relative to system relative (with
> the system now automatically making them cpuset relative across any
> changes in the number of nodes in the cpuset.)  This is a classic way
> to guide users to understanding new alternatives and changes; expose it
> as a 'button on the dashboard', a feature, not a hidden gotcha.
> 

Yet the 'mems' file would still be system-wide; otherwise it would be 
impossible to expand the memory your cpuset has access to.  Everything 
else would be relative to 'mems'.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  1:04                                           ` David Rientjes
@ 2007-10-29  4:27                                             ` Paul Jackson
  2007-10-29  4:47                                               ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-29  4:27 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

> Nobody can show an example of an application that would be broken because 
> of this and, given the scenario and sequence of events that it requires to 
> be broken when implementing the default as Choice B, I don't think it's as 
> much of an issue as you believe.

Well, neither you nor I have shown an example.  That's different than
"nobody can."

Since it would affect any task setting memory policies while in a
cpuset holding less than all memory nodes, it seems potentially serious
to me.

Actually, I have one example.  The libcpuset library would have some
breakage with Choice B the only Choice.  But I'm in a position to deal
with that, so it's not a big deal.


> So all applications that use the libnuma interface and numactl will have 
> different default behavior than those that simply issue 
> {get,set}_mempolicy() calls.

Breaking the libnuma-Oracle solution stack is not an option.

And, unless someone in the know tells us otherwise, I have to assume
that this could break them.  Now, the odds are that they simply don't
run that solution stack on any system making active use of cpusets,
so the odds are this would be no problem for them.  But I don't
presently have enough knowledge of their situation to take that risk.


> if we go with what you're suggesting, we'll never get rid of the two
> differing behaviors and we'll be introducing different semantics
> to arguments of libnuma functions than the kernel API they are
> built upon.

We could get rid of Choice A once libnuma and libcpuset have adapted
to Choice B, and any other uses of Choice A that we've subsequently
identified have had sufficient time to adapt.

But dual support is pretty easy so far as the kernel code is concerned.
It's just a few nodes_remap() calls optionally invoked at a few key
spots in mm/mempolicy.c.  Consequently there won't be a big hurry to
remove Choice A.


> > I'm trying to protect almost any application that uses both
> > set_mempolicy or mbind, while in a cpuset.
> > 
> >     If a task is in a cpuset on say nodes 16-23, and it wants to issue
> >     any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE
> >     mempolicy call, then under Choice A it must issue nodemasks offset
> >     by 16, relative to what it would issue under Choice B.
> > 
> 
> True, but the ordering of that scenario is troublesome.  The correct way 
> to implement it is to use set_mempolicy() or a higher level libnuma 
> function with the same semantics and _then_ attach the task to a cpuset.  
> Then the nodes_remap() takes care of the rest.

There is no "_then_ attach the task to a cpuset."  On systems with
kernels configured with CONFIG_CPUSETS=y, all tasks are in a cpuset
all the time.  Moreover, from a practical point of view, on large
systems managed with cpuset based mechanisms, almost all tasks are in
cpusets that do not include all nodes, for the entire life of the task.

And besides, I can't break existing applications willy-nilly, and
then claim it's their fault, because they should have been coded
differently.  So "correct way" arguments don't hold a lot of weight
for already released and deployed product.


> Paul, the changes required to an application that is currently using 
> {get,set}_mempolicy() calls to setup the memory policy or the higher level 
> functions through libnuma is so easy to use Choice B as a default instead 
> of Choice A that it would be ridiculous to support configuring it on a 
> per-system or per-cpuset basis.

David ;)  I make some effort to avoid forcing applications to be
recoded and rebuilt in order to continue functioning.


> Yet the 'mems' file would still be system-wide; otherwise it would be 
> impossible to expand the memory your cpuset has access to.

I had to read that a couple of times to make sense of it.  I take that
it means that the node numbering used in each cpuset's 'mems' file has
to be system-wide.  Yes, agreed.

(Well, actually, the node numbering of each cpuset's 'mems' file could
be relative to its parent cpuset's 'mems' numbers, but let's not go
there, as this discussion is already sufficiently complicated ;)


> Everything else would be relative to 'mems'.

That's what Choice B states, yes.  Though to be clear, time for another
example:
  * task is in cpuset with mems: 24-31
  * task wants some memory policy on the first two nodes of its cpuset.
  * by Choice A, it asks for nodes 24 and 25
  * by Choice B, it asks for nodes 0 and 1

The Choice B numbering can be thought of as cpuset relative.  In it,
node N means the N-th node in my current cpuset, modulo whatever is the
node size of that cpuset.

However ...

We need to continue to support Choice A as well, perhaps for some
interim, perhaps forever.  Which doesn't much matter for now.

===

David - how would the following do for you?

Would it meet the need that prompted your initial patch set if we
added Choice B memory policy node numbering, but left Choice A as the
kernel default, with a per-task option (perhaps invokable by a new
option to one of the {get,set}_mempolicy() calls) to choose Choice B?

This lets us get Choice B out there, and lets the two main libraries,
libnuma and libcpuset, dynamically adapt to whichever Choice is active
for the current task.

Unchanged applications and existing binaries would simply continue with
Choice A.  With one additional line of code, a user application could
get Choice B, with its ability for example to request MPOL_INTERLEAVE
over all cpuset allowed nodes, where the kernel automatically adapts
that to changing cpuset changes from larger 'mems' to smaller 'mems'
and back to larger 'mems' again.

It would mean you would have to make a change to your applications
to get this improved interleaving.  But I trust from all you've been
advocating that such a code change and rebuild would not be any problem
for whatever situation you're concerned with.

We could recommend that new code probe to see if Choice B is available
and prefer it if it is.   At some future time, we might deprecate and
eventually remove Choice A.

I appreciate that you don't want to leave in place the complications
of dual Choices, but I lack the experience, knowledge or clarity I
need to support fully changing over to Choice B at this time.

Getting Choice B out there will go a long way toward providing us
with the feedback we will need to guide future decisions.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  4:27                                             ` Paul Jackson
@ 2007-10-29  4:47                                               ` David Rientjes
  2007-10-29  5:45                                                 ` Paul Jackson
  2007-10-29  7:15                                                 ` Paul Jackson
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-29  4:47 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Sun, 28 Oct 2007, Paul Jackson wrote:

> And, unless someone in the know tells us otherwise, I have to assume
> that this could break them.  Now, the odds are that they simply don't
> run that solution stack on any system making active use of cpusets,
> so the odds are this would be no problem for them.  But I don't
> presently have enough knowledge of their situation to take that risk.
> 

If we can't identify any applications that would be broken by this, what's 
the difference in simply implementing Choice B and then, if we hear 
complaints, add your hack to revert back to Choice A behavior based on the 
get_mempolicy() call you specified is always part of libnuma?

The problem that I see with immediately offering both choices is that we 
don't know if anybody is actually reverting back to Choice A behavior 
because libnuma, by default, would use it.  That's going to make it very 
painful to remove later because we've supported both options and have made 
libnuma and {get,set}_mempolicy() arguments ambiguous.  We should only 
support both choices if they will both be used and there's no hard 
evidence to suggest that at this point.

> But dual support is pretty easy so far as the kernel code is concerned.
> It's just a few nodes_remap() calls optionally invoked at a few key
> spots in mm/mempolicy.c.  Consequently there won't be a big hurry to
> remove Choice A.
> 

You earlier insisted on an ease of documentation for the MPOL_INTERLEAVE 
case and now this dual support that you're proposing is going to make the 
documentation very difficult to understand for anyone who simply wants to 
use mempolicies.

Others even in this thread have had a hard enough time understanding the 
difference between the two choices and you explained them very thoroughly.  
It's going to be much more trouble than it's worth, I predict.

> There is no "_then_ attach the task to a cpuset."  On systems with
> kernels configured with CONFIG_CPUSETS=y, all tasks are in a cpuset
> all the time.  Moreover, from a practical point of view, on large
> systems managed with cpuset based mechanisms, almost all tasks are in
> cpusets that do not include all nodes, for the entire life of the task.
> 

And that application would need to be implemented to know the nodes that 
it has access to before it issues its set_mempolicy(MPOL_PREFERRED) 
command anyway if it truly uses Choice A behavior.  So unless these tasks 
are looking in /proc/pid/status and parsing Mems_allowed and then 
specifying one as its preferred node or always being guaranteed a certain 
set of nodes that they are always attached to in a cpuset so they have 
such foresight of what node to prefer, Choice A can't possibly be what 
they want.

> > Yet the 'mems' file would still be system-wide; otherwise it would be 
> > impossible to expand the memory your cpuset has access to.
> 
> I had to read that a couple of times to make sense of it.  I take that
> it means that the node numbering used in each cpuset's 'mems' file has
> to be system-wide.  Yes, agreed.
> 
> (Well, actually, the node numbering of each cpusets 'mems' file could
> be relative to its parent cpusets 'mem' numbers, but let's not go
> there, as this discussion is already sufficiently complicated ;)
> 

I appreciate that very much.

> Would it meet the need that prompted your initial patch set if we
> added Choice B memory policy node numbering, but left Choice A as the
> kernel default, with a per-task option (perhaps invokable by a new
> option to one of the {get,set}_mempolicy() calls) to choose Choice B?
> 

The needs I was addressing with my initial patchset was so that when a 
cpuset is expanded, any MPOL_INTERLEAVE memory policy of attached tasks 
automatically get expanded as well.  This discussion has somewhat diverged 
from that, but I hope you still support what we earlier talked about in 
terms of adding a field to struct mempolicy to remember the intended 
nodemask the application asked to interleave over.

> This lets us get Choice B out there, and lets the two main libraries,
> libnuma and libcpuset, dynamically adapt to whichever Choice is active
> for the current task.
> 
> Unchanged applications and existing binaries would simply continue with
> Choice A.  With one additional line of code, a user application could
> get Choice B, with its ability for example to request MPOL_INTERLEAVE
> over all cpuset allowed nodes, where the kernel automatically adapts
> that to changing cpuset changes from larger 'mems' to smaller 'mems'
> and back to larger 'mems' again.
> 

You don't actually need to choose between the two choices for adapting 
MPOL_INTERLEAVE over _all_ allowed cpuset nodes.

I thought what we agreed upon and what you were going to implement was 
adding a nodemask_t to struct mempolicy for the intended nodemask of the 
memory policy and then AND it with pol->cpuset_mems_allowed.  That 
completely satisfies my needs and my applications that want to allocate 
over all available nodes (by simply passing numa_all_nodes to 
set_mempolicy(MPOL_INTERLEAVE)).  If I wanted to interleave only over a 
subset, the choices would matter.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  4:47                                               ` David Rientjes
@ 2007-10-29  5:45                                                 ` Paul Jackson
  2007-10-29  7:00                                                   ` David Rientjes
  2007-10-29  7:15                                                 ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-29  5:45 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> If we can't identify any applications that would be broken by this, what's 
> the difference in simply implementing Choice B and then, if we hear 
> complaints, add your hack to revert back to Choice A behavior based on the 
> get_mempolicy() call you specified is always part of libnuma?

I'll probably reply to other parts of your message later, but this
one catches my eye right now.

"if we hear complaints, add your hack ... back"  -- this doesn't seem
like a good idea to me.  Maybe inside Google you don't see it, but
for those of us shipping computer systems using major distributions
such as SUSE or Red Hat, there can be a year lag between when I send a
feature patch to Andrew, and when my customers send their first
feedback to me resulting from using that new feature.

There are ways to expedite fixes for specific situations, of course,
but in general, this is rather like sending out a deep space probe.
You have to conservatively cover your options pre-launch, because
post-launch repairs are costly, slow and limited.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  5:45                                                 ` Paul Jackson
@ 2007-10-29  7:00                                                   ` David Rientjes
  2007-10-29  7:26                                                     ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-29  7:00 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Sun, 28 Oct 2007, Paul Jackson wrote:

> > If we can't identify any applications that would be broken by this, what's 
> > the difference in simply implementing Choice B and then, if we hear 
> > complaints, add your hack to revert back to Choice A behavior based on the 
> > get_mempolicy() call you specified is always part of libnuma?
> 
> I'll probably reply to other parts of your message later, but this
> one catches my eye right now.
> 
> "if we hear complaints, add your hack ... back"  -- this doesn't seem
> like a good idea to me.  Maybe inside Google you don't see it, but
> for those of us shipping computer systems using major distributions
> such as SUSE or Red Hat, there can be a year lag between when I send a
> feature patch to Andrew, and when my customers send their first
> feedback to me resulting from using that new feature.
> 

Let's add a Choice C:

	Any nodemask that is passed to set_mempolicy() is saved as the
	intent of the application in struct mempolicy.  All policies
	are effected on a contextualized per-allocation basis.

	Policies such as MPOL_INTERLEAVE always get AND'd with 
	pol->cpuset_mems_allowed.  If that yields numa_no_nodes,
	MPOL_DEFAULT is used instead.

	Policies such as MPOL_PREFERRED are respected if the node is set
	in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT is used.	

	If an application attempts to setup a memory policy for an
	MPOL_PREFERRED node that it doesn't have access to or an
	MPOL_INTERLEAVE nodemask that is empty when AND'd with
	pol->cpuset_mems_allowed, -EINVAL is returned and no new policy
	is effected.

	If an application gains nodes in pol->cpuset_mems_allowed that
	now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED,
	that policy is then effected once again.  Otherwise,
	MPOL_DEFAULT is still used.

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  4:47                                               ` David Rientjes
  2007-10-29  5:45                                                 ` Paul Jackson
@ 2007-10-29  7:15                                                 ` Paul Jackson
  2007-10-30 23:12                                                   ` David Rientjes
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-29  7:15 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> The problem that I see with immediately offering both choices is that we 
> don't know if anybody is actually reverting back to Choice A behavior 
> because libnuma, by default, would use it.  That's going to make it very 
> painful to remove later.

Yes, that's a problem.  I would rather end up with both Choices
forever, than breaking stuff because we changed how memory policy
nodes are numbered.

> We should only 
> support both choices if they will both be used and there's no hard 
> evidence to suggest that at this point.

No.  We could only remove Choice A if we had hard evidence that
it wouldn't break things, especially for the libnuma-Oracle stack.

Either way, we obviously have to decide this lacking sufficient
hard evidence.  Changing memory policy node numbering is just way
too likely to break things, in ways that users initially find
difficult to diagnose.  We -can-not- inflict that on our users
in a single, sudden, change.  We must stage it, starting by
adding the new.

> You earlier insisted on an ease of documentation for the MPOL_INTERLEAVE 
> case and now this dual support that you're proposing is going to make the 
> documentation very difficult to understand for anyone who simply wants to 
> use mempolicies.

Yup - that's a problem.  But it is one that users can control.
If they just continue using memory policies and libnuma as before,
it continues to work as before.  If they need to deal with situations
in which applications using memory policies are being moved around
between larger and smaller cpusets, and they are willing and able to
modify and improve the part of their code that handles memory policies,
then they can read the new section of the documentation about this
improved cpuset-relative node numbering, and give it a try.

Blind siding users with a unilateral change like this will leave
orphaned bits gasping in agony on the computer room floor.  It can
sometimes take months of elapsed time and hundreds of hours of various
peoples time across a dozen departments in three to five corporations
to track down the root cause of such a problem, from the point of the
initial failure, back to the desk of someone like you or me.  And then
it can take tens or hundreds more hours of human effort to deliver a
fix.  I refuse to knowingly go down that road.

I will not agree to suddenly replacing Choice A with Choice B.


> And that application would need to [...] Choice A can't possibly
> be what they want.

People do this sort of stuff all the time; they just don't realize
what all is going on beneath the surface of the various tools,
libraries, scripts and magic incantations that they cobble together to
meet their needs.

Choice A is meeting most of our needs.  Not until you brought
up this case of MPOL_INTERLEAVE across the nodes of a job being
moved between varying size cpusets did it prove inadequate.


> I hope you still support what we earlier talked about in 
> terms of adding a field to struct mempolicy to remember the intended 
> nodemask the application asked to interleave over.

Yes - that's a key element of the Choice B implementation.

> I thought what we agreed upon and what you were going to implement was 
> adding a nodemask_t to struct mempolicy for the intended nodemask of the 
> memory policy and then AND it with pol->cpuset_mems_allowed.

Not "AND".  Fold - the n-th bit is set in a task's mems_allowed iff
there exists m such that (m % w) == n, and such that the m-th bit is
set in the task's mempolicy's remembered nodemask, where w is the weight
(number of '1' bits) in the task's current cpuset's mems_allowed. See
lib/bitmap.c:bitmap_remap(), and its wrapper nodes_remap() for the
implementation.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  7:00                                                   ` David Rientjes
@ 2007-10-29  7:26                                                     ` Paul Jackson
  2007-10-30 22:53                                                       ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-29  7:26 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

> Let's add a Choice C:
> 
>     Any nodemask that is passed to set_mempolicy() is saved as
>     the intent of the application in struct mempolicy.

Yes

>     All policies are effected on a contextualized per-allocation
>     basis.

"contextualized" - I guess that means converted to cpuset
relative numbering - yes.

"per-allocation" - Most of the calculation of nodemasks and
zonelists is done when memory policies change.

>     Policies such as MPOL_INTERLEAVE always get AND'd with
>     pol->cpuset_mems_allowed.

Not AND'd - Folded, as in bitmap_remap().

>     If that yields numa_no_nodes, MPOL_DEFAULT is used instead.

Not an issue with Folding.

>     Policies such as MPOL_PREFERRED are respected if the node
>     is set in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT
>     is used.

Not an issue with Folding.

>     If an application attempts to setup a memory policy for
>     an MPOL_PREFERRED node that it doesn't have access to or
>     an MPOL_INTERLEAVE nodemask that is empty when AND'd with
>     pol->cpuset_mems_allowed, -EINVAL is returned and no new
>     policy is effected.

Not issues with Folding.

>     If an application gains nodes in pol->cpuset_mems_allowed that
>     now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED,
>     that policy is then effected once again.  Otherwise,
>     MPOL_DEFAULT is still used.

Not issues with Folding.

With folding, an application that laid out an elaborate memory
policy configuration covering say 16 nodes can run in a 4 node
cpuset, where whatever would have been on node N gets folded down
to node N % 4.

With AND'ing, such an application would find 3/4's of its fancy
memory policy configuration replaced with MPOL_DEFAULT and -EINVAL
fallbacks.


-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:37                           ` Christoph Lameter
@ 2007-10-29 15:00                             ` Lee Schermerhorn
  2007-10-29 17:33                               ` Paul Jackson
  2007-10-29 20:35                               ` Christoph Lameter
  0 siblings, 2 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 15:00 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel, Mel Gorman

[-- Attachment #1: Type: text/plain, Size: 2442 bytes --]

On Fri, 2007-10-26 at 14:37 -0700, Christoph Lameter wrote:
> On Fri, 26 Oct 2007, Lee Schermerhorn wrote:
> 
> > > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a
> > > > pointer to something stable, it might be a win.
> > > 
> > > The memory policies are already shared and have refcounters for that 
> > > purpose.
> > 
> > I must have missed that in the code I'm reading :)
> 
> What is the benefit of having pointers to nodemasks? We likely would need 
> to have refcounts in those nodemasks too? So we duplicate a lot of 
> the characteristics of memory policies?

Hi, Christoph:

removing the nodemasks from the mempolicy and allocating them only when
needed is something that you and Mel and I discussed last month, in the
context of Mel's "one zonelist filtered by nodemask" patches.  I just
put together the dynamic nodemask patch [included below FYI, NOT for
serious consideration] to see what it looked like and whether it helped.
Conclusion:  it's ugly/complex [especially trying to keep the nodemasks
embedded for systems that don't require > a pointer's worth of bits] and
they probably don't help much if most uses of non-default mempolicy
requires a nodemask.

I only brought it up again because now you all are considering another
nodemask per policy.  In fact, I only considered it in the first place
because nodemasks on our [HP's] platform don't require more than a
pointer's worth of bits [today, at least--I don't know about future
plans].  However, since we share an arch--ia64-with SGI and distros
don't want to support special kernels for different vendors, if they can
avoid it, we have 1K-bit nodemasks.   Since this is ia64 we're talking
about, most folks don't care.  Now that you're going to do the same for
x86_64, it might become more visible.  Then again, maybe there are few
enough mempolicy structs that no-one will care anyway.

Note:  I don't [didn't] think I need to ref count the nodemasks
associated with the mempolicies because they are allocated when the
mempolicy is and destroyed when the policy is--not shared.  Just like
the custom zonelist for bind policy, and we have no ref count there.
I.e., they're protected by the mempol's ref.  However, now that you
bring it up, I'm wondering about the effects of policy remapping, and
whether we have the reference counting or indirect protection [mmap_sem,
whatever] correct there in current code.  I'll have to take a look.

Lee

[-- Attachment #2: dynamically-allocate-mempolicy-nodemasks.patch --]
[-- Type: text/x-patch, Size: 10131 bytes --]

PATCH/RFC Memory Policy:  dynamically allocate policy nodemaps

Something that Christoph Lameter, Mel Gorman and I discussed.
Once Mel's "one zonelist" patches go in, we no longer have the
MPOL_BIND custom zonelist in the mempolicy struct.  However, we
still have 2 nodemask_t's that can get quite large:  one in the
union with the preferred_node, and one for the cpuset current
mems allowed.  Note that this 2nd nodemask_t does NOT depend on
whether or not cpusets are configured.

So, on an ia64 platform with NODES_SHIFT configured at 8 [256 nodes],
this results in a nodemask_t size of 32 bytes and a mempolicy size
of 72 bytes.  Real soon now, we'll be seeing x86_64 platforms with
hundreds of nodes.  Indirect/dynamically allocated policy nodemasks
can reduce this size, for policies that don't need them, to:
TODO

[Some distros ship with NODES_SHIFT=10 => 128 byte nodemasks.]

However, on platforms and configurations where BITS_PER_LONG >=
(1 << NODES_SHIFT),  indirect policy nodemasks are unnecessary 
overhead, as the maximum number of nodes will fit in a pointer's
worth of data.

So, this patch implements indirect, dynamically allocated nodemasks
for memory policies  when:  BITS_PER_LONG < (1 << NODES_SHIFT).

etc...

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@hp.com>

 include/linux/mempolicy.h |   20 +++++-
 mm/mempolicy.c            |  134 ++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 135 insertions(+), 19 deletions(-)

Index: Linux/include/linux/mempolicy.h
===================================================================
--- Linux.orig/include/linux/mempolicy.h	2007-10-02 14:11:20.000000000 -0400
+++ Linux/include/linux/mempolicy.h	2007-10-02 16:07:43.000000000 -0400
@@ -47,6 +47,12 @@ struct mm_struct;
 
 #ifdef CONFIG_NUMA
 
+#if BITS_PER_LONG < (1 << NODES_SHIFT)
+#define INDIRECT_POLICY_NODEMASK 1
+#else
+#define INDIRECT_POLICY_NODEMASK 0
+#endif
+
 /*
  * Describe a memory policy.
  *
@@ -71,11 +77,19 @@ struct mempolicy {
 	atomic_t refcnt;
 	short policy; 	/* See MPOL_* above */
 	union {
-		short 		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave/bind */
+		short 		preferred_node;	/* preferred */
+#if INDIRECT_POLICY_NODEMASK
+		nodemask_t	*nodes;		/* interleave/bind */
+#else
+		nodemask_t	nodes;		/* interleave/bind */
+#endif
 		/* undefined for default */
 	} v;
-	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
+#if INDIRECT_POLICY_NODEMASK
+	nodemask_t *cpuset_mems_allowed; /* mempolicy relative to these nodes */
+#else
+	nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
+#endif
 };
 
 /*
Index: Linux/mm/mempolicy.c
===================================================================
--- Linux.orig/mm/mempolicy.c	2007-10-02 14:12:35.000000000 -0400
+++ Linux/mm/mempolicy.c	2007-10-02 17:18:24.000000000 -0400
@@ -100,6 +100,7 @@
 
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
+static struct kmem_cache *nodemask_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
@@ -158,10 +159,68 @@ static int is_valid_nodemask(nodemask_t 
 	return 0;
 }
 
+#if INDIRECT_POLICY_NODEMASK
+/*
+ * mempolicy operations for indirect policy nodemasks
+ * operate on pointers to nodemask_t pointers in mempolicy struct
+ */
+static int set_policy_nodemask(nodemask_t **pol_nodes, nodemask_t *nodes)
+{
+	**pol_nodes = *nodes;
+	return 0;
+}
+
+static int new_policy_nodemask(nodemask_t **pol_nodes, nodemask_t *nodes)
+{
+	*pol_nodes = kmem_cache_alloc(nodemask_cache, GFP_KERNEL);
+	if (!*pol_nodes)
+		return -ENOMEM;
+	return set_policy_nodemask(pol_nodes, nodes);
+}
+
+static nodemask_t *policy_nodemask_ref(nodemask_t **pol_nodes)
+{
+	return *pol_nodes;
+}
+
+static void free_policy_nodemask(nodemask_t **pol_nodes)
+{
+	kmem_cache_free(nodemask_cache, *pol_nodes);
+}
+
+#else
+
+/*
+ * mempolicy operations for embedded policy nodemasks
+ * operate on pointers to nodemasks embedded in mempolicy structs
+ */
+static int set_policy_nodemask(nodemask_t *pol_nodes, nodemask_t *nodes)
+{
+	*policy->v.nodes = *nodes;
+	return 0;
+}
+
+static int new_policy_nodemask(nodemask_t *pol_nodes, nodemask_t *nodes)
+{
+	return set_policy_nodemask(pol_nodes, nodes);
+}
+
+static nodemask_t *policy_nodemask_ref(nodemask_t *pol_nodes)
+{
+	return pol_nodes;
+}
+
+static void free_policy_nodemask(nodemask_t *pol_nodes)
+{
+}
+#endif
+
 /* Create a new policy */
 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
+	nodemask_t mems_allowed;
+	int ret;
 
 	pr_debug("setting mode %d nodes[0] %lx\n",
 		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -175,14 +234,18 @@ static struct mempolicy *mpol_new(int mo
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		policy->v.nodes = *nodes;
+		ret = new_policy_nodemask(&policy->v.nodes, nodes);
+		if (ret)
+			return ERR_PTR(ret);
 		if (nodes_weight(*nodes) == 0) {
 			mode |= MPOL_CONTEXT;
 			break;
 		}
-		nodes_and(policy->v.nodes, policy->v.nodes,
+		nodes_and(*policy_nodemask_ref(&policy->v.nodes),
+				*policy_nodemask_ref(&policy->v.nodes),
 					node_states[N_HIGH_MEMORY]);
-		if (nodes_weight(policy->v.nodes) == 0) {
+		if (nodes_weight(*policy_nodemask_ref(&policy->v.nodes)) == 0) {
+			free_policy_nodemask(&policy->v.nodes);
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
 		}
@@ -197,11 +260,21 @@ static struct mempolicy *mpol_new(int mo
 			kmem_cache_free(policy_cache, policy);
 			return ERR_PTR(-EINVAL);
 		}
-		policy->v.nodes = *nodes;
+		ret = new_policy_nodemask(&policy->v.nodes, nodes);
+		if (ret)
+			return ERR_PTR(ret);
 		break;
 	}
 	policy->policy = mode;
-	policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
+
+//TODO:  I ought to be able to figure out how to reference the
+//       mems_allowed [current task's or node_possible_map] w/o
+//       allocating a new copy.  Need more helper function?
+	mems_allowed = cpuset_mems_allowed(current);
+	ret = new_policy_nodemask(&policy->cpuset_mems_allowed,
+					 		&mems_allowed);
+	if (ret)
+		return ERR_PTR(ret);
 	return policy;
 }
 
@@ -464,7 +537,7 @@ static nodemask_t *get_interleave_nodes(
 	if (unlikely(p->policy & MPOL_CONTEXT))
 		return &cpuset_current_mems_allowed;
 
-	return &p->v.nodes;
+	return policy_nodemask_ref(&p->v.nodes);
 }
 
 /* Set the process memory policy */
@@ -1135,8 +1208,8 @@ static inline nodemask_t *nodemask_polic
 	/* Lower zones don't get a nodemask applied for MPOL_BIND */
 	if (unlikely(policy->policy == MPOL_BIND &&
 			gfp_zone(gfp) >= policy_zone &&
-			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)))
-		return &policy->v.nodes;
+			cpuset_nodemask_valid_mems_allowed(policy_nodemask_ref(&policy->v.nodes))))
+		return policy_nodemask_ref(&policy->v.nodes);
 
 	return NULL;
 }
@@ -1200,8 +1273,9 @@ unsigned slab_node(struct mempolicy *pol
 		struct zoneref *z;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
 		zonelist = &NODE_DATA(numa_node_id())->node_zonelist;
-		z = first_zones_zonelist(zonelist, &policy->v.nodes,
-							highest_zoneidx);
+		z = first_zones_zonelist(zonelist,
+					policy_nodemask_ref(&policy->v.nodes),
+					highest_zoneidx);
 		return zonelist_node_idx(z);
 	}
 
@@ -1421,7 +1495,25 @@ struct mempolicy *__mpol_copy(struct mem
 		nodemask_t mems = cpuset_mems_allowed(current);
 		mpol_rebind_policy(old, &mems);
 	}
-	*new = *old;
+
+	/*
+	 * need to copy members explicitly to handle indirect vs
+	 * embedded nodemasks
+	 */
+	new->policy = old->policy;
+	switch (policy_mode(old)) {
+	case MPOL_PREFERRED:
+		new->v.preferred_node = old->v.preferred_node;
+		break;
+
+	case MPOL_BIND:
+		/*FALL THROUGH*/
+	case MPOL_INTERLEAVE:
+		new_policy_nodemask(&new->v.nodes,
+					policy_nodemask_ref(&old->v.nodes));
+	}
+	new_policy_nodemask(&new->cpuset_mems_allowed,
+				policy_nodemask_ref(&old->cpuset_mems_allowed));
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
@@ -1441,7 +1533,9 @@ int __mpol_equal(struct mempolicy *a, st
 		 * works for MPOL_BIND as it shouldn't have MPOL_CONTEXT set
 		 */
 		return a->policy & MPOL_CONTEXT ||
-				nodes_equal(a->v.nodes, b->v.nodes);
+				nodes_equal(
+					*policy_nodemask_ref(&a->v.nodes),
+					*policy_nodemask_ref(&b->v.nodes));
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
 	default:
@@ -1455,6 +1549,8 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
+	free_policy_nodemask(&p->v.nodes);
+	free_policy_nodemask(&p->cpuset_mems_allowed);
 	kmem_cache_free(policy_cache, p);
 }
 
@@ -1646,7 +1742,8 @@ int mpol_set_shared_policy(struct shared
 	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
-		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
+		 npol ? nodes_addr(*policy_nodemask_ref(&npol->v.nodes))[0] :
+			-1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1694,6 +1791,10 @@ void __init numa_policy_init(void)
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL);
 
+	if (INDIRECT_POLICY_NODEMASK)
+		nodemask_cache = kmem_cache_create("nodemask",
+					     sizeof(nodemask_t),
+					     0, SLAB_PANIC, NULL);
 	/*
 	 * Set interleaving policy for system init. Interleaving is only
 	 * enabled across suitably sized nodes (default is >= 16MB), or
@@ -1738,7 +1839,7 @@ static void mpol_rebind_policy(struct me
 
 	if (!pol)
 		return;
-	mpolmask = &pol->cpuset_mems_allowed;
+	mpolmask = policy_nodemask_ref(&pol->cpuset_mems_allowed);
 	if (nodes_equal(*mpolmask, *newmask))
 		return;
 
@@ -1746,8 +1847,9 @@ static void mpol_rebind_policy(struct me
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
-		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
-		pol->v.nodes = tmp;
+		nodes_remap(tmp, *policy_nodemask_ref(&pol->v.nodes),
+				 *mpolmask, *newmask);
+		set_policy_nodemask(&pol->v.nodes, &tmp);
 		*mpolmask = *newmask;
 		current->il_next = node_remap(current->il_next,
 						*mpolmask, *newmask);

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-26 21:39                           ` David Rientjes
  2007-10-27  1:07                             ` Paul Jackson
@ 2007-10-29 15:10                             ` Lee Schermerhorn
  2007-10-29 18:41                               ` Paul Jackson
  2007-10-30 22:57                               ` David Rientjes
  1 sibling, 2 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 15:10 UTC (permalink / raw)
  To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Fri, 2007-10-26 at 14:39 -0700, David Rientjes wrote:
> On Fri, 26 Oct 2007, Lee Schermerhorn wrote:
> 
> > So, you pass the subset, you don't set the flag to indicate you want
> > interleaving over all available.  You must be thinking of some other use
> > for saving the subset mask that I'm not seeing here.  Maybe restoring to
> > the exact nodes requested if they're taken away and then re-added to the
> > cpuset?
> > 
> 
> Paul's motivation for saving the passed nodemask to set_mempolicy() is so 
> that the _intent_ of the application is never lost.  That's the biggest 
> advantage that this method has and that I totally agree with.  So whenever 
> the mems_allowed of a cpuset changes, the MPOL_INTERLEAVE nodemask of all 
> attached tasks becomes their intent (pol->passed_nodemask) AND'd with the 
> new mems_allowed.  That can be done on mpol_rebind_policy() and shouldn't 
> be an extensive change.
> 
> So MPOL_INTERLEAVE, and possibly other, mempolicies will always try to 
> accomodate the intent of the application but only as far as the task's 
> cpuset restriction allows them.
> 
> 		David

Maybe it's just me, but I think it's pretty presumptuous to think we can
infer the intent of the application from the nodemask w/o additional
flags such as Christoph proposed [cpuset relative]--especially for
subsets of the cpuset.  E.g., the application could intend the nodemask
to specify memories within a certain distance of a physical resource,
such as where a particular IO adapter or set thereof attach to the
platform.  

And even when the intent is to preserve the cpuset relative positions of
the nodes in the nodemask, this really only makes sense if the original
and modified cpusets have the same physical topology w/rt multi-level
NUMA interconnects.  This is something that has bothered me about
dynamic cpusets and current policy remapping.  We don't do a good job of
explaining the implications of changing cpuset topology on applications,
nor do we handle it very well in the code.  Paul addresses one of my
concerns in a later message in this thread, so I'll comment there.

Later,
Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 19:16             ` David Rientjes
@ 2007-10-29 16:23               ` Lee Schermerhorn
  2007-10-29 17:35                 ` Andi Kleen
  2007-10-29 19:35                 ` Paul Jackson
  0 siblings, 2 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 16:23 UTC (permalink / raw)
  To: David Rientjes
  Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson,
	linux-kernel

On Sat, 2007-10-27 at 12:16 -0700, David Rientjes wrote:
> On Fri, 26 Oct 2007, David Rientjes wrote:
> 
> > Hacking and requiring an updated version of libnuma to allow empty 
> > nodemasks to be passed is a poor solution; if mempolicy's are supposed to 
> > be independent from cpusets, then what semantics does an empty nodemask 
> > actually imply when using MPOL_INTERLEAVE?  To me, it means the entire 
> > set_mempolicy() should be a no-op, and that's exactly how mainline 
> > currently treats it _as_well_ as libnuma.  So justifying this change in 
> > the man page is respectible, but passing an empty nodemask just doesn't 
> > make sense.
> > 
> 
> Another reason that passing an empty nodemask to set_mempolicy() doesn't 
> make sense is that libnuma uses numa_set_interleave_mask(&numa_no_nodes)
> to disable interleaving completely.
> 

David:  as we discussed when you contacted me off-list about this, the
libnuma API and the system call interface are two quite different APIs.
For example,  numa_set_interleave_mask(&numa_no_nodes) does not pass
MPOL_INTERLEAVE with an empty mask to set_mempolicy().  Rather it
"installs" an MPOL_DEFAULT policy which internally just deletes the
task's mempolicy, allowing fallback to system default policy.  I would
not propose to change this behavior, nor break libnuma in any way.

For other, who weren't involved in the off-list exchange, here's an
excerpt from my response to David:

[
At the libnuma level, I think we need an explicit
"numa_set_interleave_allowed()"--analogous to "numa_set_localalloc()".

The current "numa_alloc_interleaved()" should, I think, allocate on all
*allowed* nodes, rather than all nodes.  It can do this using the sys
call interface as defined.

Independent of cpuset-independent interleave, an application needs to
pass a valid subset of the current mems allowed to
"numa_alloc_interleaved_subset()".   An application can now obtain the
mems_allowed using the MPOL_F_MEMS_ALLOWED flag that I added, but we
need a libnuma wrapper for this as well.  [Yeah, this info can change at
any time, but that's always been the case....]

"numa_interleave_memory()" is essentially mbind(), I think [not looking
at the libnuma source code at this moment].  Maybe provide
"numa_interleave_memory_allowed(void *mem, size_t size)" ???

Finally, I think we need to add a query function:  
"nodemask_t numa_get_mems_allowed()" to return the mask of valid nodes
in the current context [cpuset].  This would just be a wrapper around
get_mempolicy() with the MPOL_F_MEMS_ALLOWED flag.
]

Couple of comments on the above:

1. "the sys call interface as defined" in the 2nd paragraph of the
excerpt refers to my patch that uses null/empty nodemask to indicate "all
allowed".

2.  As this thread progresses, you've discussed relaxing the requirement
that applications pass a valid subset of mems_allowed.  I.e., something
that was illegal becomes legal.  An API change, I think.  But, a
backward compatible one, so that's OK, right? :-)

3. If we do change the semantics of the mempolicy system calls to allow
nodes outside of the cpuset, then maybe we don't need to query the mems
allowed.  I still find it useful, but not absolutely necessary--e.g., to
construct a nodemask that will be acceptable in the current cpuset.

4. I looked at libnuma source.  numa_interleave_memory() does use
mbind() which, again, does not complain about nodemasks that include
non-allowed nodes.

Another thing occurs to me:  perhaps numactl would need an additional
'nodes' specifier such as 'allowed'.  Alternatively, 'all' could be
redefined to mean 'all allowed'.  This is independent of how you specify
'all allowed' to the system call.

Regards,
Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-27 23:19                                     ` Paul Jackson
  2007-10-28 18:19                                       ` David Rientjes
@ 2007-10-29 16:54                                       ` Lee Schermerhorn
  2007-10-29 19:40                                         ` Paul Jackson
                                                           ` (3 more replies)
  1 sibling, 4 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 16:54 UTC (permalink / raw)
  To: Paul Jackson; +Cc: David Rientjes, clameter, akpm, ak, linux-kernel

On Sat, 2007-10-27 at 16:19 -0700, Paul Jackson wrote:
> David wrote:
> > I think there's a mixup in the flag name [MPOL_MF_RELATIVE] there
> 
> Most likely.  The discussion involving that flag name was kinda mixed up ;).
> 
> > but I actually would recommend against any flag to effect Choice A.
> > It's simply going to be too complex to describe and is going to be a
> > headache to code and support. 
> 
> While I am sorely tempted to agree entirely with this, I suspect that
> Christoph has a point when he cautions against breaking this kernel API.
> 
> Especially for users of the set/get mempolicy calls coming in via
> libnuma, we have to be very careful not to break the current behaviour,
> whether it is documented API or just an accident of the implementation.
> 
> There is a fairly deep and important stack of software, involving a
> well known DBMS product whose name begins with 'O', sitting on that
> libnuma software stack.  Steering that solution stack is like steering
> a giant oil tanker near shore.  You take it slow and easy, and listen
> closely to the advice of the ancient harbor master.  The harbor masters
> in this case are or were Andi Kleen and Christoph Lameter.
> 
> > It's simply going to be too complex to describe and is going to be a
> > headache to code and support.
> 
> True, which is why I am hoping we can keep this modal flag, if such be,
> from having to be used on every set/get mempolicy call.  The ordinary
> coder of new code using these calls directly should just see Choice B
> behaviour.  However the user of libnuma should continue to see whatever
> API libnuma supports, with no change whatsoever, and various versions of
> libnuma, including those already shipped years ago, must continue to
> behave without any changes in node numbering.

If most apps use libnuma APIs instead of directly calling the sys calls,
libnuma could query something as simple as an environment variable, or a
new flag to get_mempolicy(), or the value of a file in its current
cpuset--but I'd like to avoid a dependency on libcpuset--to determine
whether to implement "new" semantics.

> 
> There are two decent looking ways (and some ugly ways) that I can see
> to accomplish this:
> 
>  1) One could claim that no important use of Oracle over libnuma over
>     these memory policy calls is happening on a system using cpusets.
>     There would be a fair bit of circumstantial evidence for this
>     claim, but I don't know it for a fact, and would not be the
>     expert to determine this.  On systems making no use of cpusets,
>     these two Choices A and B are identical, and this is a non-issue.
>     Those systems will see no API changes whatsoever from any of this.

I'd certainly like to hear from Oracle what libnuma features they use
and their opinion of the changes being discussed here.

> 
>  2) We have a per-task mode flag selecting whether Choice A or B
>     node numbering apply to the masks passed in to set_mempolicy.
> 
>     The kernel implementation is fairly easy.  (Yeah, I know, I
>     too cringe everytime I read that line ;)
> 
>     If Choice A is active, then we continue to enforce the current
>     check, in mm/mempolicy.c: contextualize_policy(), that the passed
>     in mask be a subset of the current cpusets allowed memory nodes,
>     and we add a call to nodes_remap(), to map the passed in nodemask
>     from cpuset centric to system centric (if they asked for the
>     fourth node in their current cpuset, they get the fourth node in
>     the entire system.)
> 
>     Similarly, for masks passed back by get_mempolicy, if Choice A
>     is active, we use nodes_remap() to convert the mask from system
>     centric back to cpuset centric.
> 
>     There is a subtle change in the kernel API's here:
> 
> 	In current kernels, which are Choice A, if a task is moved from
> 	a big cpuset (many nodes) to a small cpuset and then -back-
> 	to a big cpuset, the nodemasks returned by get_mempolicy
> 	will still show the smaller masks (fewer set nodes) imposed
> 	by the smaller cpuset.
> 
> 	In todays kernels, once scrunched or folded down, the masks
> 	don't recover their original size after the task is moved
> 	back to a large cpuset.

Yeah.  This bothered me about policy remapping when I looked at it a
while back.  Worse, this behavior isn't documented as intended [or not].
I thought at the time that this could be solved by retaining the
original argument nodemask, but 1) I was worried about the size when ~1K
nodes are required to be supported and 2) it still doesn't solve the
problem of ensuring the same locality characteristics w/o a lot of
documentation about the implications of changing cpuset resources or
moving tasks between cpusets in such a way to preserve the locality
characteristics requested by the original mask.

Again, we stumble upon the notion of "intent".  If the intent is just to
spread allocations to share bandwidth, it probably doesn't matter.  If,
on the other hand, the original mask was carefully constructed, taking
into consideration the distances between the memories specified and
other resources [cpus in the cpuset, other memories in the cpuset, IO
adapter connection points, ...], there is a lot more to consider than
just preserving the cpuset relative positions of the nodes.

> 
> 	With this change, even a task asking for Choice A would,
> 	once back on a larger cpuset, again see the larger masks from
> 	get_mempolicy queries.  This is a change in the kernel API's
> 	visible to user space; but I really do not think that there
> 	is sufficient use of Oracle over libnuma on systems actively
> 	moving tasks between differing size cpusets for this to be
> 	a problem.
> 
> 	Indeed, if there was much such usage, I suspect they'd
> 	be complaining that the current kernel API was borked, and
> 	they'd be filing a request for enhancement -asking- for just
> 	this subtle change in the kernel API's here.  In other words,
> 	this subtle API change is a feature, not a bug ;)

Agreed. 

> 
>     The bulk of the kernel's mempolicy code is coded for Choice B.
> 
>     If Choice B is active, we don't enforce the subset check in
>     contextualize_policy(), and we don't invoke nodes_remap() in either
>     of the set or get mempolicy code paths.
> 
>     A new option to get_mempolicy() would query the current state of
>     this mode flag, and a new option to set_mempolicy() would set
>     and clear this mode flag.  Perhaps Christoph had this in mind
>     when he wrote in an earlier message "The alternative is to add
>     new set/get mempolicy functions."
> 
>     The default kernel API for each task would be Choice B (!).
> 
>     However, in deference to the needs of libnuma, if the following
>     call was made, this would change the mode for that task to
>     Choice A:
> 
> 	get_mempolicy(NULL, NULL, 0, 0, 0);
> 
>     This last detail above is an admitted hack.  *So far as I know*
>     it happens that all current infield versions of libnuma issue the
>     above call, as their first mempolicy query, to determine whether
>     the active kernel supports mempolicy.

In libnuma in numactl-1.0.2 that I recently grabbed off Andi's site,
numa_available() indeed issues this call.  But, I don't see any internal
calls to numa_available() [comment says all other calls undefined when
numa_available() returns an error] nor any other calls to
get_mempolicy() with all null/0 args.  So, you'd be depending on the
application to call numa_available().  However, you could define an
additional MPOL_F_* flag to get_mempolicy() that is issued in library
init code to enable new behavior--again, based on some indication that
new behavior is desired or not.

> 
>     The mode for each task would be inherited across fork, and reset
>     to Choice (B) on exec.
> 
> If we determine that we must go with a new flag bit to be passed in
> on each and every get and set mempolicy call that wants Choice B node
> numbering rather than Choice A, then I will need (1) a bottle of rum,
> and (2) a credible plan in place to phase out this abomination ;).
> 
> There would be just way too many coding errors and coder frustrations
> introduced by requiring such a flag on each and every mempolicy system
> call that wants the alternative numbering.  

Only for apps that use the sys calls directly, right?  This can be
hidden by libnuma, if all apps use that.  The "behavior switch" flag
suggested above would obviate a flag on each sys call and could also be
hidden by libnuma.  Any of these changes will require some better
documentation than we have now...

> There must be some
> international treaty that prohibits landmines capable of blowing ones
> foot off that would apply here.
> 
> There are two major user level libraries sitting on top of this API,
> libnuma and libcpuset.  Libnuma is well known; it was written by Andi
> Kleen.  I wrote libcpuset, and while it is LGPL licensed, it has not
> been publicized very well yet.  I can speak for libcpuset: it could
> adapt to the above proposal, in particular to the details in way (2),
> just fine.  Old versions of libcpuset running on new kernels will
> have a little bit of subtle breakage, but not in areas that I expect
> will cause much grief.  Someone more familiar with libnuma than I would
> have to examine the above proposal in way (2) to be sure that we weren't
> throwing libnuma some curveball that was unnecessarily troublesome.
> 

I worry more about applications that take a more physical view of the
node ids and that emphasize locality more than bandwidth spreading.  If
libnuma explicitly enables new behavior when requested, however that
might be implemented, I don't know that it would be a problem.

Lee

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 15:00                             ` Lee Schermerhorn
@ 2007-10-29 17:33                               ` Paul Jackson
  2007-10-29 17:46                                 ` Lee Schermerhorn
  2007-10-29 20:35                               ` Christoph Lameter
  1 sibling, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 17:33 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: clameter, rientjes, akpm, ak, linux-kernel, mel

Lee wrote:
> I only brought it up again because now you all are considering another
> nodemask per policy.

The patch David and I are discussing will replace the
cpuset_mems_allowed nodemask in struct mempolicy, not
add a new nodemask.  In other words, the meaning and
name of that existing nodemask will change, with no
change in the overall structure size.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:23               ` Lee Schermerhorn
@ 2007-10-29 17:35                 ` Andi Kleen
  2007-10-29 19:35                 ` Paul Jackson
  1 sibling, 0 replies; 98+ messages in thread
From: Andi Kleen @ 2007-10-29 17:35 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: David Rientjes, Christoph Lameter, Andrew Morton, Paul Jackson,
	linux-kernel


> 
> Another thing occurs to me:  perhaps numactl would need an additional
> 'nodes' specifier such as 'allowed'.  Alternatively, 'all' could be
> redefined to mean 'all allowed'.  This is independent of how you specify
> 'all allowed' to the system call.

cpuset support in libnuma/numactl is still incomplete. I'm also
not sure what the best way to handle this is.

Probably there should be a switch for both.

-Andi

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 17:33                               ` Paul Jackson
@ 2007-10-29 17:46                                 ` Lee Schermerhorn
  0 siblings, 0 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 17:46 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, rientjes, akpm, ak, linux-kernel, mel

On Mon, 2007-10-29 at 10:33 -0700, Paul Jackson wrote:
> Lee wrote:
> > I only brought it up again because now you all are considering another
> > nodemask per policy.
> 
> The patch David and I are discussing will replace the
> cpuset_mems_allowed nodemask in struct mempolicy, not
> add a new nodemask.  In other words, the meaning and
> name of that existing nodemask will change, with no
> change in the overall structure size.


Kool!

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 15:10                             ` Lee Schermerhorn
@ 2007-10-29 18:41                               ` Paul Jackson
  2007-10-29 19:01                                 ` Lee Schermerhorn
  2007-10-30 23:17                                 ` David Rientjes
  2007-10-30 22:57                               ` David Rientjes
  1 sibling, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 18:41 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> Maybe it's just me, but I think it's pretty presumptuous to think we can
> infer the intent of the application from the nodemask w/o additional
> flags such as Christoph proposed [cpuset relative]--especially for
> subsets of the cpuset.  E.g., the application could intend the nodemask
> to specify memories within a certain distance of a physical resource,
> such as where a particular IO adapter or set thereof attach to the
> platform.

Well, yes, we can't presume to know whether some application can move
or not.

But our kernel work is not presuming that.

It's providing mechanisms useful for moving apps.

The people using this decide what and when and if to move.

For example, the particular customers (HPC) I focus on for my job don't
move jobs because they don't want to take the transient performance
hit that would come from blowing out all their memory caches.

I'm guessing that David's situation involves something closer to what you
see with a shared web hosting service, running jobs that are very
independent of hardware particulars.

But in any case, we (the kernel) are just providing the mechanisms.
If they don't fit ones needs, don't use them ;).

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 18:41                               ` Paul Jackson
@ 2007-10-29 19:01                                 ` Lee Schermerhorn
  2007-10-30 23:17                                 ` David Rientjes
  1 sibling, 0 replies; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-29 19:01 UTC (permalink / raw)
  To: Paul Jackson; +Cc: rientjes, clameter, akpm, ak, linux-kernel

On Mon, 2007-10-29 at 11:41 -0700, Paul Jackson wrote:
> Lee wrote:
> > Maybe it's just me, but I think it's pretty presumptuous to think we can
> > infer the intent of the application from the nodemask w/o additional
> > flags such as Christoph proposed [cpuset relative]--especially for
> > subsets of the cpuset.  E.g., the application could intend the nodemask
> > to specify memories within a certain distance of a physical resource,
> > such as where a particular IO adapter or set thereof attach to the
> > platform.
> 
> Well, yes, we can't presume to know whether some application can move
> or not.
> 
> But our kernel work is not presuming that.
> 
> It's providing mechanisms useful for moving apps.
> 
> The people using this decide what and when and if to move.
> 
> For example, the particular customers (HPC) I focus on for my job don't
> move jobs because they don't want to take the transient performance
> hit that would come from blowing out all their memory caches.
> 
> I'm guessing that David's situation involves something closer what you
> see with a shared web hosting service, running jobs that are very
> independent of hardware particulars.
> 
> But in any case, we (the kernel) are just providing the mechanisms.
> If they don't fit ones needs, don't use them ;).
> 

I'm with you on this last point!  I was reacting to the notion that we
can infer intent from a nodemask and that preserving the cpuset relative
numbering after changing cpuset resources or moving tasks preserves that
intent--especially if it involves locality and distance considerations.
I can envision sets of such transformations on HP platforms where
locality and distance would be preserved by preserving cpuset-relative
numbering, and many where they would not.  I expect you could do the
same for SGI platforms.  I'm not opposed to what you're trying to do,
modulo complexity concerns.  And I'm not saying that the complexity is
not worth it to customers.  But, given that we are just "providing the
mechanism", I think we need to provide very good documentation on the
implications of these mechanisms vis a vis whatever
characteristics--locality, distance, bandwidth sharing, ...--the
application intends when it installs a policy.

Like you, no doubt, I'm eyeballs deep in a number of things.  At some
point, I'll take a cut at enumerating various "intents" that different
types of applications might have when using mem policies and cpusets.
Others can add to that, or may even beat me to it.   We can then
evaluate how well these scenarios are served by the current mechanisms
and by whatever changes are proposed.

I should note that I really like cpusets--i.e., find them useful--and
I'm painfully aware of the awkward interactions with mempolicy.  On the
other hand, I don't want to sacrifice mem policy capabilities to shoe
horn them into cpusets.  In fact, I want to add additional mechanisms
that may also be awkward in cpusets.  As you say, "if they don't fit
your needs, don't use them."  

Later,
Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:23               ` Lee Schermerhorn
  2007-10-29 17:35                 ` Andi Kleen
@ 2007-10-29 19:35                 ` Paul Jackson
  2007-10-29 20:36                   ` Christoph Lameter
  2007-10-29 21:08                   ` Andi Kleen
  1 sibling, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 19:35 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> 2.  As this thread progresses, you've discussed relaxing the requirement
> that applications pass a valid subset of mems_allowed.  I.e., something
> that was illegal becomes legal.  An API change, I think.  But, a
> backward compatible one, so that's OK, right? :-)

The more I have stared at this, the more certain I've become that we
need to make the mbind/mempolicy calls modal -- the default mode
continues to interpret node numbers and masks just as these calls do
now, and the alternative mode provides the so called "Choice B",
which takes node numbers and masks as if the task owned the entire
system, and then the kernel internally and automatically scrunches
those masks down to whatever happens to be the current cpuset of
the task.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:54                                       ` Lee Schermerhorn
@ 2007-10-29 19:40                                         ` Paul Jackson
  2007-10-29 19:45                                         ` Paul Jackson
                                                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 19:40 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> If most apps use libnuma APIs instead of directly calling the sys calls,
> libnuma could query something as simple as an environment variable, or a
> new flag to get_mempolicy(), or the value of a file in it's current
> cpuset--but I'd like to avoid a dependency on libcpuset--to determine
> whether to implement "new" semantics.

The patch I'm working has a new set of options to get_mempolicy to set
and get the per-task kernel state indicating whether to use the old or
new semantics.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:54                                       ` Lee Schermerhorn
  2007-10-29 19:40                                         ` Paul Jackson
@ 2007-10-29 19:45                                         ` Paul Jackson
  2007-10-29 19:57                                         ` Paul Jackson
  2007-10-29 20:02                                         ` Paul Jackson
  3 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 19:45 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> Again, we stumble upon the notion of "intent".  If the intent is just to
> spread allocations to share bandwidth, it probably doesn't matter.  If,
> on the other hand, the original mask was carefully constructed, taking
> into consideration the distances between the memories specified and
> other resources [cpus in the cpuset, other memories in the cpuset, IO
> adapter connection points, ...], there is a lot more to consider than
> just preserving the cpuset relative positions of the nodes.

Yes - as I noted in an earlier reply, the kernel just provides the
mechanisms.  It's up to user level code and people to decide whether
moving jobs around is a worthwhile activity in their situation.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:54                                       ` Lee Schermerhorn
  2007-10-29 19:40                                         ` Paul Jackson
  2007-10-29 19:45                                         ` Paul Jackson
@ 2007-10-29 19:57                                         ` Paul Jackson
  2007-10-29 20:02                                         ` Paul Jackson
  3 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 19:57 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> > 	Indeed, if there was much such usage, I suspect they'd
> > 	be complaining that the current kernel API was borked, and
> > 	they'd be filing a request for enhancement -asking- for just
> > 	this subtle change in the kernel API's here.  In other words,
> > 	this subtle API change is a feature, not a bug ;)
> 
> Agreed. 

Hmmm ... put on your thinking hat for my next comment ...

I could do one of two things in mm/mempolicy.c:
 B1) continue accepting nodemasks across the set_mempolicy and mbind
     system call APIs that are just like now (only nodes in the current
     tasks cpuset matter), but then remember what was passed in, so that
     if the tasks cpuset subsequently shrank down and then expanded
     again back to its original size, they would end up with the same
     memory policy placement they first had, or
 B2) accept nodemasks as if relative to the entire system, regardless
     of what cpuset they were in at the moment (all nodes in the system
     matter and can be specified.)

If I did B1, then that's just a subtle change in the API, and what
you agreed to above holds.

If I did B2, then that's a serious change in the way that nodes
are numbered in the nodemasks passed into mbind and set_mempolicy,
from being only nodes that happen to be in the tasks current cpuset,
to being nodes relative to all possible nodes on the system.

We need B2, I think.  Otherwise, if a job happens to be running in
a shrunken cpuset, it can't request what memory policy placement
it wants should it end up in a larger cpuset later on.  With B1, we
would continue to have the timing dependencies between when a task
is moved between different size cpusets, and when it happens to issue
mbind/set_mempolicy calls.

But B2 is an across the board change in how we number the nodes
passed into mbind and set_mempolicy.  That is in no way an upward
compatible change.

I am strongly inclined toward B2, but it must be a non-default optional
mode, at least for a while, perhaps a long while.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 16:54                                       ` Lee Schermerhorn
                                                           ` (2 preceding siblings ...)
  2007-10-29 19:57                                         ` Paul Jackson
@ 2007-10-29 20:02                                         ` Paul Jackson
  3 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 20:02 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel

Lee wrote:
> In libnuma in numactl-1.0.2 that I recently grabbed off Andi's site,
> numa_available() indeed issues this call.  But, I don't see any internal
> calls to numa_available() [comments says all other calls undefined when
> numa_available() returns an error] nor any other calls to
> get_mempolicy() with all null/0 args.  So, you'd be depending on the
> application to call numa_available().

Aha - good point.  It happened to be the numactl command line utility
that I tested with that issued the get_mempolicy(0,0,0,0,0) call.

Yup - this proposed hack, to have the kernel revert to the original
memory policy nodemask numbering if it sees such a getmempolicy call
is now officially dead meat.

Thanks.

> However, you could define an
> additional MPOL_F_* flag to get_mempolicy() that is issued in library
> init code to enable new behavior--again, based on some indication that
> new behavior is desired or not.

Yes - I am intending to define such MPOL_F_* flags, to set and get
which behavior applies to the current task.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 15:00                             ` Lee Schermerhorn
  2007-10-29 17:33                               ` Paul Jackson
@ 2007-10-29 20:35                               ` Christoph Lameter
  1 sibling, 0 replies; 98+ messages in thread
From: Christoph Lameter @ 2007-10-29 20:35 UTC (permalink / raw)
  To: Lee Schermerhorn
  Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel, Mel Gorman

On Mon, 29 Oct 2007, Lee Schermerhorn wrote:

> Note:  I don't [didn't] think I need to ref count the nodemasks
> associated with the mempolicies because they are allocated when the
> mempolicy is and destroyed when the policy is--not shared.  Just like
> the custom zonelist for bind policy, and we have no ref count there.
> I.e., they're protected by the mempol's ref.  However, now that you
> bring it up, I'm wondering about the effects of policy remapping, and
> whether we have the reference counting or indirect protection [mmap_sem,
> whatever] correct there in current code.  I'll have to take a look.

In that case we could just put the nodemask at the end of the mempolicy 
structure and then allocate the size needed? That way we would not need to 
deref an additional pointer?


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 19:35                 ` Paul Jackson
@ 2007-10-29 20:36                   ` Christoph Lameter
  2007-10-29 21:08                   ` Andi Kleen
  1 sibling, 0 replies; 98+ messages in thread
From: Christoph Lameter @ 2007-10-29 20:36 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee Schermerhorn, rientjes, akpm, ak, linux-kernel

On Mon, 29 Oct 2007, Paul Jackson wrote:

> The more I have stared at this, the more certain I've become that we
> need to make the mbind/mempolicy calls modal -- the default mode
> continues to interpret node numbers and masks just as these calls do
> now, and the alternative mode provides the so called "Choice B",
> which takes node numbers and masks as if the task owned the entire
> system, and then the kernel internally and automatically scrunches
> those masks down to whatever happens to be the current cpuset of
> the task.

Ack.


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 19:35                 ` Paul Jackson
  2007-10-29 20:36                   ` Christoph Lameter
@ 2007-10-29 21:08                   ` Andi Kleen
  2007-10-29 22:48                     ` Paul Jackson
  2007-10-30 19:47                     ` Paul Jackson
  1 sibling, 2 replies; 98+ messages in thread
From: Andi Kleen @ 2007-10-29 21:08 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee Schermerhorn, rientjes, clameter, akpm, linux-kernel

On Monday 29 October 2007 20:35:58 Paul Jackson wrote:
> Lee wrote:
> > 2.  As this thread progresses, you've discussed relaxing the requirement
> > that applications pass a valid subset of mems_allowed.  I.e., something
> > that was illegal becomes legal.  An API change, I think.  But, a
> > backward compatible one, so that's OK, right? :-)
> 
> The more I have stared at this, the more certain I've become that we
> need to make the mbind/mempolicy calls modal -- the default mode
> continues to interpret node numbers and masks just as these calls do
> now, and the alternative mode provides the so called "Choice B",
> which takes node numbers and masks as if the task owned the entire
> system, and then the kernel internally and automatically scrunches
> those masks down to whatever happens to be the current cpuset of
> the task.

So the user space asks for 8 nodes because it knows the machine
has that many from /sys and it only gets 4 if a cpuset says so? That's
just bad semantics. And is not likely to make the user programs happy.

I don't think you'll get around to teaching user space (or rather libnuma)
about cpusets and let it handle it.

From the libnuma perspective the machine size would be essentially
current cpuset size. 

On the syscall level I don't think it makes much sense to change though.

The alternative would be to throw out the complete cpuset concept and go for 
virtual nodes inside containers with virtualized /sys.

-Andi



^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 21:08                   ` Andi Kleen
@ 2007-10-29 22:48                     ` Paul Jackson
  2007-10-30 19:47                     ` Paul Jackson
  1 sibling, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-29 22:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel

> So the user space asks for 8 nodes because it knows the machine
> has that many from /sys and it only gets 4 if a cpuset says so? That's
> just bad semantics. And is not likely to make the user programs happy.

That's no different than what can happen today -- if a task actually
is in an 8 node cpuset, sets up its mempolicies accordingly, and then
gets shoe horned into a 4 node cpuset.

It's not good or bad; it's just interactions between two mechanisms.

If your app doesn't run well in a small cpuset, don't run it there
(or do run it there, poorly ;).

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 21:08                   ` Andi Kleen
  2007-10-29 22:48                     ` Paul Jackson
@ 2007-10-30 19:47                     ` Paul Jackson
  2007-10-30 20:20                       ` Lee Schermerhorn
  2007-10-30 20:27                       ` Andi Kleen
  1 sibling, 2 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-30 19:47 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel

Andi, Christoph, or whomever:

  Are there any good regression tests of mempolicy functionality?

  This patch I'm coding is delicate enough that I probably broke
  something.  It would be nice to catch it sooner rather than later.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 19:47                     ` Paul Jackson
@ 2007-10-30 20:20                       ` Lee Schermerhorn
  2007-10-30 20:26                         ` Paul Jackson
  2007-10-30 20:27                       ` Andi Kleen
  1 sibling, 1 reply; 98+ messages in thread
From: Lee Schermerhorn @ 2007-10-30 20:20 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Andi Kleen, rientjes, clameter, akpm, linux-kernel

On Tue, 2007-10-30 at 12:47 -0700, Paul Jackson wrote:
> Andi, Christoph, or whomever:
> 
>   Are there any good regression tests of mempolicy functionality?

Paul:  Andi has a regression test in the numactl source package.

Try:
	http://freshmeat.net/redir/numactl/62210/url_tgz/numactl-1.0.2.tar.gz

Lee


^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 20:20                       ` Lee Schermerhorn
@ 2007-10-30 20:26                         ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-30 20:26 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: ak, rientjes, clameter, akpm, linux-kernel

Lee wrote:
> Paul:  Andi has a regression test in the numactl source package.

Good - thanks.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 19:47                     ` Paul Jackson
  2007-10-30 20:20                       ` Lee Schermerhorn
@ 2007-10-30 20:27                       ` Andi Kleen
  1 sibling, 0 replies; 98+ messages in thread
From: Andi Kleen @ 2007-10-30 20:27 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel

On Tuesday 30 October 2007 20:47:51 Paul Jackson wrote:
> Andi, Christoph, or whomever:
> 
>   Are there any good regression tests of mempolicy functionality?

numactl has some basic tests (make test). I think newer LTP 
also has some but I haven't looked at them. And there is Lee's
memtoy which does some things; but I don't think it's very automated.

-Andi

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  7:26                                                     ` Paul Jackson
@ 2007-10-30 22:53                                                       ` David Rientjes
  2007-10-30 23:17                                                         ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-30 22:53 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Mon, 29 Oct 2007, Paul Jackson wrote:

> >     Policies such as MPOL_INTERLEAVE always get AND'd with
> >     pol->cpuset_mems_allowed.
> 
> Not AND'd - Folded, as in bitmap_remap().
> 
> >     If that yields numa_no_nodes, MPOL_DEFAULT is used instead.
> 
> Not an issue with Folding.
> 
> >     Policies such as MPOL_PREFERRED are respected if the node
> >     is set in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT
> >     is used.
> 
> Not an issue with Folding.
> 
> >     If an application attempts to setup a memory policy for
> >     an MPOL_PREFERRED node that it doesn't have access to or
> >     an MPOL_INTERLEAVE nodemask that is empty when AND'd with
> >     pol->cpuset_mems_allowed, -EINVAL is returned and no new
> >     policy is effected.
> 
> Not issues with Folding.
> 
> >     If an application gains nodes in pol->cpuset_mems_allowed that
> >     now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED,
> >     that policy is then effected once again.  Otherwise,
> >     MPOL_DEFAULT is still used.
> 
> Not issues with Folding.
> 
> With folding, an application that laid out an elaborate memory
> policy configuration covering say 16 nodes can run in a 4 node
> cpuset, where whatever would have been on node N gets folded down
> to node N % 4.
> 

Missing the point; this is an alternative to the previous choices; Choice 
C explicitly removes all remaps ("folding") from mempolicies.  The 
nodemask passed to set_mempolicy() will always have exactly one meaning: 
the system nodes that the policy is intended for.

Cpusets, which are built upon mempolicies, can obviously take access to 
some of those nodes away.  That's why the existing mempolicies are AND'd 
with the cpuset's mems_allowed to represent the current nodemask that the 
mempolicy is effecting.  If none of them are available because of cpusets, 
the mempolicy is invalidated and MPOL_DEFAULT is used.  If access to some 
nodes from the mempolicy's nodemask become available once again, the 
policy is again effected.

I'm arguing that remapping a policy's nodemask, although that is what 
currently is done, is troublesome because it can use a policy such as 
MPOL_PREFERRED to work on a node for which it was never intended.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 15:10                             ` Lee Schermerhorn
  2007-10-29 18:41                               ` Paul Jackson
@ 2007-10-30 22:57                               ` David Rientjes
  2007-10-30 23:46                                 ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-30 22:57 UTC (permalink / raw)
  To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel

On Mon, 29 Oct 2007, Lee Schermerhorn wrote:

> And even when the intent is to preserve the cpuset relative positions of
> the nodes in the nodemask, this really only makes sense if the original
> and modified cpusets have the same physical topology w/rt multi-level
> NUMA interconnects.  This is something that has bothered me about
> dynamic cpusets and current policy remapping.  We don't do a good job of
> explaining the implications of changing cpuset topology on applications,
> nor do we handle it very well in the code.  Paul addresses one of my
> concerns in a later message in this thread, so I'll comment there.
> 

I agree with your assessment of our current policy remapping with respect 
to the passed nodemask, I think it's troublesome.  Whether we can change 
that now is another question, but the remap certainly doesn't help respect 
the intent of the application and the mempolicies they have set up when 
influenced by an outside entity such as cpusets.

See my new Choice C alternative.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29  7:15                                                 ` Paul Jackson
@ 2007-10-30 23:12                                                   ` David Rientjes
  2007-10-30 23:44                                                     ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-30 23:12 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Mon, 29 Oct 2007, Paul Jackson wrote:

> Blind siding users with a unilateral change like this will leave
> orphaned bits gasping in agony on the computer room floor.  It can
> sometimes takes months of elapsed time and hundreds of hours of various
> peoples time across a dozen departments in three to five corporations
> to track down the root cause of such a problem, from the point of the
> initial failure, back to the desk of someone like you or me.  And then
> it can take tens or hundreds more hours of human effort to deliver a
> fix.  I refuse to knowingly go down that road.
> 

If your argument is that most applications are written to implement 
mempolicies without necessarily thinking too much about its cpuset 
placement or interactions with cpusets, then the requirement of remapping 
nodes when a cpuset changes for effected mempolicies isn't actually that 
important.  In other words, my Choice C with AND'd behavior as opposed to 
remapping behavior could be introduced as a replacement for Choice A.

Those applications that currently rely on the remapping are going to be 
broken anyway because they are unknowingly receiving different nodes than 
they intended, this is the objection to remapping that Lee agreed with.  
The remap doesn't take into account any notion of locality or affinity to 
physical controllers and seems to be merely a convenience of not 
invalidating the entire mempolicy in light of an ever-changing cpuset 
policy.

> Not "AND".  Fold - the n-th bit is set in a tasks mems_allowed iff
> there exists m such that (m % w) == n, and such that the m-th bit is
> set in the tasks mempolicy's remembered nodemask, where w is the weight
> (number of '1' bits) in the tasks current cpusets mems_allowed. See
> lib/bitmap.c:bitmap_remap(), and its wrapper nodes_remap() for the
> implementation.
> 

Yes, I know, and my Choice C does _not_ want that folding behavior; it 
wants the AND'd behavior because it fully respects the intent of the 
application with regard to the actual nodes that it specified in its 
memory policies.  A node should only have one definition and policies that 
are effected on a set of nodes, or one node in the preferred case, should 
not change from beneath the application because it was not the intent of 
the implementation.  Doing so is dangerous, regardless of whether or not 
it is currently the mempolicy behavior in HEAD.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-29 18:41                               ` Paul Jackson
  2007-10-29 19:01                                 ` Lee Schermerhorn
@ 2007-10-30 23:17                                 ` David Rientjes
  2007-10-31  0:03                                   ` Paul Jackson
  1 sibling, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-30 23:17 UTC (permalink / raw)
  To: Paul Jackson; +Cc: Lee Schermerhorn, clameter, akpm, ak, linux-kernel

On Mon, 29 Oct 2007, Paul Jackson wrote:

> But in any case, we (the kernel) are just providing the mechanisms.
> If they don't fit ones needs, don't use them ;).
> 

The kernel is providing the mechanism to interleave over a set of nodes or 
prefer a single node for allocations, but it also provides for remapping 
those to different nodes, without regard to locality or affinity to 
specific hardware, when the cpuset changes.  That's what Choice C is 
intended to replace: a node means a node so either you get an effected 
mempolicy over the nodemask you asked for, or MPOL_DEFAULT is used because 
you lack sufficient access.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 22:53                                                       ` David Rientjes
@ 2007-10-30 23:17                                                         ` Paul Jackson
  2007-10-30 23:25                                                           ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-30 23:17 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> The nodemask passed to set_mempolicy() will always have exactly one
> meaning: the system nodes that the policy is intended for.

Ok - that makes the meaning of Choice C clearer to me.  Thank-you.

We've already got two Choices, one released and one in the oven.  Is
there an actual, real world situation, motivating this third Choice?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:17                                                         ` Paul Jackson
@ 2007-10-30 23:25                                                           ` David Rientjes
  2007-10-31  0:03                                                             ` Paul Jackson
  2007-10-31  0:05                                                             ` Paul Jackson
  0 siblings, 2 replies; 98+ messages in thread
From: David Rientjes @ 2007-10-30 23:25 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Tue, 30 Oct 2007, Paul Jackson wrote:

> We've already got two Choices, one released and one in the oven.  Is
> there an actual, real world situation, motivating this third Choice?
> 

Let's put Choice C into the lower oven, then.

Of course there's actual and real world examples of this, because right 
now we're not meeting the full intent of the application.  Cpusets deal 
with cpus and memory, they don't have anything to do with affinity to 
particular I/O devices; that part is left up to the creator of the cpuset 
to sort out correctly based on their system topology.

If my application does tons of I/O on one particular device to which my 
memory has access, I can use MPOL_PREFERRED to prefer the memory be 
allocated on a node with the best affinity to my device.  If cpusets 
change my access to that node, I'm still using an MPOL_PREFERRED policy 
with a remapped node that no longer has affinity to that device because 
nodes_remap() doesn't take that into account.  My preference would be to 
fallback to MPOL_DEFAULT behavior, since it's certainly plausible that 
other cpusets share the same node, instead of unnecessarily filling up a 
node that I don't even prefer anymore.

Same situation exists for MPOL_INTERLEAVE policies where my NUMA 
optimization is no longer helpful because I'm interleaving over a set of 
nodes that was simply remapped and their affinity (which isn't guaranteed 
to be uniform) wasn't even taken into account.

But, with Choice C, my intent is still preserved in the mempolicy even 
though it's not effected because my access rights to the node have changed.  
If I get access to that node back later, and I haven't issued subsequent 
set_mempolicy() calls to change my policy, my MPOL_PREFERRED or 
MPOL_INTERLEAVE policy is again effected and I then benefit from my NUMA 
optimization once again.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:12                                                   ` David Rientjes
@ 2007-10-30 23:44                                                     ` Paul Jackson
  2007-10-30 23:53                                                       ` David Rientjes
  0 siblings, 1 reply; 98+ messages in thread
From: Paul Jackson @ 2007-10-30 23:44 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> If your argument is that most applications are written to implement 
> mempolicies without necessarily thinking too much about its cpuset 
> placement or interactions with cpusets, then the requirement of remapping 
> nodes when a cpuset changes for effected mempolicies isn't actually that 
> important.

Just because they didn't think about cpuset remapping when they coded
their mempolicy calls, doesn't mean they wouldn't be broken by changes
in how mempolicy numbers nodes.  Often, it's the other way around:
the less they thought of it, the more likely changing it would break
them.

> In other words, my Choice C with AND'd behavior as opposed to 
> remapping behavior could be introduced as a replacement for Choice A.

No - I will not agree to changing the default mempolicy kernel API node
numbering at this time.  Period.  Full stop.  We can add non-default
choices for now, and perhaps in the light of future experience, we
may choose to do more later.

> Those applications that currently rely on the remapping are going to be 
> broken anyway because they are unknowingly receiving different nodes than 
> they intended, this is the objection to remapping that Lee agreed with.

No, they may or may not be broken.  That depends on whether or not they had
specific hardware locality or affinity needs.

> The remap doesn't take into account any notion of locality or affinity to 
> physical controllers and seems to be merely a convenience of not 
> invalidating the entire mempolicy in light of an ever-changing cpuset 
> policy.

If you're running apps that have specific hardware affinity requirements,
then perhaps you shouldn't be moving them about in the first place ;).
And if they did have such needs, aren't they just as likely to be busted
by AND'ing off some of their nodes as they are by remapping those nodes?

I sure wish I knew what real world, actual, not hypothetical, situations
were motivating this.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 22:57                               ` David Rientjes
@ 2007-10-30 23:46                                 ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-30 23:46 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

David wrote:
> but the remap certainly doesn't help respect the intent of the
> application and the mempolicies they have set up when influenced
> by an outside entity such as cpusets.

... guess that depends on the intent, doesn't it?

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:44                                                     ` Paul Jackson
@ 2007-10-30 23:53                                                       ` David Rientjes
  2007-10-31  0:29                                                         ` Paul Jackson
  0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-30 23:53 UTC (permalink / raw)
  To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

On Tue, 30 Oct 2007, Paul Jackson wrote:

> > Those applications that currently rely on the remapping are going to be 
> > broken anyway because they are unknowingly receiving different nodes than 
> > they intended, this is the objection to remapping that Lee agreed with.
> 
> No, they may or may not be broken.  That depends on whether or not they had
> specific hardware locality or affinity needs.
> 

Of course they have specific affinity needs, that's why they used 
mempolicies.  Remapping those policies to a set of nodes that resembles 
the original mempolicy's nodemask in terms of construction but without 
regard for the affinity those nodes have with respect to system topology 
could lead to performance degradations.

> If you're running apps that have specific hardware affinity requirements,
> then perhaps you shouldn't be moving them about in the first place ;).
> And if they did have such needs, aren't they just as likely to be busted
> by AND'ing off some of their nodes as they are by remapping those nodes?
> 

No, because you're interleaving over the set of actual nodes you wanted to 
interleave over in the first place and not some pseudo-random set that 
your cpuset has access to.

> I sure wish I knew what real world, actual, not hypothetical, situations
> were motivating this.
> 

You're defending the current remap behavior in terms of semantics of 
mempolicies?  My position, and Choice C's position, is that you either get 
the exact (or partially-constructed) policy that you asked for, or you get 
the MPOL_DEFAULT behavior.  What you don't get, even though it's currently 
how we do it, is a completely different set of nodes that you never 
intended to have a specific policy over.

		David

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:17                                 ` David Rientjes
@ 2007-10-31  0:03                                   ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-31  0:03 UTC (permalink / raw)
  To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel

David wrote:
> That's what Choice C is intended to replace

Yes, one remaps nodes it can't provide, and the other removes
nodes it can't provide.

Yup - that's a logical difference.  So ... I would think that
the only solution that would be satisfactory to apps that require
specific hardware nodes would be to simply not move them in the
first place.  If you do that, then none of these Choices matter
in the slightest.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:25                                                           ` David Rientjes
@ 2007-10-31  0:03                                                             ` Paul Jackson
  2007-10-31  0:05                                                             ` Paul Jackson
  1 sibling, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-31  0:03 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> Of course there's actual and real world examples of this, because right 
> now we're not meeting the full intent of the application.

Please describe one, an actual one, not a hypothetical one, of which you
have personal knowledge.

There are many refinements we could add, an endless stream of them.
Each one adds a burden to those who didn't need it.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:25                                                           ` David Rientjes
  2007-10-31  0:03                                                             ` Paul Jackson
@ 2007-10-31  0:05                                                             ` Paul Jackson
  1 sibling, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-31  0:05 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> But, with Choice C, my intent is still preserved in the mempolicy even 
> though it's not effected because my access rights to the node has changed.  

Choice B, as I'm coding it, has this property as well.

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

* Re: [patch 2/2] cpusets: add interleave_over_allowed option
  2007-10-30 23:53                                                       ` David Rientjes
@ 2007-10-31  0:29                                                         ` Paul Jackson
  0 siblings, 0 replies; 98+ messages in thread
From: Paul Jackson @ 2007-10-31  0:29 UTC (permalink / raw)
  To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel

David wrote:
> Of course they have specific affinity needs, that's why they used 
> mempolicies.

No.  Good grief.  If they are just looking for some set of memory
banks, not tied to other node-specific hardware, then they might not
need a specific node.

Consider for example a multi-threaded, compute bound, long running
scientific computation that has a substantial and fussy memory layout.
Remapping it from one cpuset to another having the same NUMA topology
may well work fine, once its memory caches recover.  Reverting it to
the lowest common denominator MPOL_DEFAULT policy because (Choice C) it
no longer has access to its initial nodes might devastate its
performance.

pj wrote:
> I sure wish I knew what real world, actual, not hypothetical, situations
> were motivating this.

I'm still wishing ...

-- 
                  I won't rest till it's the best ...
                  Programmer, Linux Scalability
                  Paul Jackson <pj@sgi.com> 1.925.600.0401

^ permalink raw reply	[flat|nested] 98+ messages in thread

end of thread, other threads:[~2007-10-31  0:29 UTC | newest]

Thread overview: 98+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-25 22:54 [patch 1/2] cpusets: extract mmarray loading from update_nodemask David Rientjes
2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes
2007-10-25 23:37   ` Christoph Lameter
2007-10-25 23:56     ` David Rientjes
2007-10-26  0:28       ` Christoph Lameter
2007-10-26  1:55         ` Paul Jackson
2007-10-26  2:11           ` David Rientjes
2007-10-26  2:29             ` Paul Jackson
2007-10-26  2:45               ` David Rientjes
2007-10-26  3:14                 ` Paul Jackson
2007-10-26  3:58                   ` David Rientjes
2007-10-26  4:34                     ` Paul Jackson
2007-10-26 15:37                     ` Lee Schermerhorn
2007-10-26 17:04                       ` Paul Jackson
2007-10-26 17:28                         ` Lee Schermerhorn
2007-10-26 20:21                         ` Michael Kerrisk
2007-10-26 20:25                           ` Paul Jackson
2007-10-26 20:33                             ` Michael Kerrisk
2007-10-26 15:30             ` Lee Schermerhorn
2007-10-26 18:46               ` David Rientjes
2007-10-26 19:00                 ` Paul Jackson
2007-10-26 20:45                   ` David Rientjes
2007-10-26 21:05                     ` Christoph Lameter
2007-10-26 21:08                       ` David Rientjes
2007-10-26 21:12                         ` Christoph Lameter
2007-10-26 21:15                           ` David Rientjes
2007-10-26 21:13                     ` Lee Schermerhorn
2007-10-26 21:17                       ` Christoph Lameter
2007-10-26 21:26                         ` Lee Schermerhorn
2007-10-26 21:37                           ` Christoph Lameter
2007-10-29 15:00                             ` Lee Schermerhorn
2007-10-29 17:33                               ` Paul Jackson
2007-10-29 17:46                                 ` Lee Schermerhorn
2007-10-29 20:35                               ` Christoph Lameter
2007-10-26 21:18                       ` David Rientjes
2007-10-26 21:31                         ` Lee Schermerhorn
2007-10-26 21:39                           ` David Rientjes
2007-10-27  1:07                             ` Paul Jackson
2007-10-27  1:26                               ` Christoph Lameter
2007-10-27  2:41                                 ` Paul Jackson
2007-10-27  2:50                                   ` Christoph Lameter
2007-10-27  5:16                                     ` Paul Jackson
2007-10-27  6:07                                       ` Christoph Lameter
2007-10-27  8:36                                         ` Paul Jackson
2007-10-27 17:47                                           ` Christoph Lameter
2007-10-27 20:59                                             ` Paul Jackson
2007-10-27 17:50                                   ` David Rientjes
2007-10-27 23:19                                     ` Paul Jackson
2007-10-28 18:19                                       ` David Rientjes
2007-10-28 23:46                                         ` Paul Jackson
2007-10-29  1:04                                           ` David Rientjes
2007-10-29  4:27                                             ` Paul Jackson
2007-10-29  4:47                                               ` David Rientjes
2007-10-29  5:45                                                 ` Paul Jackson
2007-10-29  7:00                                                   ` David Rientjes
2007-10-29  7:26                                                     ` Paul Jackson
2007-10-30 22:53                                                       ` David Rientjes
2007-10-30 23:17                                                         ` Paul Jackson
2007-10-30 23:25                                                           ` David Rientjes
2007-10-31  0:03                                                             ` Paul Jackson
2007-10-31  0:05                                                             ` Paul Jackson
2007-10-29  7:15                                                 ` Paul Jackson
2007-10-30 23:12                                                   ` David Rientjes
2007-10-30 23:44                                                     ` Paul Jackson
2007-10-30 23:53                                                       ` David Rientjes
2007-10-31  0:29                                                         ` Paul Jackson
2007-10-29 16:54                                       ` Lee Schermerhorn
2007-10-29 19:40                                         ` Paul Jackson
2007-10-29 19:45                                         ` Paul Jackson
2007-10-29 19:57                                         ` Paul Jackson
2007-10-29 20:02                                         ` Paul Jackson
2007-10-27 17:45                               ` David Rientjes
2007-10-27 21:22                                 ` Paul Jackson
2007-10-29 15:10                             ` Lee Schermerhorn
2007-10-29 18:41                               ` Paul Jackson
2007-10-29 19:01                                 ` Lee Schermerhorn
2007-10-30 23:17                                 ` David Rientjes
2007-10-31  0:03                                   ` Paul Jackson
2007-10-30 22:57                               ` David Rientjes
2007-10-30 23:46                                 ` Paul Jackson
2007-10-26 20:43                 ` Lee Schermerhorn
2007-10-26 15:18         ` Lee Schermerhorn
2007-10-26 17:36           ` Christoph Lameter
2007-10-26 18:45           ` David Rientjes
2007-10-26 19:02             ` Paul Jackson
2007-10-27 19:16             ` David Rientjes
2007-10-29 16:23               ` Lee Schermerhorn
2007-10-29 17:35                 ` Andi Kleen
2007-10-29 19:35                 ` Paul Jackson
2007-10-29 20:36                   ` Christoph Lameter
2007-10-29 21:08                   ` Andi Kleen
2007-10-29 22:48                     ` Paul Jackson
2007-10-30 19:47                     ` Paul Jackson
2007-10-30 20:20                       ` Lee Schermerhorn
2007-10-30 20:26                         ` Paul Jackson
2007-10-30 20:27                       ` Andi Kleen
2007-10-26  1:13   ` Paul Jackson
2007-10-26  1:30     ` David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox