* [patch 1/2] cpusets: extract mmarray loading from update_nodemask
@ 2007-10-25 22:54 David Rientjes
2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes
0 siblings, 1 reply; 98+ messages in thread
From: David Rientjes @ 2007-10-25 22:54 UTC (permalink / raw)
To: Andrew Morton
Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
linux-kernel
Extract a helper function from update_nodemask() to load an array of
mm_struct pointers with references to each task's mm_struct that is
currently attached to a given cpuset.
This will be used later for other purposes where memory policies need to
be rebound for each task attached to a cpuset.
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
kernel/cpuset.c | 130 ++++++++++++++++++++++++++++++++++---------------------
1 files changed, 81 insertions(+), 49 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -702,6 +702,79 @@ done:
/* Don't kfree(doms) -- partition_sched_domains() does that. */
}
+/*
+ * Loads mmarray with pointers to all the mm_struct's of tasks attached to
+ * cpuset cs.
+ *
+ * The reference count to each mm is incremented before loading it into the
+ * array, so put_cpuset_mm_array() must be called after this function to
+ * decrement each reference count and free the memory allocated for mmarray
+ * via this function.
+ */
+static struct mm_struct **get_cpuset_mm_array(const struct cpuset *cs,
+ int *ntasks)
+{
+ struct mm_struct **mmarray;
+ struct task_struct *p;
+ struct cgroup_iter it;
+ int count;
+ int fudge;
+
+ *ntasks = 0;
+ fudge = 10; /* spare mmarray[] slots */
+ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+ /*
+ * Allocate mmarray[] to hold mm reference for each task in cpuset cs.
+ * Can't kmalloc GFP_KERNEL while holding tasklist_lock. We could use
+ * GFP_ATOMIC, but with a few more lines of code, we can retry until
+ * we get a big enough mmarray[] w/o using GFP_ATOMIC.
+ */
+ while (1) {
+ count = cgroup_task_count(cs->css.cgroup); /* guess */
+ count += fudge;
+ mmarray = kmalloc(count * sizeof(*mmarray), GFP_KERNEL);
+ if (!mmarray)
+ return NULL;
+ read_lock(&tasklist_lock); /* block fork */
+ if (cgroup_task_count(cs->css.cgroup) <= count)
+ break; /* got enough */
+ read_unlock(&tasklist_lock); /* try again */
+ kfree(mmarray);
+ }
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ cgroup_iter_start(cs->css.cgroup, &it);
+ while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+ struct mm_struct *mm;
+
+ if (*ntasks >= count) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ break;
+ }
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[(*ntasks)++] = mm;
+ }
+ cgroup_iter_end(cs->css.cgroup, &it);
+ read_unlock(&tasklist_lock);
+ return mmarray;
+}
+
+/*
+ * Decrements the reference count to each mm in mmarray and frees the memory
+ * allocated for mmarray.
+ *
+ * To be used in conjunction with get_cpuset_mm_array().
+ */
+static void put_cpuset_mm_array(struct mm_struct **mmarray, int ntasks)
+{
+ while (ntasks-- > 0)
+ mmput(mmarray[ntasks]);
+ kfree(mmarray);
+}
+
static inline int started_after_time(struct task_struct *t1,
struct timespec *time,
struct task_struct *t2)
@@ -915,13 +988,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
nodemask_t oldmem;
- struct task_struct *p;
struct mm_struct **mmarray;
- int i, n, ntasks;
+ int i, n;
int migrate;
- int fudge;
int retval;
- struct cgroup_iter it;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -963,50 +1033,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mutex_unlock(&callback_mutex);
cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
-
- fudge = 10; /* spare mmarray[] slots */
- fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
retval = -ENOMEM;
-
- /*
- * Allocate mmarray[] to hold mm reference for each task
- * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
- * tasklist_lock. We could use GFP_ATOMIC, but with a
- * few more lines of code, we can retry until we get a big
- * enough mmarray[] w/o using GFP_ATOMIC.
- */
- while (1) {
- ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
- ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
- if (!mmarray)
- goto done;
- read_lock(&tasklist_lock); /* block fork */
- if (cgroup_task_count(cs->css.cgroup) <= ntasks)
- break; /* got enough */
- read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
- }
-
- n = 0;
-
- /* Load up mmarray[] with mm reference for each task in cpuset. */
- cgroup_iter_start(cs->css.cgroup, &it);
- while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
- struct mm_struct *mm;
-
- if (n >= ntasks) {
- printk(KERN_WARNING
- "Cpuset mempolicy rebind incomplete.\n");
- break;
- }
- mm = get_task_mm(p);
- if (!mm)
- continue;
- mmarray[n++] = mm;
- }
- cgroup_iter_end(cs->css.cgroup, &it);
- read_unlock(&tasklist_lock);
+ mmarray = get_cpuset_mm_array(cs, &n);
+ if (!mmarray)
+ goto done;
+ if (!n)
+ goto done_success;
/*
* Now that we've dropped the tasklist spinlock, we can
@@ -1028,12 +1060,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
- mmput(mm);
}
/* We're done rebinding vma's to this cpusets new mems_allowed. */
- kfree(mmarray);
cpuset_being_rebound = NULL;
+done_success:
+ put_cpuset_mm_array(mmarray, n);
retval = 0;
done:
return retval;
^ permalink raw reply [flat|nested] 98+ messages in thread* [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-25 22:54 [patch 1/2] cpusets: extract mmarray loading from update_nodemask David Rientjes @ 2007-10-25 22:54 ` David Rientjes 2007-10-25 23:37 ` Christoph Lameter 2007-10-26 1:13 ` Paul Jackson 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-25 22:54 UTC (permalink / raw) To: Andrew Morton Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn, linux-kernel Adds a new 'interleave_over_allowed' option to cpusets. When a task with an MPOL_INTERLEAVE memory policy is attached to a cpuset with this option set, the interleaved nodemask becomes the cpuset's mems_allowed. When the cpuset's mems_allowed changes, the interleaved nodemask for all tasks with MPOL_INTERLEAVE memory policies is also updated to be the new mems_allowed nodemask. This allows applications to specify that they want to interleave over all nodes that they are allowed to access. This set of nodes can be changed at any time via the cpuset interface and each individual memory policy is updated to reflect the changes for all attached tasks when this option is set. Cc: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: David Rientjes <rientjes@google.com> --- Documentation/cpusets.txt | 30 +++++++++++++++++++- include/linux/cpuset.h | 6 ++++ kernel/cpuset.c | 64 +++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 6 ++++ 4 files changed, 104 insertions(+), 2 deletions(-) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -20,7 +20,8 @@ CONTENTS: 1.5 What is memory_pressure ? 1.6 What is memory spread ? 1.7 What is sched_load_balance ? - 1.8 How do I use cpusets ? + 1.8 What is interleave_over_allowed ? + 1.9 How do I use cpusets ? 2. 
Usage Examples and Syntax 2.1 Basic Usage 2.2 Adding/removing cpus @@ -497,7 +498,32 @@ the cpuset code to update these sched domains, it compares the new partition requested with the current, and updates its sched domains, removing the old and adding the new, for each change. -1.8 How do I use cpusets ? +1.8 What is interleave_over_allowed ? +------------------------------------- + +Tasks may specify a memory policy of MPOL_INTERLEAVE with the desired +result of interleaving memory allocations over their set of allowed +nodes. + +Since the set of allowed nodes may change via cpusets (through the +'mems' file) without knowledge to the application, a mechanism needs +to exist such that applications can specify that they desire to +interleave over all nodes to which they have access. This avoids a +constant get_mempolicy() and set_mempolicy() loop to update an +interleaved memory policy that respects both its cpuset's mems_allowed +and the intent of the application. + +When interleave_over_allowed is set, all attached tasks with +MPOL_INTERLEAVE memory policies automatically interleave over all +available cpuset nodes regardless of what nodemask was passed to +set_mempolicy(). When the cpuset's mems change, all attached tasks +with interleaved policies automatically gets updated with the new +nodemask. + +The value of 'interleave_over_allowed' is inherited from a cpuset's +parent upon creation. + +1.9 How do I use cpusets ? 
-------------------------- In order to minimize the impact of cpusets on critical kernel diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ static inline int cpuset_do_slab_mem_spread(void) extern void cpuset_track_online_nodes(void); extern int current_cpuset_is_being_rebound(void); +extern nodemask_t current_cpuset_interleaved_mems(void); #else /* !CONFIG_CPUSETS */ @@ -157,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void) return 0; } +static inline nodemask_t current_cpuset_interleaved_mems(void) +{ + return NODE_MASK_NONE; +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -121,6 +121,7 @@ typedef enum { CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_INTERLEAVE, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -154,6 +155,11 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_interleave_over_allowed(const struct cpuset *cs) +{ + return test_bit(CS_INTERLEAVE, &cs->flags); +} + /* * Increment this integer everytime any cpuset changes its * mems_allowed value. Users of cpusets can track this generation @@ -1089,6 +1095,46 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +/* Rebinds the memory policies of all tasks attached to cs. + * + * Call with cgroup_mutex held. 
+ */ +static int update_interleave(struct cpuset *cs, char *buf) +{ + struct mm_struct **mmarray; + int ntasks; + int i; + + if (!simple_strtoul(buf, NULL, 10)) { + clear_bit(CS_INTERLEAVE, &cs->flags); + return 0; + } + + mmarray = get_cpuset_mm_array(cs, &ntasks); + if (!mmarray) + return -ENOMEM; + if (!ntasks) + goto done; + + for (i = 0; i < ntasks; i++) + mpol_rebind_mm(mmarray[i], &cs->mems_allowed); +done: + put_cpuset_mm_array(mmarray, ntasks); + set_bit(CS_INTERLEAVE, &cs->flags); + return 0; +} + +nodemask_t current_cpuset_interleaved_mems(void) +{ + nodemask_t mask = NODE_MASK_NONE; + + mutex_lock(&callback_mutex); + if (is_interleave_over_allowed(task_cs(current))) + mask = task_cs(current)->mems_allowed; + mutex_unlock(&callback_mutex); + return mask; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1283,6 +1329,7 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, + FILE_INTERLEAVE_OVER_ALLOWED, } cpuset_filetype_t; static ssize_t cpuset_common_file_write(struct cgroup *cont, @@ -1350,6 +1397,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, retval = update_flag(CS_SPREAD_SLAB, cs, buffer); cs->mems_generation = cpuset_mems_generation++; break; + case FILE_INTERLEAVE_OVER_ALLOWED: + retval = update_interleave(cs, buffer); + break; default: retval = -EINVAL; goto out2; @@ -1446,6 +1496,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SPREAD_SLAB: *s++ = is_spread_slab(cs) ? '1' : '0'; break; + case FILE_INTERLEAVE_OVER_ALLOWED: + *s++ = is_interleave_over_allowed(cs) ? 
'1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1536,6 +1589,13 @@ static struct cftype cft_spread_slab = { .private = FILE_SPREAD_SLAB, }; +static struct cftype cft_interleave_over_allowed = { + .name = "interleave_over_allowed", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_INTERLEAVE_OVER_ALLOWED, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1558,6 +1618,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, &cft_interleave_over_allowed)) < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (err == 0 && !cont->parent) err = cgroup_add_file(cont, ss, @@ -1633,6 +1695,8 @@ static struct cgroup_subsys_state *cpuset_create( set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + if (is_interleave_over_allowed(parent)) + set_bit(CS_INTERLEAVE, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cs->cpus_allowed = CPU_MASK_NONE; cs->mems_allowed = NODE_MASK_NONE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,6 +1739,12 @@ static void mpol_rebind_policy(struct mempolicy *pol, case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: + tmp = current_cpuset_interleaved_mems(); + if (!nodes_empty(tmp)) { + pol->v.nodes = tmp; + *mpolmask = tmp; + break; + } nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; *mpolmask = *newmask; ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes @ 2007-10-25 23:37 ` Christoph Lameter 2007-10-25 23:56 ` David Rientjes 2007-10-26 1:13 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-25 23:37 UTC (permalink / raw) To: David Rientjes Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn, linux-kernel On Thu, 25 Oct 2007, David Rientjes wrote: > Adds a new 'interleave_over_allowed' option to cpusets. > > When a task with an MPOL_INTERLEAVE memory policy is attached to a cpuset > with this option set, the interleaved nodemask becomes the cpuset's > mems_allowed. When the cpuset's mems_allowed changes, the interleaved > nodemask for all tasks with MPOL_INTERLEAVE memory policies is also > updated to be the new mems_allowed nodemask. > > This allows applications to specify that they want to interleave over all > nodes that they are allowed to access. This set of nodes can be changed > at any time via the cpuset interface and each individual memory policy is > updated to reflect the changes for all attached tasks when this option is > set. More interactions between cpusets and memory policies. We have to be careful here to keep clean semantics. Isn't it a bit surprising for an application that has set up a custom MPOL_INTERLEAVE policy if the nodes suddenly change because of a cpuset or mems_allowed change? ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-25 23:37 ` Christoph Lameter @ 2007-10-25 23:56 ` David Rientjes 2007-10-26 0:28 ` Christoph Lameter 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-25 23:56 UTC (permalink / raw) To: Christoph Lameter Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Christoph Lameter wrote: > More interactions between cpusets and memory policies. We have to be > careful here to keep clean semantics. > I agree. > Isnt it a bit surprising for an application that has set up a custom > MPOL_INTERLEAVE policy if the nodes suddenly change because of a cpuset or > mems_allowed change? > Every MPOL_INTERLEAVE policy is a custom policy that the application has setup. If you don't use cpusets at all, the nodemask you pass to set_mempolicy() with MPOL_INTERLEAVE is static and won't change without the application's knowledge. It has full control over the nodemask that it desires to interleave over. The problem occurs when you add cpusets into the mix and permit the allowed nodes to change without knowledge to the application. Right now, a simple remap is done so if the cardinality of the set of nodes decreases, you're interleaving over a smaller number of nodes. If the cardinality increases, your interleaved nodemask isn't expanded. That's the problem that we're facing. The remap itself is troublesome because it doesn't take into account the user's desire for a custom nodemask to be used anyway; it could remap an interleaved policy over several nodes that will already be contended with one another. Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the throughput of the application. If you remap the number of nodes to interleave over, which is currently how it's done when mems_allowed changes, you could actually be increasing latency because you're interleaving over the same bus. 
This isn't a memory policy problem because all it does is effect a specific policy over a set of nodes. With my change, cpusets are required to update the interleaved nodemask if the user specified that they desire the feature with interleave_over_allowed. Cpusets are, after all, the ones that changed the mems_allowed in the first place and invalidated our custom interleave policy. We simply can't make inferences about what we should do, so we allow the creator of the cpuset to specify it for us. So the proper place to modify an interleaved policy is in cpusets and not mempolicy itself. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-25 23:56 ` David Rientjes @ 2007-10-26 0:28 ` Christoph Lameter 2007-10-26 1:55 ` Paul Jackson 2007-10-26 15:18 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 0:28 UTC (permalink / raw) To: David Rientjes Cc: Andrew Morton, Andi Kleen, Paul Jackson, Lee Schermerhorn, linux-kernel On Thu, 25 Oct 2007, David Rientjes wrote: > The problem occurs when you add cpusets into the mix and permit the > allowed nodes to change without knowledge to the application. Right now, > a simple remap is done so if the cardinality of the set of nodes > decreases, you're interleaving over a smaller number of nodes. If the > cardinality increases, your interleaved nodemask isn't expanded. That's > the problem that we're facing. The remap itself is troublesome because it > doesn't take into account the user's desire for a custom nodemask to be > used anyway; it could remap an interleaved policy over several nodes that > will already be contended with one another. Right. So I think we are fine if the application cannot setup boundaries for interleave. > Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the > throughput of the application. If you remap the number of nodes to > interleave over, which is currently how it's done when mems_allowed > changes, you could actually be increasing latency because you're > interleaving over the same bus. Well you may hit some nodes more than others so a slight performance degradation. > This isn't a memory policy problem because all it does is effect a > specific policy over a set of nodes. With my change, cpusets are required > to update the interleaved nodemask if the user specified that they desire > the feature with interleave_over_allowed. Cpusets are, after all, the > ones that changed the mems_allowed in the first place and invalidated our > custom interleave policy. 
We simply can't make inferences about what we > should do, so we allow the creator of the cpuset to specify it for us. So > the proper place to modify an interleaved policy is in cpusets and not > mempolicy itself. With that MPOL_INTERLEAVE would be context dependent and no longer needs translation. Lee had similar ideas. Lee: Could we make MPOL_INTERLEAVE generally cpuset context dependent? ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 0:28 ` Christoph Lameter @ 2007-10-26 1:55 ` Paul Jackson 2007-10-26 2:11 ` David Rientjes 2007-10-26 15:18 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 1:55 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, akpm, ak, Lee.Schermerhorn, linux-kernel Christoph wrote: > With that MPOL_INTERLEAVE would be context dependent and no longer > needs translation. Lee had similar ideas. Lee: Could we make > MPOL_INTERLEAVE generally cpuset context dependent? Well ... MPOL_INTERLEAVE already is essentially cpuset relative. So long as the cpuset size (number of allowed memory nodes) doesn't change, whatever MPOL_INTERLEAVE you set is remapped whenever the cpusets 'mems' changes, preserving the cpuset relative interleaving. The problem, as David explains, comes when cpusets change sizes. When the cpuset gets smaller, one can still do a pretty good job, scrunching down the interleave nodes in proportion. But when the cpuset gets larger, it's not clear how to convert a subset of a smaller set, to an equivalent subset of a larger set. The existing code handled this last case by saying screw it -- don't expand the set of interleave nodes when the cpuset 'mems' grows. David's new code handles this last case by adding a new per-cpuset Boolean that adds a new alternative, forcing all the tasks using MPOL_INTERLEAVE in that cpuset, anytime thereafter that the cpusets 'mems' changes, to get interleaved over the entire cpuset. Now that I spell it out that way, I am having second thoughts about this one. It's another special case palliative, given that we can't give the user what they really want. David - could you describe the real world situation in which you are finding that this new 'interleave_over_allowed' option, aka 'memory_spread_user', is useful? 
I'm not always opposed to special case solutions; but they do usually require special case needs to justify them ;). I suspect that the general case solution would require having the user pass in two nodemasks, call them ALL and SUBSET, requesting that relative to the ALL nodes, interleave be done on the SUBSET nodes. That way, even if say the task happened to be running in a cpuset with a -single- allowed memory node at the moment, it could express its user memory interleave memory needs for the general case of any number of nodes. Then for whatever nodes were currently allowed by the cpuset to that task at any point, the nodes_remap() logic could be done to derive from the ALL and SUBSET masks, and the current allowed mask, what nodes to interleave that tasks user allocations over. This would require a new set_mempolicy API, and might not be worth it. If David has a compelling use case, it is simple enough that it might well be worth doing, even though it's not the general case solution. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 1:55 ` Paul Jackson @ 2007-10-26 2:11 ` David Rientjes 2007-10-26 2:29 ` Paul Jackson 2007-10-26 15:30 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 2:11 UTC (permalink / raw) To: Paul Jackson; +Cc: Christoph Lameter, akpm, ak, Lee.Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Paul Jackson wrote: > David - could you describe the real world situation in which you > are finding that this new 'interleave_over_allowed' option, aka > 'memory_spread_user', is useful? I'm not always opposed to special > case solutions; but they do usually require special case needs to > justify them ;). > Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded to include more memory. The task itself can't access all that memory with the memory policy of its choice. Since the cpuset has changed the mems_allowed of the task without its knowledge, it would require a constant get_mempolicy() and set_mempolicy() loop in the application to catch these changes. That's obviously not in the best interest of anyone. So my change allows those tasks that have already expressed the desire to interleave their memory with MPOL_INTERLEAVE to always use the full range of memory available that is dynamically changing beneath them as a result of cpusets. Keep in mind that it is still possible to request an interleave only over a subset of allowed mems: but you must do it when you create the interleaved mempolicy after it has been attached to the cpuset. set_mempolicy() changes are always honored. The only other way to support such a feature is through a modification to mempolicies themselves, which Lee has already proposed. The problem with that is it requires mempolicy support for cpuset cases and modification to the set_mempolicy() API. My solution presents a cpuset fix for a cpuset problem. 
> I suspect that the general case solution would require having the user > pass in two nodemasks, call them ALL and SUBSET, requesting that > relative to the ALL nodes, interleave be done on the SUBSET nodes. > That way, even if say the task happened to be running in a cpuset with > a -single- allowed memory node at the moment, it could express its user > memory interleave memory needs for the general case of any number of > nodes. Then for whatever nodes were currently allowed by the cpuset > to that task at any point, the nodes_remap() logic could be done to > derive from the ALL and SUBSET masks, and the current allowed mask, > what nodes to interleave that tasks user allocations over. > I find it hard to believe that a single cpuset with a single memory_spread_user boolean is going to include multiple tasks that request interleaved mempolicies over differing nodes within the cpuset's mems_allowed. That, to me, is the special case. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 2:11 ` David Rientjes @ 2007-10-26 2:29 ` Paul Jackson 2007-10-26 2:45 ` David Rientjes 2007-10-26 15:30 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 2:29 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel > Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded > to include more memory. The task itself can't access all that memory with > the memory policy of its choice. That much I could have guessed (did guess, actually.) Are you seeing this in a real world situation? Can you describe the situation? I don't mean just describing how it looks to this kernel code, but what is going on in the system, what sort of job mix or applications, what kind of users, ... In short, a "use case", or brief approximation thereto. See further: http://en.wikipedia.org/wiki/Use_case I have no need of a full blown use case; just a three sentence mini-story should suffice. But it should (if you can, without revealing proprietary knowledge) describe a situation you have actual need of addressing. > So my change allows those tasks that have already expressed the > desire to interleave their memory with MPOL_INTERLEAVE to always > use the full range of memory available that is dynamically changing > beneath them as a result of cpusets. Yup, that it does. Note that it is a special case -- "the full range", not any application controlled specific subset thereof, short of reissuing set_mempolicy() calls anytime that the applications cpuset 'mems' changes. > The only other way to support such a feature is through a modification to > mempolicies themselves, which Lee has already proposed. The problem with > that is it requires mempolicy support for cpuset cases and modification to > the set_mempolicy() API. Do you have a link to what Lee proposed? 
I agree that a full general solution would seem to require a new or changed set_mempolicy API, which may well be more than we want to do, absent a more compelling "use case" requiring it than we have now. > I find it hard to believe that a single cpuset with a single > memory_spread_user boolean is going to include multiple tasks that > request interleaved mempolicies over differing nodes within the > cpuset's mems_allowed. That, to me, is the special case. That may well be, to you. To me, pretty much -all- uses of set_mempolicy() are special cases ;). I have no way of telling whether or not there are users who would require multiple tasks in the same cpuset to have different interleave masks, but since the API clearly supports that (except when changing cpuset 'mems' settings mess things up), I have been presuming that somewhere in the universe, such users exist or might come to exist. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 2:29 ` Paul Jackson @ 2007-10-26 2:45 ` David Rientjes 2007-10-26 3:14 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-26 2:45 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Paul Jackson wrote: > Are you seeing this in a real world situation? Can you describe the > situation? I don't mean just describing how it looks to this kernel > code, but what is going on in the system, what sort of job mix or > applications, what kind of users, ... In short, a "use case", or brief > approximation thereto. See further: > Yes, when using cpusets for resource control. If memory pressure is being felt for that cpuset and additional mems are added to alleviate possible OOM conditions, it is insufficient to allow tasks within that cpuset to continue using memory policies that prohibit them from taking advantage of the extra memory. The best remedy for that situation is to give the cpuset owner the option of allowing tasks with MPOL_INTERLEAVE policies to always interleave over the entire set of available mems so they can be dynamically expanded and contracted at will without triggering OOM conditions. > Do you have a link to what Lee proposed? I agree that a full general > solution would seem to require a new or changed set_mempolicy API, > which may well be more than we want to do, absent a more compelling > "use case" requiring it than we have now. > http://marc.info/?l=linux-mm&m=118849999128086 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 2:45 ` David Rientjes @ 2007-10-26 3:14 ` Paul Jackson 2007-10-26 3:58 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 3:14 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel David wrote: > Yes, when using cpusets for resource control. If memory pressure is being > felt for that cpuset and additional mems are added to alleviate possible > OOM conditions, it is insufficient to allow tasks within that cpuset to > continue using memory policies that prohibit them from taking advantage of > the extra memory. Well ... "resource control" is a tad thin for a decent "use case". But ok ... that's a little more compelling. The user space man pages for set_mempolicy(2) are now even more behind the curve, by not mentioning that MPOL_INTERLEAVE's mask might mean nothing, if (1) in a cpuset marked memory_spread_user, (2) after the cpuset has changed 'mems'. I wonder if there is any way to fix that. Who does the man pages for Linux system calls? Hmmm ... that reminds me ... the period of time between when the task issues the set_mempolicy(2) MPOL_INTERLEAVE call and when some cpuset 'mems' change subsequently moves its memory placement is an anomaly here. During that period of time, the MPOL_INTERLEAVE mask -does- apply, even if a subset of the 'mems' in the tasks cpuset. This could result in test cases missing some failures. If they test with a particular, carefully crafted MPOL_INTERLEAVE mask that is a proper (strictly less than) subset of the nodes allowed in the cpuset, they might not notice that their code is broken if they happen to be in a memory_spread_user cpuset after a 'mems' change has jammed the entire cpusets 'mems' into their interleave mask. Perhaps we should make it so that doing a set_mempolicy(2) call to set MPOL_INTERLEAVE immediately changes the memory policy to the cpusets mems_allowed. 
A key advantage in doing this would be that the set_mempolicy user documentation could simply state that the MPOL_INTERLEAVE mask is ignored when in a cpuset marked memory_spread_user, instead interleaving over all the memory nodes in the cpuset. This would be quite a bit simpler and clearer than saying that the cpusets nodes are used only after subsequent cpuset 'mems' changes. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 3:14 ` Paul Jackson @ 2007-10-26 3:58 ` David Rientjes 2007-10-26 4:34 ` Paul Jackson 2007-10-26 15:37 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 3:58 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Paul Jackson wrote: > The user space man pages for set_mempolicy(2) are now even more > behind the curve, by not mentioning that MPOL_INTERLEAVE's mask > might mean nothing, if (1) in a cpuset marked memory_spread_user, > (2) after the cpuset has changed 'mems'. > Yeah. They were already outdated in the sense that they did not specify that the interleave nodemask could change as a result of a cpuset mems change. > I wonder if there is any way to fix that. Who does the man pages > for Linux system calls? > Good question. > Hmmm ... that reminds me ... the period of time between when the > task issues the set_mempolicy(2) MPOL_INTERLEAVE call and when some > cpuset 'mems' change subsequently moves its memory placement is an > anomaly here. During that period of time, the MPOL_INTERLEAVE mask > -does- apply, even if a subset of the 'mems' in the tasks cpuset. > This could result in test cases missing some failures. If they > test with a particular, carefully crafted MPOL_INTERLEAVE mask > that is a proper (strictly less than) subset of the nodes allowed > in the cpuset, they might not notice that their code is broken if > they happen to be in a memory_spread_user cpuset after a 'mems' > change has jammed the entire cpusets 'mems' into their interleave > mask. > Well, sure, but mempolicy's already get overridden by cpusets anyway. For example, if you were to attach a task with an MPOL_BIND mempolicy to a cpuset with a disjoint set of allowed mems. 
The important distinction is that you can still interleave over a subset of the mems_allowed if you set your memory policy after being attached to the cpuset. > Perhaps we should make it so that doing a set_mempolicy(2) call > to set MPOL_INTERLEAVE immediately changes the memory policy to > the cpusets mems_allowed. > No, because that would negate the above. We still want to be able to restrict interleaved memory policies to a subset of allowed mems. This option gives the most power to applications. > A key advantage in doing this would be that the set_mempolicy user > documentation could simply state that the MPOL_INTERLEAVE mask is > ignored when in a cpuset marked memory_spread_user, instead interleaving > over all the memory nodes in the cpuset. This would be quite a bit > simpler and clearer than saying that the cpusets nodes are used only > after subsequent cpuset 'mems' changes. > I think that documenting the change in the man page as saying that "the nodemask will include all allowed nodes if the mems_allowed of a memory_spread_user cpuset is expanded" is better. I've got a few fixes for my patchset queued so I'll resend it later; it's mostly style changes but there is a subtle bug where the task changing the value of a cpuset's memory_spread_page is not in the same cpuset. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 3:58 ` David Rientjes @ 2007-10-26 4:34 ` Paul Jackson 2007-10-26 15:37 ` Lee Schermerhorn 1 sibling, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-26 4:34 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, akpm, ak, Lee.Schermerhorn, linux-kernel David wrote: > I think that documenting the change in the man page as saying that > "the nodemask will include all allowed nodes if the mems_allowed > of a memory_spread_user cpuset is expanded" is better. Ok. I'm inclined the other way, but not certain enough of my position to push the point any further. Good enough. > I've got a few fixes for my patchset queued so I'll resend it later Ok. Good work - thanks. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 3:58 ` David Rientjes 2007-10-26 4:34 ` Paul Jackson @ 2007-10-26 15:37 ` Lee Schermerhorn 2007-10-26 17:04 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 15:37 UTC (permalink / raw) To: David Rientjes Cc: Paul Jackson, clameter, akpm, ak, linux-kernel, Michael Kerrisk On Thu, 2007-10-25 at 20:58 -0700, David Rientjes wrote: > On Thu, 25 Oct 2007, Paul Jackson wrote: > > > The user space man pages for set_mempolicy(2) are now even more > > behind the curve, by not mentioning that MPOL_INTERLEAVE's mask > > might mean nothing, if (1) in a cpuset marked memory_spread_user, > > (2) after the cpuset has changed 'mems'. > > > > Yeah. They were already outdated in the sense that they did not specify > that the interleave nodemask could change as a result of a cpuset mems > change. > > > I wonder if there is any way to fix that. Who does the man pages > > for Linux system calls? > > Michael Kerrisk, whom I've copied, does. I recently sent in an update to all of the mempolicy man pages that describe the behavior as it currently exists. [I need to send in an update for MPOL_F_MEMS_ALLOWED]. One of the things that has bothered me is that there are no cpuset man pages to reference from the mempolicy man pages. [I know, we can and do refer to the kernel source Documentation, but that might not be available to everyone w/o some digging. "See Also" refs typically point at other man pages...]. To get around this, I had to talk about "nodes allowed in the current context" or some such weasel-wording in my updates. Paul: what do you think about subsetting the cpuset.txt into a man page or 2 that can be referenced by other man pages' See Also sections? > <snip> > > David Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 15:37 ` Lee Schermerhorn @ 2007-10-26 17:04 ` Paul Jackson 2007-10-26 17:28 ` Lee Schermerhorn 2007-10-26 20:21 ` Michael Kerrisk 0 siblings, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-26 17:04 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages Lee wrote: > Paul: what do you think about subsetting the cpuset.txt into a man page > or 2 that can be referenced by other man pages' See Also sections? Oh dear --- looking back in my work queue I have with my employer, I see I have a task that is now over a year old, still unfinished, to provide man pages for cpusets to Michael Kerrisk <mtk-manpages@gmx.net> So, yes, I agree this would be a "good thing". I just haven't gotten a round to it (http://www.quantumenterprises.co.uk/roundtuit/index.htm) yet. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 17:04 ` Paul Jackson @ 2007-10-26 17:28 ` Lee Schermerhorn 2007-10-26 20:21 ` Michael Kerrisk 1 sibling, 0 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 17:28 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages On Fri, 2007-10-26 at 10:04 -0700, Paul Jackson wrote: > Lee wrote: > > Paul: what do you think about subsetting the cpuset.txt into a man page > > or 2 that can be referenced by other man pages' See Also sections? > > Oh dear --- looking back in my work queue I have with my employer, I > see I have a task that is now over a year old, still unfinished, to > provide man pages for cpusets to Michael Kerrisk" <mtk-manpages@gmx.net> > > So, yes, I agree this would be a "good thing". I just haven't gotten a > round to it (http://www.quantumenterprises.co.uk/roundtuit/index.htm) > yet. I'm a little backed up myself, right now, or I'd offer to take a cut for you to review. Once I get some free time [Hah!], I'll check with you again. If you get started before then, I'd be happy to review. Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 17:04 ` Paul Jackson 2007-10-26 17:28 ` Lee Schermerhorn @ 2007-10-26 20:21 ` Michael Kerrisk 2007-10-26 20:25 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: Michael Kerrisk @ 2007-10-26 20:21 UTC (permalink / raw) To: Paul Jackson Cc: Lee Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages On 10/26/07, Paul Jackson <pj@sgi.com> wrote: > Lee wrote: > > Paul: what do you think about subsetting the cpuset.txt into a man page > > or 2 that can be referenced by other man pages' See Also sections? > > Oh dear --- looking back in my work queue I have with my employer, I > see I have a task that is now over a year old, still unfinished, to > provide man pages for cpusets to Michael Kerrisk <mtk-manpages@gmx.net> Yes, it would be great to have those pages. Is there anything I can do to assist? Cheers, Michael PS Note my new address for man-pages: mtk.manpages@gmail.com ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 20:21 ` Michael Kerrisk @ 2007-10-26 20:25 ` Paul Jackson 2007-10-26 20:33 ` Michael Kerrisk 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 20:25 UTC (permalink / raw) To: Michael Kerrisk Cc: Lee.Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages Michael wrote: > PS Note my new address for man-pages: mtk.manpages@gmail.com Noted. > Is there anything I can do to assist? Got any spare round tuit's ;)? -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 20:25 ` Paul Jackson @ 2007-10-26 20:33 ` Michael Kerrisk 0 siblings, 0 replies; 98+ messages in thread From: Michael Kerrisk @ 2007-10-26 20:33 UTC (permalink / raw) To: Paul Jackson Cc: Lee.Schermerhorn, rientjes, clameter, akpm, ak, linux-kernel, mtk-manpages On 10/26/07, Paul Jackson <pj@sgi.com> wrote: > Michael wrote: > > PS Note my new addres for man-apges: mtk.manpages@gmail.com > > Noted. > > > Is there anything I can do to assist? > > Got any spare round tuit's ;)? I ran out quite some time ago unfortunately. Cheers, Michael ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 2:11 ` David Rientjes 2007-10-26 2:29 ` Paul Jackson @ 2007-10-26 15:30 ` Lee Schermerhorn 2007-10-26 18:46 ` David Rientjes 1 sibling, 1 reply; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 15:30 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel On Thu, 2007-10-25 at 19:11 -0700, David Rientjes wrote: > On Thu, 25 Oct 2007, Paul Jackson wrote: > > > David - could you describe the real world situation in which you > > are finding that this new 'interleave_over_allowed' option, aka > > 'memory_spread_user', is useful? I'm not always opposed to special > > case solutions; but they do usually require special case needs to > > justify them ;). > > > > Yes, when a task with MPOL_INTERLEAVE has its cpuset mems_allowed expanded > to include more memory. The task itself can't access all that memory with > the memory policy of its choice. > > Since the cpuset has changed the mems_allowed of the task without its > knowledge, it would require a constant get_mempolicy() and set_mempolicy() > loop in the application to catch these changes. That's obviously not in > the best interest of anyone. > > So my change allows those tasks that have already expressed the desire to > interleave their memory with MPOL_INTERLEAVE to always use the full range > of memory available that is dynamically changing beneath them as a result > of cpusets. Keep in mind that it is still possible to request an > interleave only over a subset of allowed mems: but you must do it when you > create the interleaved mempolicy after it has been attached to the cpuset. > set_mempolicy() changes are always honored. > > The only other way to support such a feature is through a modification to > mempolicies themselves, which Lee has already proposed. The problem with > that is it requires mempolicy support for cpuset cases and modification to > the set_mempolicy() API. 
My solution presents a cpuset fix for a cpuset > problem. Actually, my patch doesn't change the set_mempolicy() API at all, it just co-opts a currently unused/illegal value for the nodemask to indicate "all allowed nodes". Again, I need to provide a libnuma API to request this. Soon come, mon... Here's a link the last posting of my patch, as Paul requested: http://marc.info/?l=linux-mm&m=118849999128086&w=4 A bit out of date, but I'll fix that maybe next week. Lee <snip> ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 15:30 ` Lee Schermerhorn @ 2007-10-26 18:46 ` David Rientjes 2007-10-26 19:00 ` Paul Jackson 2007-10-26 20:43 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 18:46 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > Actually, my patch doesn't change the set_mempolicy() API at all, it > just co-opts a currently unused/illegal value for the nodemask to > indicate "all allowed nodes". Again, I need to provide a libnuma API to > request this. Soon come, mon... > If something that was previously unaccepted is now allowed with a newly-introduced semantic, that's an API change. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 18:46 ` David Rientjes @ 2007-10-26 19:00 ` Paul Jackson 2007-10-26 20:45 ` David Rientjes 2007-10-26 20:43 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 19:00 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel David wrote: > If something that was previously unaccepted is now allowed with a > newly-introduced semantic, that's an API change. Agreed, as I wrote earlier: > It should work with libnuma and be > fully upward compatible with current code (except perhaps code that > depends on getting an error from requesting MPOL_INTERLEAVE on a node > not allowed.) Without at least this sort of change to MPOL_INTERLEAVE nodemasks, allowing either empty nodemasks (Lee's proposal) or extending them outside the current cpuset (what I'm cooking up now), there is no way for a task that is currently confined to a single node cpuset to say anything about how it wants be interleaved in the event that it is subsequently moved to a larger cpuset. Currently, such a task is only allowed to pass exactly one particular nodemask to set_mempolicy MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its current node. No useful information can be passed via an API that only allows a single legal value. But you knew that ... You were just correcting my erroneously unqualified statement. Good. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 19:00 ` Paul Jackson @ 2007-10-26 20:45 ` David Rientjes 2007-10-26 21:05 ` Christoph Lameter 2007-10-26 21:13 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 20:45 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > Without at least this sort of change to MPOL_INTERLEAVE nodemasks, > allowing either empty nodemasks (Lee's proposal) or extending them > outside the current cpuset (what I'm cooking up now), there is no way > for a task that is currently confined to a single node cpuset to say > anything about how it wants be interleaved in the event that it is > subsequently moved to a larger cpuset. Currently, such a task is only > allowed to pass exactly one particular nodemask to set_mempolicy > MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its > current node. No useful information can be passed via an API that only > allows a single legal value. > Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't make a whole lot of sense in the first place. I prefer your solution of allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave me over everything I'm allowed to access." NODE_MASK_ALL would be stored in the struct mempolicy and used later on mpol_rebind_policy(). David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 20:45 ` David Rientjes @ 2007-10-26 21:05 ` Christoph Lameter 2007-10-26 21:08 ` David Rientjes 2007-10-26 21:13 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 21:05 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, David Rientjes wrote: > Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't > make a whole lot of sense in the first place. I prefer your solution of > allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave > me over everything I'm allowed to access." NODE_MASK_ALL would be stored > in the struct mempolicy and used later on mpol_rebind_policy(). So instead of an empty nodemask we would pass a nodemask where all bits are set? And they would stay set but the cpuset restrictions would effectively limit the interleaving to the allowed set? rebind could ignore rebinds if all bits are set. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:05 ` Christoph Lameter @ 2007-10-26 21:08 ` David Rientjes 2007-10-26 21:12 ` Christoph Lameter 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-26 21:08 UTC (permalink / raw) To: Christoph Lameter; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Christoph Lameter wrote: > > Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't > > make a whole lot of sense in the first place. I prefer your solution of > > allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave > > me over everything I'm allowed to access." NODE_MASK_ALL would be stored > > in the struct mempolicy and used later on mpol_rebind_policy(). > > So instead of an empty nodemask we would pass a nodemask where all bits > are set? And they would stay set but the cpuset restrictions would > effectively limit the interleaving to the allowed set? > You would pass NODE_MASK_ALL if your intent was to interleave over everything you have access to, yes. Otherwise you can pass whatever you want access to and your interleaved nodemask becomes mpol_rebind_policy()'s newmask formal (the cpuset's new mems_allowed) AND'd with pol->passed_nodemask. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:08 ` David Rientjes @ 2007-10-26 21:12 ` Christoph Lameter 2007-10-26 21:15 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 21:12 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, David Rientjes wrote: > You would pass NODE_MASK_ALL if your intent was to interleave over > everything you have access to, yes. Otherwise you can pass whatever you > want access to and your interleaved nodemask becomes > mpol_rebind_policy()'s newmask formal (the cpuset's new mems_allowed) > AND'd with pol->passed_nodemask. We would need two fields in the policy structure 1. The specified nodemask (generally ignored) 2. The effective nodemask (specified & cpuset_mems_allowed) If we have these two then its easy to get a bit further by making the first nodemask a relative nodemask. The calculation of the effective nodemask changes somewhat but the logic is then applicable to MPOL_BIND as well. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:12 ` Christoph Lameter @ 2007-10-26 21:15 ` David Rientjes 0 siblings, 0 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 21:15 UTC (permalink / raw) To: Christoph Lameter; +Cc: Paul Jackson, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Christoph Lameter wrote: > We would need two fields in the policy structure > > 1. The specified nodemask (generally ignored) > What I've called pol->passed_nodemask. > 2. The effective nodemask (specified & cpuset_mems_allowed) > Which is pol->v.nodes. > If we have these two then its easy to get a bit further by making > the first nodemask a relative nodemask. The calculation of the effective > nodemask changes somewhat but the logic is then applicable to MPOL_BIND as > well. > Agreed. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 20:45 ` David Rientjes 2007-10-26 21:05 ` Christoph Lameter @ 2007-10-26 21:13 ` Lee Schermerhorn 2007-10-26 21:17 ` Christoph Lameter 2007-10-26 21:18 ` David Rientjes 1 sibling, 2 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 21:13 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Fri, 2007-10-26 at 13:45 -0700, David Rientjes wrote: > On Fri, 26 Oct 2007, Paul Jackson wrote: > > > Without at least this sort of change to MPOL_INTERLEAVE nodemasks, > > allowing either empty nodemasks (Lee's proposal) or extending them > > outside the current cpuset (what I'm cooking up now), there is no way > > for a task that is currently confined to a single node cpuset to say > > anything about how it wants be interleaved in the event that it is > > subsequently moved to a larger cpuset. Currently, such a task is only > > allowed to pass exactly one particular nodemask to set_mempolicy > > MPOL_INTERLEAVE calls, with exactly the one bit corresponding to its > > current node. No useful information can be passed via an API that only > > allows a single legal value. > > > > Well, passing a single node to set_mempolicy() for MPOL_INTERLEAVE doesn't > make a whole lot of sense in the first place. I prefer your solution of > allowing set_mempolicy(MPOL_INTERLEAVE, NODE_MASK_ALL) to mean "interleave > me over everything I'm allowed to access." NODE_MASK_ALL would be stored > in the struct mempolicy and used later on mpol_rebind_policy(). You don't need to save the entire mask--just note that NODE_MASK_ALL was passed--like with my internal MPOL_CONTEXT flag. This would involve special casing NODE_MASK_ALL in the error checking, as currently set_mempolicy() complains loudly if you pass non-allowed nodes--see "contextualize_policy()". [mbind() on the other hand, appears to allow any nodemask, even outside the cpuset. 
guess we catch this during allocation.] This is pretty much the spirit of my patch w/o the API change/extension [/improvement :)] For some systems [not mine], the nodemasks can get quite large. I have a patch, that I've tested atop Mel Gorman's "onezonelist" patches that replaces the nodemasks embedded in struct mempolicy with pointers to dynamically allocated ones. However, it's probably not much of a win, memorywise, if most of the uses are for interleave and bind policies--both of which would always need the nodemasks in addition to the pointers. Now, if we could replace the 'cpuset_mems_allowed' nodemask with a pointer to something stable, it might be a win. Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:13 ` Lee Schermerhorn @ 2007-10-26 21:17 ` Christoph Lameter 2007-10-26 21:26 ` Lee Schermerhorn 2007-10-26 21:18 ` David Rientjes 1 sibling, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 21:17 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > For some systems [not mine], the nodemasks can get quite large. I have > a patch, that I've tested atop Mel Gorman's "onezonelist" patches that > replaces the nodemasks embedded in struct mempolicy with pointers to > dynamically allocated ones. However, it's probably not much of a win, > memorywise, if most of the uses are for interleave and bind > policies--both of which would always need the nodemasks in addition to > the pointers. > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a > pointer to something stable, it might be a win. The memory policies are already shared and have refcounters for that purpose. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:17 ` Christoph Lameter @ 2007-10-26 21:26 ` Lee Schermerhorn 2007-10-26 21:37 ` Christoph Lameter 0 siblings, 1 reply; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 21:26 UTC (permalink / raw) To: Christoph Lameter; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel On Fri, 2007-10-26 at 14:17 -0700, Christoph Lameter wrote: > On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > For some systems [not mine], the nodemasks can get quite large. I have > > a patch, that I've tested atop Mel Gorman's "onezonelist" patches that > > replaces the nodemasks embedded in struct mempolicy with pointers to > > dynamically allocated ones. However, it's probably not much of a win, > > memorywise, if most of the uses are for interleave and bind > > policies--both of which would always need the nodemasks in addition to > > the pointers. > > > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a > > pointer to something stable, it might be a win. > > The memory policies are already shared and have refcounters for that > purpose. I must have missed that in the code I'm reading :) Have a nice weekend. Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:26 ` Lee Schermerhorn @ 2007-10-26 21:37 ` Christoph Lameter 2007-10-29 15:00 ` Lee Schermerhorn 0 siblings, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 21:37 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a > > > pointer to something stable, it might be a win. > > > > The memory policies are already shared and have refcounters for that > > purpose. > > I must have missed that in the code I'm reading :) What is the benefit of having pointers to nodemasks? We likely would need to have refcounts in those nodemasks too? So we duplicate a lot of the characteristics of memory policies? ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:37 ` Christoph Lameter @ 2007-10-29 15:00 ` Lee Schermerhorn 2007-10-29 17:33 ` Paul Jackson 2007-10-29 20:35 ` Christoph Lameter 0 siblings, 2 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 15:00 UTC (permalink / raw) To: Christoph Lameter Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel, Mel Gorman [-- Attachment #1: Type: text/plain, Size: 2442 bytes --] On Fri, 2007-10-26 at 14:37 -0700, Christoph Lameter wrote: > On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > > > Now, if we could replace the 'cpuset_mems_allowed' nodemask with a > > > > pointer to something stable, it might be a win. > > > > > > The memory policies are already shared and have refcounters for that > > > purpose. > > > > I must have missed that in the code I'm reading :) > > What is the benefit of having pointers to nodemasks? We likely would need > to have refcounts in those nodemasks too? So we duplicate a lot of > the characteristics of memory policies? Hi, Christoph: removing the nodemasks from the mempolicy and allocating them only when needed is something that you and Mel and I discussed last month, in the context of Mel's "one zonelist filtered by nodemask" patches. I just put together the dynamic nodemask patch [included below FYI, NOT for serious consideration] to see what it looked like and whether it helped. Conclusion: it's ugly/complex [especially trying to keep the nodemasks embedded for systems that don't require > a pointer's worth of bits] and they probably don't help much if most uses of non-default mempolicy require a nodemask. I only brought it up again because now you all are considering another nodemask per policy. In fact, I only considered it in the first place because nodemasks on our [HP's] platform don't require more than a pointer's worth of bits [today, at least--I don't know about future plans].
However, since we share an arch--ia64-with SGI and distros don't want to support special kernels for different vendors, if they can avoid it, we have 1K-bit nodemasks. Since this is ia64 we're talking about, most folks don't care. Now that you're going to do the same for x86_64, it might become more visible. Then again, maybe there are few enough mempolicy structs that no-one will care anyway. Note: I don't [didn't] think I need to ref count the nodemasks associated with the mempolicies because they are allocated when the mempolicy is and destroyed when the policy is--not shared. Just like the custom zonelist for bind policy, and we have no ref count there. I.e., they're protected by the mempol's ref. However, now that you bring it up, I'm wondering about the effects of policy remapping, and whether we have the reference counting or indirect protection [mmap_sem, whatever] correct there in current code. I'll have to take a look. Lee [-- Attachment #2: dynamically-allocate-mempolicy-nodemasks.patch --] [-- Type: text/x-patch, Size: 10131 bytes --] PATCH/RFC Memory Policy: dynamically allocate policy nodemaps Something that Christoph Lameter, Mel Gorman and I discussed. Once Mel's "one zonelist" patches go in, we no longer have the MPOL_BIND custom zonelist in the mempolicy struct. However, we still have 2 nodemask_t's that can get quite large: one in the union with the preferred_node, and one for the cpuset current mems allowed. Note that this 2nd nodemask_t does NOT depend on whether or not cpusets are configured. So, on an ia64 platform with NODES_SHIFT configured at 8 [256 nodes], this results in a nodemask_t size of 32 bytes and a mempolicy size of 72 bytes. Real soon now, we'll be seeing x86_64 platforms with hundreds of nodes. Indirect/dynamically allocated policy nodemasks can reduce this size, for policies that don't need them, to: TODO [Some distros ship with NODES_SHIFT=10 => 128 byte nodemasks.] 
However, on platforms and configurations where BITS_PER_LONG >= (1 << NODES_SHIFT), indirect policy nodemasks are unnecessary overhead, as the maximum number of nodes will fit in a pointers worth of data. So, this patch implements indirect, dynamically allocated nodemasks for memory policies when: BITS_PER_LONG < (1 << NODES_SHIFT). etc... Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> include/linux/mempolicy.h | 20 +++++- mm/mempolicy.c | 134 ++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 19 deletions(-) Index: Linux/include/linux/mempolicy.h =================================================================== --- Linux.orig/include/linux/mempolicy.h 2007-10-02 14:11:20.000000000 -0400 +++ Linux/include/linux/mempolicy.h 2007-10-02 16:07:43.000000000 -0400 @@ -47,6 +47,12 @@ struct mm_struct; #ifdef CONFIG_NUMA +#if BITS_PER_LONG < (1 << NODES_SHIFT) +#define INDIRECT_POLICY_NODEMASK 1 +#else +#define INDIRECT_POLICY_NODEMASK 0 +#endif + /* * Describe a memory policy. 
* @@ -71,11 +77,19 @@ struct mempolicy { atomic_t refcnt; short policy; /* See MPOL_* above */ union { - short preferred_node; /* preferred */ - nodemask_t nodes; /* interleave/bind */ + short preferred_node; /* preferred */ +#if INDIRECT_POLICY_NODEMASK + nodemask_t *nodes; /* interleave/bind */ +#else + nodemask_t nodes; /* interleave/bind */ +#endif /* undefined for default */ } v; - nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ +#if INDIRECT_POLICY_NODEMASK + nodemask_t *cpuset_mems_allowed; /* mempolicy relative to these nodes */ +#else + nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ +#endif }; /* Index: Linux/mm/mempolicy.c =================================================================== --- Linux.orig/mm/mempolicy.c 2007-10-02 14:12:35.000000000 -0400 +++ Linux/mm/mempolicy.c 2007-10-02 17:18:24.000000000 -0400 @@ -100,6 +100,7 @@ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; +static struct kmem_cache *nodemask_cache; /* Highest zone. An specific allocation for a zone below that is not policied. 
*/ @@ -158,10 +159,68 @@ static int is_valid_nodemask(nodemask_t return 0; } +#if INDIRECT_POLICY_NODEMASK +/* + * mempolicy operations for indirect policy nodemasks + * operate on pointers to nodemask_t pointers in mempolicy struct + */ +static int set_policy_nodemask(nodemask_t **pol_nodes, nodemask_t *nodes) +{ + **pol_nodes = *nodes; + return 0; +} + +static int new_policy_nodemask(nodemask_t **pol_nodes, nodemask_t *nodes) +{ + *pol_nodes = kmem_cache_alloc(nodemask_cache, GFP_KERNEL); + if (!*pol_nodes) + return -ENOMEM; + return set_policy_nodemask(pol_nodes, nodes); +} + +static nodemask_t *policy_nodemask_ref(nodemask_t **pol_nodes) +{ + return *pol_nodes; +} + +static void free_policy_nodemask(nodemask_t **pol_nodes) +{ + kmem_cache_free(nodemask_cache, *pol_nodes); +} + +#else + +/* + * mempolicy operations for embedded policy nodemasks + * operate on pointers to nodemasks embedded in mempolicy structs + */ +static int set_policy_nodemask(nodemask_t *pol_nodes, nodemask_t *nodes) +{ + *policy->v.nodes = *nodes; + return 0; +} + +static int new_policy_nodemask(nodemask_t *pol_nodes, nodemask_t *nodes) +{ + return set_policy_nodemask(pol_nodes, nodes); +} + +static nodemask_t *policy_nodemask_ref(nodemask_t *pol_nodes) +{ + return pol_nodes; +} + +static void free_policy_nodemask(nodemask_t *pol_nodes) +{ +} +#endif + /* Create a new policy */ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; + nodemask_t mems_allowed; + int ret; pr_debug("setting mode %d nodes[0] %lx\n", mode, nodes ? 
nodes_addr(*nodes)[0] : -1); @@ -175,14 +234,18 @@ static struct mempolicy *mpol_new(int mo atomic_set(&policy->refcnt, 1); switch (mode) { case MPOL_INTERLEAVE: - policy->v.nodes = *nodes; + ret = new_policy_nodemask(&policy->v.nodes, nodes); + if (ret) + return ERR_PTR(ret); if (nodes_weight(*nodes) == 0) { mode |= MPOL_CONTEXT; break; } - nodes_and(policy->v.nodes, policy->v.nodes, + nodes_and(*policy_nodemask_ref(&policy->v.nodes), + *policy_nodemask_ref(&policy->v.nodes), node_states[N_HIGH_MEMORY]); - if (nodes_weight(policy->v.nodes) == 0) { + if (nodes_weight(*policy_nodemask_ref(&policy->v.nodes)) == 0) { + free_policy_nodemask(&policy->v.nodes); kmem_cache_free(policy_cache, policy); return ERR_PTR(-EINVAL); } @@ -197,11 +260,21 @@ static struct mempolicy *mpol_new(int mo kmem_cache_free(policy_cache, policy); return ERR_PTR(-EINVAL); } - policy->v.nodes = *nodes; + ret = new_policy_nodemask(&policy->v.nodes, nodes); + if (ret) + return ERR_PTR(ret); break; } policy->policy = mode; - policy->cpuset_mems_allowed = cpuset_mems_allowed(current); + +//TODO: I ought to be able to figure out how to reference the +// mems_allowed [current task's or node_possible_map] w/o +// allocating a new copy. Need more helper function? 
+ mems_allowed = cpuset_mems_allowed(current); + ret = new_policy_nodemask(&policy->cpuset_mems_allowed, + &mems_allowed); + if (ret) + return ERR_PTR(ret); return policy; } @@ -464,7 +537,7 @@ static nodemask_t *get_interleave_nodes( if (unlikely(p->policy & MPOL_CONTEXT)) return &cpuset_current_mems_allowed; - return &p->v.nodes; + return policy_nodemask_ref(&p->v.nodes); } /* Set the process memory policy */ @@ -1135,8 +1208,8 @@ static inline nodemask_t *nodemask_polic /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->policy == MPOL_BIND && gfp_zone(gfp) >= policy_zone && - cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))) - return &policy->v.nodes; + cpuset_nodemask_valid_mems_allowed(policy_nodemask_ref(&policy->v.nodes)))) + return policy_nodemask_ref(&policy->v.nodes); return NULL; } @@ -1200,8 +1273,9 @@ unsigned slab_node(struct mempolicy *pol struct zoneref *z; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(numa_node_id())->node_zonelist; - z = first_zones_zonelist(zonelist, &policy->v.nodes, - highest_zoneidx); + z = first_zones_zonelist(zonelist, + policy_nodemask_ref(&policy->v.nodes), + highest_zoneidx); return zonelist_node_idx(z); } @@ -1421,7 +1495,25 @@ struct mempolicy *__mpol_copy(struct mem nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(old, &mems); } - *new = *old; + + /* + * need to copy members explicitly to handle indirect vs + * embedded nodemasks + */ + new->policy = old->policy; + switch (policy_mode(old)) { + case MPOL_PREFERRED: + new->v.preferred_node = old->v.preferred_node; + break; + + case MPOL_BIND: + /*FALL THROUGH*/ + case MPOL_INTERLEAVE: + new_policy_nodemask(&new->v.nodes, + policy_nodemask_ref(&old->v.nodes)); + } + new_policy_nodemask(&new->cpuset_mems_allowed, + policy_nodemask_ref(&old->cpuset_mems_allowed)); atomic_set(&new->refcnt, 1); return new; } @@ -1441,7 +1533,9 @@ int __mpol_equal(struct mempolicy *a, st * works for MPOL_BIND 
as it shouldn't have MPOL_CONTEXT set */ return a->policy & MPOL_CONTEXT || - nodes_equal(a->v.nodes, b->v.nodes); + nodes_equal( + *policy_nodemask_ref(&a->v.nodes), + *policy_nodemask_ref(&b->v.nodes)); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; default: @@ -1455,6 +1549,8 @@ void __mpol_free(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; + free_policy_nodemask(&p->v.nodes); + free_policy_nodemask(&p->cpuset_mems_allowed); kmem_cache_free(policy_cache, p); } @@ -1646,7 +1742,8 @@ int mpol_set_shared_policy(struct shared pr_debug("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? nodes_addr(*policy_nodemask_ref(&npol->v.nodes))[0] : + -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1694,6 +1791,10 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + if (INDIRECT_POLICY_NODEMASK) + nodemask_cache = kmem_cache_create("nodemask", + sizeof(nodemask_t), + 0, SLAB_PANIC, NULL); /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or @@ -1738,7 +1839,7 @@ static void mpol_rebind_policy(struct me if (!pol) return; - mpolmask = &pol->cpuset_mems_allowed; + mpolmask = policy_nodemask_ref(&pol->cpuset_mems_allowed); if (nodes_equal(*mpolmask, *newmask)) return; @@ -1746,8 +1847,9 @@ static void mpol_rebind_policy(struct me case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); - pol->v.nodes = tmp; + nodes_remap(tmp, *policy_nodemask_ref(&pol->v.nodes), + *mpolmask, *newmask); + set_policy_nodemask(&pol->v.nodes, &tmp); *mpolmask = *newmask; current->il_next = node_remap(current->il_next, *mpolmask, *newmask); ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 15:00 ` Lee Schermerhorn @ 2007-10-29 17:33 ` Paul Jackson 2007-10-29 17:46 ` Lee Schermerhorn 2007-10-29 20:35 ` Christoph Lameter 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-29 17:33 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: clameter, rientjes, akpm, ak, linux-kernel, mel Lee wrote: > I only brought it up again because now you all are considering another > nodemask per policy. The patch David and I are discussing will replace the cpuset_mems_allowed nodemask in struct mempolicy, not add a new nodemask. In other words, the meaning and name of that existing nodemask will change, with no change in the overall structure size. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 17:33 ` Paul Jackson @ 2007-10-29 17:46 ` Lee Schermerhorn 0 siblings, 0 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 17:46 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, rientjes, akpm, ak, linux-kernel, mel On Mon, 2007-10-29 at 10:33 -0700, Paul Jackson wrote: > Lee wrote: > > I only brought it up again because now you all are considering another > > nodemask per policy. > > The patch David and I are discussing will replace the > cpuset_mems_allowed nodemask in struct mempolicy, not > add a new nodemask. In other words, the meaning and > name of that existing nodemask will change, with no > change in the overall structure size. Kool! Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 15:00 ` Lee Schermerhorn 2007-10-29 17:33 ` Paul Jackson @ 2007-10-29 20:35 ` Christoph Lameter 1 sibling, 0 replies; 98+ messages in thread From: Christoph Lameter @ 2007-10-29 20:35 UTC (permalink / raw) To: Lee Schermerhorn Cc: David Rientjes, Paul Jackson, akpm, ak, linux-kernel, Mel Gorman On Mon, 29 Oct 2007, Lee Schermerhorn wrote: > Note: I don't [didn't] think I need to ref count the nodemasks > associated with the mempolicies because they are allocated when the > mempolicy is and destroyed when the policy is--not shared. Just like > the custom zonelist for bind policy, and we have no ref count there. > I.e., they're protected by the mempol's ref. However, now that you > bring it up, I'm wondering about the effects of policy remapping, and > whether we have the reference counting or indirect protection [mmap_sem, > whatever] correct there in current code. I'll have to take a look. In that case we could just put the nodemask at the end of the mempolicy structure and then allocate the size needed? That way we would not need to deref an additional pointer? ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:13 ` Lee Schermerhorn 2007-10-26 21:17 ` Christoph Lameter @ 2007-10-26 21:18 ` David Rientjes 2007-10-26 21:31 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-26 21:18 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > You don't need to save the entire mask--just note that NODE_MASK_ALL was > passed--like with my internal MPOL_CONTEXT flag. This would involve > special casing NODE_MASK_ALL in the error checking, as currently > set_mempolicy() complains loudly if you pass non-allowed nodes--see > "contextualize_policy()". [mbind() on the other hand, appears to allow > any nodemask, even outside the cpuset. guess we catch this during > allocation.] This is pretty much the spirit of my patch w/o the API > change/extension [/improvement :)] > Not really, because perhaps your application doesn't want to interleave over all nodes. I suggested NODE_MASK_ALL as the way to get access to all the memory you are allowed, but it's certainly plausible that an application could request to interleave only over a subset. That's the entire reason set_mempolicy(MPOL_INTERLEAVE) takes a nodemask anyway right now instead of just using task->mems_allowed on each allocation. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:18 ` David Rientjes @ 2007-10-26 21:31 ` Lee Schermerhorn 2007-10-26 21:39 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 21:31 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Fri, 2007-10-26 at 14:18 -0700, David Rientjes wrote: > On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > You don't need to save the entire mask--just note that NODE_MASK_ALL was > > passed--like with my internal MPOL_CONTEXT flag. This would involve > > special casing NODE_MASK_ALL in the error checking, as currently > > set_mempolicy() complains loudly if you pass non-allowed nodes--see > > "contextualize_policy()". [mbind() on the other hand, appears to allow > > any nodemask, even outside the cpuset. guess we catch this during > > allocation.] This is pretty much the spirit of my patch w/o the API > > change/extension [/improvement :)] > > > > Not really, because perhaps your application doesn't want to interleave > over all nodes. I suggested NODE_MASK_ALL as the way to get access to all > the memory you are allowed, but it's certainly plausible that an > application could request to interleave only over a subset. That's the > entire reason set_mempolicy(MPOL_INTERLEAVE) takes a nodemask anyway right > now instead of just using task->mems_allowed on each allocation. So, you pass the subset, you don't set the flag to indicate you want interleaving over all available. You must be thinking of some other use for saving the subset mask that I'm not seeing here. Maybe restoring to the exact nodes requested if they're taken away and then re-added to the cpuset? Later, Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:31 ` Lee Schermerhorn @ 2007-10-26 21:39 ` David Rientjes 2007-10-27 1:07 ` Paul Jackson 2007-10-29 15:10 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 21:39 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > So, you pass the subset, you don't set the flag to indicate you want > interleaving over all available. You must be thinking of some other use > for saving the subset mask that I'm not seeing here. Maybe restoring to > the exact nodes requested if they're taken away and then re-added to the > cpuset? > Paul's motivation for saving the passed nodemask to set_mempolicy() is so that the _intent_ of the application is never lost. That's the biggest advantage that this method has and that I totally agree with. So whenever the mems_allowed of a cpuset changes, the MPOL_INTERLEAVE nodemask of all attached tasks becomes their intent (pol->passed_nodemask) AND'd with the new mems_allowed. That can be done on mpol_rebind_policy() and shouldn't be an extensive change. So MPOL_INTERLEAVE, and possibly other, mempolicies will always try to accomodate the intent of the application but only as far as the task's cpuset restriction allows them. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:39 ` David Rientjes @ 2007-10-27 1:07 ` Paul Jackson 2007-10-27 1:26 ` Christoph Lameter 2007-10-27 17:45 ` David Rientjes 2007-10-29 15:10 ` Lee Schermerhorn 1 sibling, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-27 1:07 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel Issue: Are the nodes and nodemasks passed into set_mempolicy() to be presumed relative to the cpuset or not? [Careful, this question doesn't mean what you might think it means.] Let's say our system has 100 nodes, numbered 0-99, and we have a task in a cpuset that includes the twenty nodes 10-29 at the moment. Currently, if that task does say an MPOL_PREFERRED on node 12, we take that to mean the 3rd node of its cpuset. If we move that task to a cpuset on nodes 40-59, the kernel will change that MPOL_PREFERRED to node 42. Similarly for the other MPOL_* policies. Ok so far ... seems reasonable. Node numbers passed into the set_mempolicy call are taken to be absolute node numbers that are to be mapped relative to the tasks current cpuset, perhaps unbeknownst to the calling task, and remapped if that cpuset changes. But now imagine that a task happens to be in a cpuset of just two nodes, and wants to request an MPOL_PREFERRED policy for the fourth node of its cpuset, anytime there actually is a fourth node. That task can't say that using numbering relative to its current cpuset, because that cpuset only has two nodes. It could say it relative to a mask of all possible nodes by asking for the fourth possible node, likely numbered node 3. If that task happened to be in a cpuset on nodes 10 and 11, asking for the fourth node in the system (node 3) would still be rather unambiguous, as node 3 can't be either of 10 or 11, so must be relative to all possible nodes, meaning "the fourth available node, if I'm ever fortunate enough to have that many nodes." 
But if that task happened to be in a cpuset on nodes 2 and 3, then the node number 3 could mean: Choice A: as it does today, the second node in the tasks cpuset or it could mean Choice B: the fourth node in the cpuset, if available, just as it did in the case above involving a cpuset on nodes 10 and 11. Let me restate this. Either way, passing in node 3 means node 3, as numbered in the system. But the question is whether (Choice A) node 3 is specified because it is the second node in the tasks cpuset, or (Choice B) because it is the fourth node in the system. Choice A is what we do now. But if we stay with Choice A, then a task stuck in a small cpuset at the moment can't express non-trivial mempolicy's for larger cpusets that it might be in later. Choice B lets the task calculate its mempolicy mask as if it owned the entire system, and express whatever elaborate mempolicy placement it might need, when blessed with enough memory nodes to matter. The system would automatically scrunch that request down to whatever is the current size and placement of the cpuset holding that task. Given a clean slate, I prefer Choice B. But Choice B is incompatible. Switching now would break tasks that had been carefully adapting their set_mempolicy requests to whatever nodes were in their current cpuset. This is probably too incompatible to be acceptable. Therefore it must be Choice A. However ... If I approach this from another angle, I can show it should be Choice B. Fasten your seatbelt ... Before the days of cpusets, Choice B was essentially how it was. Tasks computing memory policies for set_mempolicy() calls computed node numbers as if they owned the entire system. Essentially, cpusets introduced an incompatibility, imposing Choice A instead. If a task wants to name the fourth node allowed to it in a memory policy, it can no longer just say node "3", but now has to determine its cpuset, and count off the fourth node that is currently allowed to it. 
This is an inherently racey calculation, of the sort that some of us would find unacceptable, because it can't cope very easily with simultaneously changing cpusets. My hunch (unsupported by any real evidence or experience) is that there is very little user level code that actually depends on this incompatible change imposed by cpusets. I'm guessing that most codes making heavy use of memory policies are still coded as if the task owns the system, and would be ill prepared to cope with a heavy cpuset environment. If that's the case, we'd break less user code by going with Choice B. I have a little bit of code that will notice the difference (so if we go with Choice B, there has to be a way for user level code that cares to probe which choice applies), but I'm not a major user of mempolicy calls. I'll have to rely on the experience of others more involved with memory policy aware user code as to which Choice would be less disruptive. Recommendations? -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 1:07 ` Paul Jackson @ 2007-10-27 1:26 ` Christoph Lameter 2007-10-27 2:41 ` Paul Jackson 2007-10-27 17:45 ` David Rientjes 1 sibling, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-27 1:26 UTC (permalink / raw) To: Paul Jackson; +Cc: David Rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > Choice B lets the task calculate its mempolicy mask as if it owned > the entire system, and express whatever elaborate mempolicy placement > it might need, when blessed with enough memory nodes to matter. > The system would automatically scrunch that request down to whatever > is the current size and placement of the cpuset holding that task. > > Given a clean slate, I prefer Choice B. Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to enable that functionality? A new version of numactl can then enable that by default for newer applications. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 1:26 ` Christoph Lameter @ 2007-10-27 2:41 ` Paul Jackson 2007-10-27 2:50 ` Christoph Lameter 2007-10-27 17:50 ` David Rientjes 0 siblings, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-27 2:41 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel Christoph wrote: > Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to > enable that functionality? A new version of numactl can then enable > that by default for newer applications. I'm confused. If B is the default, then we don't need a flag to enable it, rather we need a flag to go back to the old choice A. So are you saying that: 1) Choice A remains the default for the kernel unless MPOL_MF_RELATIVE is added, or 2) that the new default for the kernel is Choice B, unless MPOL_MF_RELATIVE is specified, asking to revert to the original Choice A behaviour? Perhaps, either way, whatever compatibility flag we have should be something that can be forced on an application from the outside, perhaps as a per-system mode flag in /sys, or a per-cpuset mode flag, or a per-task operation, by what mechanism is not clear. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 2:41 ` Paul Jackson @ 2007-10-27 2:50 ` Christoph Lameter 2007-10-27 5:16 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-27 2:50 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > Christoph wrote: > > Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to > > enable that functionality? A new version of numactl can then enable > > that by default for newer applications. > > I'm confused. If B is the default, then we don't need a flag to > enable it, rather we need a flag to go back to the old choice A. Don't we need it for numactl to preserve backward compatibility? numactl can set that flag by default for newer software. We likely need a new major release of numactl. > Perhaps, either way, whatever compatibility flag we have should be > something that can be forced on an application from the outside, > perhaps as a per-system mode flag in /sys, or a per-cpuset mode flag, > or a per-task operation, by what mechanism is not clear. libnuma can take care of that. But we need to have that flag for numactl to be backward compatible. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 2:50 ` Christoph Lameter @ 2007-10-27 5:16 ` Paul Jackson 2007-10-27 6:07 ` Christoph Lameter 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-27 5:16 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel I'm still confused, Christoph. Are you saying: 1) The kernel continues to default to Choice A, unless the flag enables Choice B, or 2) The kernel defaults to the new Choice B, unless the flag reverts to the old Choice A? Alternative (2) breaks libnuma and hence numactl until it is changed to use the flag, or changed to use choice B (in which case it wouldn't need the flag.) So I guess you mean alternative (1) above, since you seem to be taking the position that we can't break compatibility here. But I could quote statements from you that seem to clearly state the exact opposite. So I remain confused. Actually, alternative (1) is kinda ugly. It leaves a permanent wart on the set_mempolicy API -- two different variants to what the node numbers and node masks mean, depending on whether this MPOL_MF_RELATIVE is set on each call. We'll have to ship out an extra serving of brain food for most folks looking at this to have much chance that they will confidently understand the difference between the two options selected by this flag. I wonder if there might be some way to avoid that permanent ugly wart on each and every set/get mempolicy system call forever afterward. Please try to double check your next reply, Christoph. I'm beginning to worry that we might be failing to communicate clearly. Thanks. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 5:16 ` Paul Jackson @ 2007-10-27 6:07 ` Christoph Lameter 2007-10-27 8:36 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-27 6:07 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > Are you saying: > 1) The kernel continues to default to Choice A, unless > the flag enables Choice B, or > 2) The kernel defaults to the new Choice B, unless the > flag reverts to the old Choice A? If 2) is keeping the API semantics then 2. > Alternative (2) breaks libnuma and hence numactl until it is changed > to use the flag, or changed to use choice B (in which case it wouldn't > need the flag.) 2) keeps everything in order. Let everything be as it is today unless numactl sets the new. > So I guess you mean alternative (1) above, since you seem to be taking > the position that we can't break compatibility here. I am getting confused as to which alternative means what. > Actually, alternative (1) is kinda ugly. It leaves a permanent wart > on the set_mempolicy API -- two different variants to what the node > numbers and node masks mean, depending on whether this MPOL_MF_RELATIVE > is set on each call. We'll have to ship out an extra serving of brain > food for most folks looking at this to have much chance that they will > confidently understand the difference between the two options selected > by this flag. Tough. The API needs to remain stable. We can only change it through an additional flag that enables the relativeness and the folding the way you want it. libnuma may set the flag on its own without the user having to do anything. > I wonder if there might be some way to avoid that permanent ugly wart > on each and every set/get mempolicy system call forever afterward. Hmmm.. The alternative is to add new set/get mempolicy functions. 
^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 6:07 ` Christoph Lameter @ 2007-10-27 8:36 ` Paul Jackson 2007-10-27 17:47 ` Christoph Lameter 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-27 8:36 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel > > Are you saying: > > 1) The kernel continues to default to Choice A, unless > > the flag enables Choice B, or > > 2) The kernel defaults to the new Choice B, unless the > > flag reverts to the old Choice A? > > If 2) is keeping the API semantics then 2. No .. (1) keeps the same API semantics. > Let everything be as it is today unless > numactl sets the new. > ... > Tough. The API needs to remain stable. Good - that I understand. Your position is clear now. You have chosen (1) above, which keeps Choice A as the default. Before I leave this part, there is one more thing I kinda really need, if you could, Christoph. Could you describe in your own words what you think Choices A and B mean? We seem to be having trouble communicating, and hence there is some risk right now that we don't mean the same thing by this new "Choice B". === Now ... onto the matter of permanent API warts: > > I wonder if there might be some way to avoid that permanent ugly wart > > on each and every set/get mempolicy system call forever afterward. > > Hmmm.. The alternative is to add new set/get mempolicy functions. Other alternatives include a per-system, per-cpuset or per-process flag, in addition to the per-system call flag you suggested earlier (MPOL_MF_RELATIVE), or whatever you mean by "new set/get mempolicy functions" ... could you elaborate on that one? So ... the question becomes this: How do we migrate to Choice B, without leaving both Choices permanently supported, and an ugly mode flag selecting the non-default Choice, while not breaking API's too abruptly? Thanks. -- I won't rest till it's the best ... 
Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 8:36 ` Paul Jackson @ 2007-10-27 17:47 ` Christoph Lameter 2007-10-27 20:59 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: Christoph Lameter @ 2007-10-27 17:47 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel On Sat, 27 Oct 2007, Paul Jackson wrote: > > Tough. The API needs to remain stable. > > Good - that I understand. Your position is clear now. > > You have chosen (1) above, which keeps Choice A as the default. There can be different defaults for the user space API via libnuma that are independent from the kernel API which needs to remain stable. The kernel API can be extended but not changed. > > Hmmm.. The alternative is to add new set/get mempolicy functions. > > Other alternatives include a per-system, per-cpuset or per-process > flag, in addition to the per-system call flag you suggested earlier > (MPOL_MF_RELATIVE), or whatever you mean by "new set/get mempolicy > functions" ... could you elaborate on that one? None of those sound appealing. Multiple processes may run in one cpuset. Some of those may be linked to older libnumas and therefore depend on old behavior. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 17:47 ` Christoph Lameter @ 2007-10-27 20:59 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-27 20:59 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, akpm, ak, linux-kernel > > You have chosen (1) above, which keeps Choice A as the default. > > There can be different defaults for the user space API via libnuma that > are independent from the kernel API which needs to remain stable. The kernel > API can be extended but not changed. Yes - the user level code can have different defaults too. I was discussing what should be the default kernel API. > None of those [alternatives] sound appealing. Multiple processes may run > in one cpuset. Well, that would justify keeping this choice per-task. I tend to agree with that. But that doesn't justify having to specify it on each system call. In another reply David recommends against supporting Choice A at all. I'm inclined to agree with him. I'll reply there, with more thoughts. But if we did support Choice A, as a backwards compatible alternative to Choice B, I'd suggest a per-task mode, not per-system call mode. This would reduce the impact on the API of the ugly, unobvious, modal flag needed to select the optional, non kernel default, Choice B semantics. I still have low confidence that you (Christoph) and I have the same understanding of what these Choice A and B are. Hopefully you can address that, perhaps by briefly describing these choices in your words. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 2:41 ` Paul Jackson 2007-10-27 2:50 ` Christoph Lameter @ 2007-10-27 17:50 ` David Rientjes 2007-10-27 23:19 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-27 17:50 UTC (permalink / raw) To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > > Yes. We should default to Choice B. Add an option MPOL_MF_RELATIVE to > > enable that functionality? A new version of numactl can then enable > > that by default for newer applications. > > I'm confused. If B is the default, then we don't need a flag to > enable it, rather we need a flag to go back to the old choice A. > I think there's a mixup in the flag name there, but I actually would recommend against any flag to effect Choice A. It's simply going to be too complex to describe and is going to be a headache to code and support. The MPOL_PREFERRED behavior when constrained by cpusets was previously, to my knowledge, undocumented; you're in the position to make the behavior do what you want it to do and then release documentation so we'll finally have a complete and unambiguous API for it. Right now it should be considered undefined and thus you are free to implement it as you choose. Then all callers of set_mempolicy(MPOL_PREFERRED) will standardize on that and not have to worry about the machine's mpol_preferred_relative_to_cpuset setting. Then, any task that is attached to a cpuset and expecting the fourth node in their set_mempolicy(MPOL_PREFERRED) call to mean system node 3 if it's in the cpuset's mems_allowed will be broken. If you want that, you'll need your task to be attached to a cpuset with at least mems 0-3; programmers will pick that up quickly enough if it's clearly documented. 
I think Choice B is correct and makes more sense in terms of the semantics and at least allows mempolicies and cpusets to play nicely together without a bidirectional dependency on one another. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 17:50 ` David Rientjes @ 2007-10-27 23:19 ` Paul Jackson 2007-10-28 18:19 ` David Rientjes 2007-10-29 16:54 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-27 23:19 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > I think there's a mixup in the flag name [MPOL_MF_RELATIVE] there Most likely. The discussion involving that flag name was kinda mixed up ;). > but I actually would recommend against any flag to effect Choice A. > It's simply going to be too complex to describe and is going to be a > headache to code and support. While I am sorely tempted to agree entirely with this, I suspect that Christoph has a point when he cautions against breaking this kernel API. Especially for users of the set/get mempolicy calls coming in via libnuma, we have to be very careful not to break the current behaviour, whether it is documented API or just an accident of the implementation. There is a fairly deep and important stack of software, involving a well known DBMS product whose name begins with 'O', sitting on that libnuma software stack. Steering that solution stack is like steering a giant oil tanker near shore. You take it slow and easy, and listen closely to the advice of the ancient harbor master. The harbor masters in this case are or were Andi Kleen and Christoph Lameter. > It's simply going to be too complex to describe and is going to be a > headache to code and support. True, which is why I am hoping we can keep this modal flag, if such be, from having to be used on every set/get mempolicy call. The ordinary coder of new code using these calls directly should just see Choice B behaviour. 
However the user of libnuma should continue to see whatever API libnuma supports, with no change whatsoever, and various versions of libnuma, including those already shipped years ago, must continue to behave without any changes in node numbering. There are two decent looking ways (and some ugly ways) that I can see to accomplish this: 1) One could claim that no important use of Oracle over libnuma over these memory policy calls is happening on a system using cpusets. There would be a fair bit of circumstantial evidence for this claim, but I don't know it for a fact, and would not be the expert to determine this. On systems making no use of cpusets, these two Choices A and B are identical, and this is a non-issue. Those systems will see no API changes whatsoever from any of this. 2) We have a per-task mode flag selecting whether Choice A or B node numbering apply to the masks passed in to set_mempolicy. The kernel implementation is fairly easy. (Yeah, I know, I too cringe every time I read that line ;) If Choice A is active, then we continue to enforce the current check, in mm/mempolicy.c: contextualize_policy(), that the passed in mask be a subset of the current cpusets allowed memory nodes, and we add a call to nodes_remap(), to map the passed in nodemask from cpuset centric to system centric (if they asked for the fourth node in their current cpuset, they get the fourth node in the entire system.) Similarly, for masks passed back by get_mempolicy, if Choice A is active, we use nodes_remap() to convert the mask from system centric back to cpuset centric. There is a subtle change in the kernel API's here: In current kernels, which are Choice A, if a task is moved from a big cpuset (many nodes) to a small cpuset and then -back- to a big cpuset, the nodemasks returned by get_mempolicy will still show the smaller masks (fewer set nodes) imposed by the smaller cpuset. 
In today's kernels, once scrunched or folded down, the masks don't recover their original size after the task is moved back to a large cpuset. With this change, even a task asking for Choice A would, once back on a larger cpuset, again see the larger masks from get_mempolicy queries. This is a change in the kernel API's visible to user space; but I really do not think that there is sufficient use of Oracle over libnuma on systems actively moving tasks between differing size cpusets for this to be a problem. Indeed, if there was much such usage, I suspect they'd be complaining that the current kernel API was borked, and they'd be filing a request for enhancement -asking- for just this subtle change in the kernel API's here. In other words, this subtle API change is a feature, not a bug ;) The bulk of the kernel's mempolicy code is coded for Choice B. If Choice B is active, we don't enforce the subset check in contextualize_policy(), and we don't invoke nodes_remap() in either of the set or get mempolicy code paths. A new option to get_mempolicy() would query the current state of this mode flag, and a new option to set_mempolicy() would set and clear this mode flag. Perhaps Christoph had this in mind when he wrote in an earlier message "The alternative is to add new set/get mempolicy functions." The default kernel API for each task would be Choice B (!). However, in deference to the needs of libnuma, if the following call was made, this would change the mode for that task to Choice A: get_mempolicy(NULL, NULL, 0, 0, 0); This last detail above is an admitted hack. *So far as I know* it happens that all current infield versions of libnuma issue the above call, as their first mempolicy query, to determine whether the active kernel supports mempolicy. The mode for each task would be inherited across fork, and reset to Choice (B) on exec. 
If we determine that we must go with a new flag bit to be passed in on each and every get and set mempolicy call that wants Choice B node numbering rather than Choice A, then I will need (1) a bottle of rum, and (2) a credible plan in place to phase out this abomination ;). There would be just way too many coding errors and coder frustrations introduced by requiring such a flag on each and every mempolicy system call that wants the alternative numbering. There must be some international treaty that prohibits landmines capable of blowing ones foot off that would apply here. There are two major user level libraries sitting on top of this API, libnuma and libcpuset. Libnuma is well known; it was written by Andi Kleen. I wrote libcpuset, and while it is LGPL licensed, it has not been publicized very well yet. I can speak for libcpuset: it could adapt to the above proposal, in particular to the details in way (2), just fine. Old versions of libcpuset running on new kernels will have a little bit of subtle breakage, but not in areas that I expect will cause much grief. Someone more familiar with libnuma than I would have to examine the above proposal in way (2) to be sure that we weren't throwing libnuma some curveball that was unnecessarily troublesome. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 23:19 ` Paul Jackson @ 2007-10-28 18:19 ` David Rientjes 2007-10-28 23:46 ` Paul Jackson 2007-10-29 16:54 ` Lee Schermerhorn 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-28 18:19 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Sat, 27 Oct 2007, Paul Jackson wrote: > > but I actually would recommend against any flag to effect Choice A. > > It's simply going to be too complex to describe and is going to be a > > headache to code and support. > > While I am sorely tempted to agree entirely with this, I suspect that > Christoph has a point when he cautions against breaking this kernel API. > > Especially for users of the set/get mempolicy calls coming in via > libnuma, we have to be very careful not to break the current behaviour, > whether it is documented API or just an accident of the implementation. > From a standpoint of the MPOL_PREFERRED memory policy itself, there is no documented behavior or standard that specifies its interaction with cpusets. Thus, it's "undefined." We are completely free to implement an undefined behavior as we choose and change it as Linux matures. Once it is defined, however, we carry the burden of protecting applications that are written on that definition. That's the point where we need to get it right and if we don't, we're stuck with it forever; I don't believe we're at that point with MPOL_PREFERRED policies under cpusets right now. > There is a fairly deep and important stack of software, involving a > well known DBMS product whose name begins with 'O', sitting on that > libnuma software stack. Steering that solution stack is like steering > a giant oil tanker near shore. You take it slow and easy, and listen > closely to the advice of the ancient harbor master. The harbor masters > in this case are or were Andi Kleen and Christoph Lameter. 
> Ok, let's take a look at some specific unproprietary examples of tasks that use set_mempolicy(MPOL_PREFERRED) for a specific node, intending it to be the actual system node offset, that is then assigned to a cpuset that doesn't require that offset to be allowed. I think it's going to become pretty difficult to find an example because the whole scenario is pretty lame: you would need to already know which nodes you're going to be assigned to in the cpuset to ask for one of them as your preferred node. I don't imagine any application can have that type of foresight and, if it does, then we certainly shouldn't support the preferred node_remap() when it changes mems. You're trying to support a scheme, in Choice A, where an application knows it's going to be assigned to a range of nodes (for example, 1-3) and wants the preferred node to be included (for example, 2). So now the application must have control over both its memory policy and its cpuset placement. Then it must be willing to change its cpuset placement to a different set of nodes (with equal or greater cardinality) and have the preferred node offset respected. Why can't it simply then issue another set_mempolicy(MPOL_PREFERRED) call for the new preferred node? See? The problem is that you're trying to protect applications that know its initial cpuset mems [the only way it could ever send a set_mempolicy(MPOL_PREFERRED) for the right node in that range in the first place] but then seemingly loses control over its cpuset and intends for the kernel to fix it up for it without having the burden of issuing another set_mempolicy() call. And you're trying to protect this application that based this implementation not on a standard or documentation, but on its observed behavior. My bet is that it's going to issue that subsequent set_mempolicy(), at least if libnuma returned a numa_preferred() value that it wasn't expecting. 
> True, which is why I am hoping we can keep this modal flag, if such be, > from having to be used on every set/get mempolicy call. The ordinary > coder of new code using these calls directly should just see Choice B > behaviour. However the user of libnuma should continue to see whatever > API libnuma supports, with no change whatsoever, and various versions of > libnuma, including those already shipped years ago, must continue to > behave without any changes in node numbering. > I don't see how you can accomplish that. If the default behavior is Choice B, which is different from what is currently implemented in the kernel, you're going to either require a modification to the application to set a flag asking for Choice A again or make the default kernel behavior that of Choice A and set a flag implicitly via libnuma when future versions are released. In the former case, just ask the application to adjust its node numbering scheme or check the result of numa_preferred(). In the latter case, we're not even talking about changing the kernel default anymore to Choice B. > 2) We have a per-task mode flag selecting whether Choice A or B > node numbering apply to the masks passed in to set_mempolicy. > > The kernel implementation is fairly easy. (Yeah, I know, I > too cringe everytime I read that line ;) > If you add this per-task mode flag to default to Choice A for preferred memory policies, it'll be extremely confusing to document and support. If it's already decided that we should default to Choice B, it's going to require an update to the application to write to /proc/pid/i_want_choice_A or use the new set_mempolicy() option anyway, so instead of adding that hack you should simply fix your node numbering. And I suspect that if that per-task mode flag is added, it will eventually be the subject of a thread with the subject "is this highly specialized flag even used anymore?" at which point it will be marked deprecated and eventually obsoleted. 
> The bulk of the kernel's mempolicy code is coded for Choice B. > > If Choice B is active, we don't enforce the subset check in > contextualize_policy(), and we don't invoke nodes_remap() in either > of the set or get mempolicy code paths. > Yeah, remapping the nodemask is a bad idea anyway to get a preferred node. Preferred nodes inherently deal with offsets from node 0 anyway. > A new option to get_mempolicy() would query the current state of > this mode flag, and a new option to set_mempolicy() would set > and clear this mode flag. Perhaps Christoph had this in mind > when he wrote in an earlier message "The alternative is to add > new set/get mempolicy functions." > That still requires a change to the application. So they should simply rethink their node numbering instead and fix their application to follow a behavior that will, at that point, be documented. Any application that doesn't respect the return value of set_mempolicy(MPOL_PREFERRED) node isn't worth supporting anyway. There's two cases to think about: - When the cpuset assignment changes from the root cpuset to a user-created cpuset with a subset of system mems and then set_mempolicy() is called, and - When set_mempolicy() is called and then the cpuset mems change either because it was attached to a different cpuset or someone wrote to its 'mems' file. In the first case, the new API should return -EINVAL if you ask for a preferred node offset that is smaller than the cardinality of your mems_allowed. That will catch some of these applications that may have actually been implemented based on the current undocumented behavior. In the second case, the first node in the nodemask passed to set_mempolicy() was a system node offset anyway and had nothing to do with cpusets (it was a member of the root cpuset with access to all mems) so it already behaves as Choice B. > There are two major user level libraries sitting on top of this API, > libnuma and libcpuset. 
Libnuma is well known; it was written by Andi > Kleen. I wrote libcpuset, and while it is LGPL licensed, it has not > been publicized very well yet. I can speak for libcpuset: it could > adapt to the above proposal, in particular to the details in way (2), > just fine. Old versions of libcpuset running on new kernels will > have a little bit of subtle breakage, but not in areas that I expect > will cause much grief. Someone more familiar with libnuma than I would > have to examine the above proposal in way (2) to be sure that we weren't > throwing libnuma some curveball that was unnecessarily troublesome. > I think any application that gets constrained to a subset of nodes in its mems_allowed and then bases its preferred node number off that subset to create an offset that is intended to be preserved over subsequent mems changes without rechecking the result with numa_preferred() or issuing a subsequent set_mempolicy() is poorly written. Especially since that behavior was undocumented. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-28 18:19 ` David Rientjes @ 2007-10-28 23:46 ` Paul Jackson 2007-10-29 1:04 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-28 23:46 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > From a standpoint of the MPOL_PREFERRED memory policy itself, there > is no documented behavior or standard that specifies its interaction > with cpusets. Thus, it's "undefined." We are completely free > to implement an undefined behavior as we choose and change it as > Linux matures. You state this point clearly, but I have to disagree. The Linux documentation is not a legal contract. Anytime we change the actual behaviour of the code, we have to ask ourselves what will be the impact of that change on existing users and usages. The burden is on us to minimize breaking things (by that I mean, what users would consider breakage, even if we think it is all for the better and that their code was the real problem.) I didn't say no breakage, but minimum breakage, doing our best to guide users through changes with minimum disruption to their work. Linux is gaining market share rapidly because we co-operate with our users to give us both the best chance of succeeding. We don't just play gotcha games with the documentation -- ha ha -- we didn't document that detail, so it's your fault for ever depending on it. And besides your code sucks. So there! Let's leave that game for others. > And you're trying to protect this application that based this > implementation not on a standard or documentation, but on its observed > behavior. Yes, if there is such an application, I'm trying to protect it. > My bet is that it's going to issue that subsequent > set_mempolicy(), at least if libnuma returned a numa_preferred() value > that it wasn't expecting. Perhaps. Perhaps not. I don't know. 
> That still requires a change to the application. If that were so, then yes much of your subsequent reasoning would follow. However, let me repeat the following from my previous message: > However, in deference to the needs of libnuma, if the following > call was made, this would change the mode for that task to > Choice A: > > get_mempolicy(NULL, NULL, 0, 0, 0); > > This last detail above is an admitted hack. *So far as I know* > it happens that all current infield versions of libnuma issue the > above call, as their first mempolicy query, to detemine whether > the active kernel supports mempolicy. The above is the hack that allows us to support existing libnuma based applications (the most significant users of memory policy historically) with a default of Choice A, while other code and future code defaults to Choice B. > See? The problem is that you're trying to protect applications that know > its initial cpuset mems [the only way it could ever send a > set_mempolicy(MPOL_PREFERRED) for the right node in that range in the > first place] but then seemingly loses control over its cpuset and intends > for the kernel to fix it up for it without having the burden of issuing > another set_mempolicy() call. That's not the only sort of application I'm trying to protect. I'm trying to protect almost any application that uses both set_mempolicy or mbind, while in a cpuset. If a task is in a cpuset on say nodes 16-23, and it wants to issue any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE mempolicy call, then under Choice A it must issue nodemasks offset by 16, relative to what it would issue under Choice B. Almost any task using memory policies on a system making active use of cpusets will be affected, even well written ones doing simple things. I am more concerned that the above hack for libnuma isn't enough, rather than it is unnecessary. 
I think the above hack covers existing libnuma users rather well, though I could be wrong even here, as I don't actually work with most of the existing libnuma users. And I can cover those using both memory policies and cpusets via libcpuset, as there I am the expert and know that I can guide libcpuset and its users through this change. There will be some breakage, but I know how to manage it. However, if anyone is deploying product or has important (to them) but not easy to change software using both memory policies and cpusets that are not doing so via libnuma or libcpuset, then any change that changes the default for their memory policy calls from Choice A to Choice B will probably break them. Perhaps there are no significant cases like this, using memory policies on cpuset managed systems, but not via libnuma or libcpuset. It might be that the only way to smoke out such cases is to ship the change (to Choice B default) and see what squawks. That kinda sucks. I'm tempted to think I need to go a bit further: 1) Add a per-system or per-cpuset runtime mode, to enable a system administrator to revert to the Choice A default. 2) Make sure that both the new libnuma and libcpuset versions dynamically probe the state of this default, and dynamically adapt to running on a kernel, or in a cpuset, of either default. If this mode is per-cpuset, then so long as there are not two applications that must both run in the same cpuset, both using memory policies, neither using libnuma or libcpuset, requiring conflicting defaults (one Choice A, the other Choice B) then this would provide an administrator sufficient flexibility to adapt. There would be one key advantage to a per-cpuset mode flag here. It exposes in a fairly visible way this change in the default numbering of memory policy node masks, from cpuset relative to system relative (with the system now automatically making them cpuset relative across any changes in the number of nodes in the cpuset.) 
This is a classic way to guide users to understanding new alternatives and changes; expose it as a 'button on the dashboard', a feature, not a hidden gotcha. It's the old "it's a feature, not a bug" approach. It can be a useful mechanism to educate and empower customers. Users end up seeing it, learning about it, enjoying some sense of control, and usually being able to make it work for them, one way or the other. That works out better than just hitting some random subset of users with subtle bugs (from their perspective) over which they have little or no control and fewer clues. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-28 23:46 ` Paul Jackson @ 2007-10-29 1:04 ` David Rientjes 2007-10-29 4:27 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-29 1:04 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Sun, 28 Oct 2007, Paul Jackson wrote: > The Linux documentation is not a legal contract. Anytime we change the > actual behaviour of the code, we have to ask ourselves what will be the > impact of that change on existing users and usages. The burden is on > us to minimize breaking things (by that I mean, what users would > consider breakage, even if we think it is all for the better and that > their code was the real problem.) I didn't say no breakage, but > minimum breakage, doing our best to guide users through changes with > minimum disruption to their work. > Nobody can show an example of an application that would be broken because of this and, given the scenario and sequence of events that it requires to be broken when implementing the default as Choice B, I don't think it's as much of an issue as you believe. > > However, in deference to the needs of libnuma, if the following > > call was made, this would change the mode for that task to > > Choice A: > > > > get_mempolicy(NULL, NULL, 0, 0, 0); > > > > This last detail above is an admitted hack. *So far as I know* > > it happens that all current infield versions of libnuma issue the > > above call, as their first mempolicy query, to detemine whether > > the active kernel supports mempolicy. > > The above is the hack that allows us to support existing libnuma based > applications (the most significant users of memory policy historically) > with a default of Choice A, while other code and future code defaults > to Choice B. 
> So all applications that use the libnuma interface and numactl will have different default behavior than those that simply issue {get,set}_mempolicy() calls. libnuma is a collection of higher level functions that should be built upon {get,set}_mempolicy() like they currently are and not introduce new subtleties like changing the semantics of a preferred node argument. This is going to quickly become a documentation nightmare and, in my opinion, isn't worth the time or effort to support because we haven't even identified any real-world examples. Maybe Andi Kleen should weigh in on this topic because, if we go with what you're suggesting, we'll never get rid of the two differing behaviors and we'll be introducing different semantics to arguments of libnuma functions than the kernel API they are built upon. > I'm trying to protect almost any application that uses both > set_mempolicy or mbind, while in a cpuset. > > If a task is in a cpuset on say nodes 16-23, and it wants to issue > any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE > mempolicy call, then under Choice A it must issue nodemasks offset > by 16, relative to what it would issue under Choice B. > > True, but the ordering of that scenario is troublesome. The correct way to implement it is to use set_mempolicy() or a higher level libnuma function with the same semantics and _then_ attach the task to a cpuset. Then the nodes_remap() takes care of the rest. The scenario you describe above has a problem because it requires the task to have knowledge of the cpuset's mems in which it is attached when, for portability, it should have been written so that it is robust to any range of nodes you happen to assign it to. > Almost any task using memory policies on a system making active use of > cpusets will be affected, even well written ones doing simple things. 
> No, because nodes_remap() takes care of the instances you describe above when the task sets its memory policy (usually done when it is started) and is then attached to a cpuset. > However, if anyone is deploying product or has important (to them) but > not easy to change software using both memory policies and cpusets that > are not doing so via libnuma or libcpuset, then any change that > changes the default for their memory policy calls from Choice A to > Choice B will probably break them. > Supporting two different behaviors is going to be more problematic than simply selecting one and going with it and its associated documentation in future versions of the kernel. > I'm tempted to think I need to go a bit further: > 1) Add a per-system or per-cpuset runtime mode, to enable a system > administrator to revert to the Choice A default. Paul, the changes required to an application that is currently using {get,set}_mempolicy() calls to setup the memory policy or the higher level functions through libnuma is so easy to use Choice B as a default instead of Choice A that it would be ridiculous to support configuring it on a per-system or per-cpuset basis. > 2) Make sure that both the new libnuma and libcpuset versions dynamically > probe the state of this default, and dynamically adapt to running > on a kernel, or in a cpuset, of either default. If this mode is > per-cpuset, then so long as there are not two applications that > must both run in the same cpuset, both using memory policies, > neither using libnuma or libcpuset, requiring conflicting defaults > (one Choice A, the other Choice B) then this would provide an > administrator sufficient flexibility to adapt. > Choosing only one behavior for the kernel (Choice B) is by far the superior selection because then any task can share a cpuset with any other task and implement its memory policy preferences in terms of low level system calls, numactl, or libnuma. 
That's the power that we should be giving users, not the addition of hacks or more configuration knobs that is going to clutter and confuse anybody who wants to simply pick a preferred node. > There would be one key advantage to a per-cpuset mode flag here. It > exposes in a fairly visible way this change in the default numbering of > memory policy node masks, from cpuset relative to system relative (with > the system now automatically making them cpuset relative across any > changes in the number of nodes in the cpuset.) This is a classic way > to guide users to understanding new alternatives and changes; expose it > as a 'button on the dashboard', a feature, not a hidden gotcha. > Yet the 'mems' file would still be system-wide; otherwise it would be impossible to expand the memory your cpuset has access to. Everything else would be relative to 'mems'. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 1:04 ` David Rientjes @ 2007-10-29 4:27 ` Paul Jackson 2007-10-29 4:47 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-29 4:27 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel > Nobody can show an example of an application that would be broken because > of this and, given the scenario and sequence of events that it requires to > be broken when implementing the default as Choice B, I don't think it's as > much of an issue as you believe. Well, neither you nor I have shown an example. That's different than "nobody can." Since it would affect any task setting memory policies while in a cpuset holding less than all memory nodes, it seems potentially serious to me. Actually, I have one example. The libcpuset library would have some breakage with Choice B the only Choice. But I'm in a position to deal with that, so it's not a big deal. > So all applications that use the libnuma interface and numactl will have > different default behavior than those that simply issue > {get,set}_mempolicy() calls. Breaking the libnuma-Oracle solution stack is not an option. And, unless someone in the know tells us otherwise, I have to assume that this could break them. Now, the odds are that they simply don't run that solution stack on any system making active use of cpusets, so the odds are this would be no problem for them. But I don't presently have enough knowledge of their situation to take that risk. > if we go with what you're suggesting, we'll never get rid of the two > differing behaviors and we'll be introducing different semantics > to arguments of libnuma functions than the kernel API they are > built upon. We could get rid of Choice A once libnuma and libcpuset have adapted to Choice B, and any other uses of Choice A that we've subsequently identified have had sufficient time to adapt. 
But dual support is pretty easy so far as the kernel code is concerned. It's just a few nodes_remap() calls optionally invoked at a few key spots in mm/mempolicy.c. Consequently there won't be a big hurry to remove Choice A. > > I'm trying to protect almost any application that uses both > > set_mempolicy or mbind, while in a cpuset. > > > > If a task is in a cpuset on say nodes 16-23, and it wants to issue > > any mbind, or any MPOL_PREFERRED, MPOL_BIND, or MPOL_INTERLEAVE > > mempolicy call, then under Choice A it must issue nodemasks offset > > by 16, relative to what it would issue under Choice B. > > > > True, but the ordering of that scenario is troublesome. The correct way > to implement it is to use set_mempolicy() or a higher level libnuma > function with the same semantics and _then_ attach the task to a cpuset. > Then the nodes_remap() takes care of the rest. There is no "_then_ attach the task to a cpuset." On systems with kernels configured with CONFIG_CPUSETS=y, all tasks are in a cpuset all the time. Moreover, from a practical point of view, on large systems managed with cpuset based mechanisms, almost all tasks are in cpusets that do not include all nodes, for the entire life of the task. And besides, I can't break existing applications willy-nilly, and then claim it's their fault, because they should have been coded differently. So "correct way" arguments don't hold a lot of weight for already released and deployed product. > Paul, the changes required to an application that is currently using > {get,set}_mempolicy() calls to setup the memory policy or the higher level > functions through libnuma is so easy to use Choice B as a default instead > of Choice A that it would be ridiculous to support configuring it on a > per-system or per-cpuset basis. David ;) I make some effort to avoid forcing applications to be recoded and rebuilt in order to continue functioning. 
> Yet the 'mems' file would still be system-wide; otherwise it would be > impossible to expand the memory your cpuset has access to. I had to read that a couple of times to make sense of it. I take that it means that the node numbering used in each cpuset's 'mems' file has to be system-wide. Yes, agreed. (Well, actually, the node numbering of each cpusets 'mems' file could be relative to its parent cpusets 'mem' numbers, but let's not go there, as this discussion is already sufficiently complicated ;) > Everything else would be relative to 'mems'. That's what Choice B states, yes. Though to be clear, time for another example: * task is in cpuset with mems: 24-31 * task wants some memory policy on the first two nodes of its cpuset. * by Choice A, it asks for nodes 24 and 25 * by Choice B, it asks for nodes 0 and 1 The Choice B numbering can be thought of as cpuset relative. In it, node N means the N-th node in my current cpuset, modulo whatever is the node size of that cpuset. However ... We need to continue to support Choice A as well, perhaps for some interim, perhaps forever. Which doesn't much matter for now. === David - how would the following do for you? Would it meet the need that prompted your initial patch set if we added Choice B memory policy node numbering, but left Choice A as the kernel default, with a per-task option (perhaps invokable by a new option to one of the {get,set}_mempolicy() calls) to choose Choice B? This lets us get Choice B out there, and lets the two main libraries, libnuma and libcpuset, dynamically adapt to whichever Choice is active for the current task. Unchanged applications and existing binaries would simply continue with Choice A. With one additional line of code, a user application could get Choice B, with its ability for example to request MPOL_INTERLEAVE over all cpuset allowed nodes, where the kernel automatically adapts that to changing cpuset changes from larger 'mems' to smaller 'mems' and back to larger 'mems' again. 
It would mean you would have to make a change to your applications to get this improved interleaving. But I trust from all you've been advocating that such a code change and rebuild would not be any problem for whatever situation you're concerned with. We could recommend that new code probe to see if Choice B is available and prefer it if it is. At some future time, we might deprecate and eventually remove Choice A. I appreciate that you don't want to leave in place the complications of dual Choices, but I lack the experience, knowledge or clarity I need to support fully changing over to Choice B at this time. Getting Choice B out there will go a long way toward providing us with the feedback we will need to guide future decisions. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 4:27 ` Paul Jackson @ 2007-10-29 4:47 ` David Rientjes 2007-10-29 5:45 ` Paul Jackson 2007-10-29 7:15 ` Paul Jackson 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-29 4:47 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Sun, 28 Oct 2007, Paul Jackson wrote: > And, unless someone in the know tells us otherwise, I have to assume > that this could break them. Now, the odds are that they simply don't > run that solution stack on any system making active use of cpusets, > so the odds are this would be no problem for them. But I don't > presently have enough knowledge of their situation to take that risk. > If we can't identify any applications that would be broken by this, what's the difference in simply implementing Choice B and then, if we hear complaints, add your hack to revert back to Choice A behavior based on the get_mempolicy() call you specified is always part of libnuma? The problem that I see with immediately offering both choices is that we don't know if anybody is actually reverting back to Choice A behavior because libnuma, by default, would use it. That's going to make it very painful to remove later because we've supported both options and have made libnuma and {get,set}_mempolicy() arguments ambiguous. We should only support both choices if they will both be used and there's no hard evidence to suggest that at this point. > But dual support is pretty easy so far as the kernel code is concerned. > It's just a few nodes_remap() calls optionally invoked at a few key > spots in mm/mempolicy.c. Consequently there won't be a big hurry to > remove Choice A. > You earlier insisted on an ease of documentation for the MPOL_INTERLEAVE case and now this dual support that you're proposing is going to make the documentation very difficult to understand for anyone who simply wants to use mempolicies. 
Others even in this thread have had a hard enough time understanding the difference between the two choices and you explained them very thoroughly. It's going to be much more trouble than it's worth, I predict. > There is no "_then_ attach the task to a cpuset." On systems with > kernels configured with CONFIG_CPUSETS=y, all tasks are in a cpuset > all the time. Moreover, from a practical point of view, on large > systems managed with cpuset based mechanisms, almost all tasks are in > cpusets that do not include all nodes, for the entire life of the task. > And that application would need to be implemented to know the nodes that it has access to before it issues its set_mempolicy(MPOL_PREFERRED) command anyway if it truly uses Choice A behavior. So unless these tasks are looking in /proc/pid/status and parsing Mems_allowed and then specifying one as its preferred node or always being guaranteed a certain set of nodes that they are always attached to in a cpuset so they have such foresight of what node to prefer, Choice A can't possibly be what they want. > > Yet the 'mems' file would still be system-wide; otherwise it would be > > impossible to expand the memory your cpuset has access to. > > I had to read that a couple of times to make sense of it. I take that > it means that the node numbering used in each cpuset's 'mems' file has > to be system-wide. Yes, agreed. > > (Well, actually, the node numbering of each cpusets 'mems' file could > be relative to its parent cpusets 'mem' numbers, but let's not go > there, as this discussion is already sufficiently complicated ;) > I appreciate that very much. > Would it meet the need that prompted your initial patch set if we > added Choice B memory policy node numbering, but left Choice A as the > kernel default, with a per-task option (perhaps invokable by a new > option to one of the {get,set}_mempolicy() calls) to choose Choice B? 
> The needs I was addressing with my initial patchset was so that when a cpuset is expanded, any MPOL_INTERLEAVE memory policy of attached tasks automatically get expanded as well. This discussion has somewhat diverged from that, but I hope you still support what we earlier talked about in terms of adding a field to struct mempolicy to remember the intended nodemask the application asked to interleave over. > This lets us get Choice B out there, and lets the two main libraries, > libnuma and libcpuset, dynamically adapt to whichever Choice is active > for the current task. > > Unchanged applications and existing binaries would simply continue with > Choice A. With one additional line of code, a user application could > get Choice B, with its ability for example to request MPOL_INTERLEAVE > over all cpuset allowed nodes, where the kernel automatically adapts > that to changing cpuset changes from larger 'mems' to smaller 'mems' > and back to larger 'mems' again. > You don't actually need to choose between the two choices for adapting MPOL_INTERLEAVE over _all_ allowed cpuset nodes. I thought what we agreed upon and what you were going to implement was adding a nodemask_t to struct mempolicy for the intended nodemask of the memory policy and then AND it with pol->cpuset_mems_allowed. That completely satisfies my needs and my applications that want to allocate over all available nodes (by simply passing numa_all_nodes to set_mempolicy(MPOL_INTERLEAVE)). If I wanted to interleave only over a subset, the choices would matter. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 4:47 ` David Rientjes @ 2007-10-29 5:45 ` Paul Jackson 2007-10-29 7:00 ` David Rientjes 2007-10-29 7:15 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-29 5:45 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > If we can't identify any applications that would be broken by this, what's > the difference in simply implementing Choice B and then, if we hear > complaints, add your hack to revert back to Choice A behavior based on the > get_mempolicy() call you specified is always part of libnuma? I'll probably reply to other parts of your message later, but this one catches my eye right now. "if we hear complaints, add your hack ... back" -- this doesn't seem like a good idea to me. Maybe inside Google you don't see it, but for those of us shipping computer systems using major distributions such as SUSE or Red Hat, there can be a year lag between when I send a feature patch to Andrew, and when my customers send their first feedback to me resulting from using that new feature. There are ways to expedite fixes for specific situations, of course, but in general, this is rather like sending out a deep space probe. You have to conservatively cover your options pre-launch, because post-launch repairs are costly, slow and limited. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 5:45 ` Paul Jackson @ 2007-10-29 7:00 ` David Rientjes 2007-10-29 7:26 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-29 7:00 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Sun, 28 Oct 2007, Paul Jackson wrote: > > If we can't identify any applications that would be broken by this, what's > > the difference in simply implementing Choice B and then, if we hear > > complaints, add your hack to revert back to Choice A behavior based on the > > get_mempolicy() call you specified is always part of libnuma? > > I'll probably reply to other parts of your message later, but this > one catches my eye right now. > > "if we hear complaints, add your hack ... back" -- this doesn't seem > like a good idea to me. Maybe inside Google you don't see it, but > for those of us shipping computer systems using major distributions > such as SUSE or Red Hat, there can be a year lag between when I send a > feature patch to Andrew, and when my customers send their first > feedback to me resulting from using that new feature. > Let's add a Choice C: Any nodemask that is passed to set_mempolicy() is saved as the intent of the application in struct mempolicy. All policies are effected on a contextualized per-allocation basis. Policies such as MPOL_INTERLEAVE always get AND'd with pol->cpuset_mems_allowed. If that yields numa_no_nodes, MPOL_DEFAULT is used instead. Policies such as MPOL_PREFERRED are respected if the node is set in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT is used. If an application attempts to setup a memory policy for an MPOL_PREFERRED node that it doesn't have access to or an MPOL_INTERLEAVE nodemask that is empty when AND'd with pol->cpuset_mems_allowed, -EINVAL is returned and no new policy is effected. 
If an application gains nodes in pol->cpuset_mems_allowed that now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED, that policy is then effected once again. Otherwise, MPOL_DEFAULT is still used. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 7:00 ` David Rientjes @ 2007-10-29 7:26 ` Paul Jackson 2007-10-30 22:53 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-29 7:26 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel > Let's add a Choice C: > > Any nodemask that is passed to set_mempolicy() is saved as > the intent of the application in struct mempolicy. Yes > All policies are effected on a contextualized per-allocation > basis. "contextualized" - I guess that means converted to cpuset relative numbering - yes. "per-allocation" - Most of the calculation of nodemasks and zonelists is done when memory policies change. > Policies such as MPOL_INTERLEAVE always get AND'd with > pol->cpuset_mems_allowed. Not AND'd - Folded, as in bitmap_remap(). > If that yields numa_no_nodes, MPOL_DEFAULT is used instead. Not an issue with Folding. > Policies such as MPOL_PREFERRED are respected if the node > is set in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT > is used. Not an issue with Folding. > If an application attempts to setup a memory policy for > an MPOL_PREFERRED node that it doesn't have access to or > an MPOL_INTERLEAVE nodemask that is empty when AND'd with > pol->cpuset_mems_allowed, -EINVAL is returned and no new > policy is effected. Not issues with Folding. > If an application gains nodes in pol->cpuset_mems_allowed that > now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED, > that policy is then effected once again. Otherwise, > MPOL_DEFAULT is still used. Not issues with Folding. With folding, an application that laid out an elaborate memory policy configuration covering say 16 nodes can run in a 4 node cpuset, where whatever would have been on node N gets folded down to node N % 4. With AND'ing, such an application would find 3/4's of its fancy memory policy configuration replaced with MPOL_DEFAULT and -EINVAL fallbacks. 
-- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 7:26 ` Paul Jackson @ 2007-10-30 22:53 ` David Rientjes 2007-10-30 23:17 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-30 22:53 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Mon, 29 Oct 2007, Paul Jackson wrote: > > Policies such as MPOL_INTERLEAVE always get AND'd with > > pol->cpuset_mems_allowed. > > Not AND'd - Folded, as in bitmap_remap(). > > > If that yields numa_no_nodes, MPOL_DEFAULT is used instead. > > Not an issue with Folding. > > > Policies such as MPOL_PREFERRED are respected if the node > > is set in pol->cpuset_mems_allowed, otherwise MPOL_DEFAULT > > is used. > > Not an issue with Folding. > > > If an application attempts to setup a memory policy for > > an MPOL_PREFERRED node that it doesn't have access to or > > an MPOL_INTERLEAVE nodemask that is empty when AND'd with > > pol->cpuset_mems_allowed, -EINVAL is returned and no new > > policy is effected. > > Not issues with Folding. > > > If an application gains nodes in pol->cpuset_mems_allowed that > > now include the nodes from MPOL_INTERLEAVE or MPOL_PREFERRED, > > that policy is then effected once again. Otherwise, > > MPOL_DEFAULT is still used. > > Not issues with Folding. > > With folding, an application that layed out an elaborate memory > policy configuration covering say 16 nodes can run in a 4 node > cpuset, where whatever would have been on node N gets folded down > to node N % 4. > Missing the point; this is an alternative to the previous choices; Choice C explicitly removes all remaps ("folding") from mempolicies. The nodemask passed to set_mempolicy() will always have exactly one meaning: the system nodes that the policy is intended for. Cpusets, which are built upon mempolicies, can obviously take access to some of those nodes away. 
That's why the existing mempolicies are AND'd with the cpuset's mems_allowed to represent the current nodemask that the mempolicy is effecting. If none of them are available because of cpusets, the mempolicy is invalidated and MPOL_DEFAULT is used. If access to some nodes from the mempolicy's nodemask become available once again, the policy is again effected. I'm arguing that remapping a policy's nodemask, although that is what currently is done, is troublesome because it can use a policy such as MPOL_PREFERRED to work on a node for which it was never intended. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 22:53 ` David Rientjes @ 2007-10-30 23:17 ` Paul Jackson 2007-10-30 23:25 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-30 23:17 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > The nodemask passed to set_mempolicy() will always have exactly one > meaning: the system nodes that the policy is intended for. Ok - that makes the meaning of Choice C clearer to me. Thank-you. We've already got two Choices, one released and one in the oven. Is there an actual, real world situation, motivating this third Choice? -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:17 ` Paul Jackson @ 2007-10-30 23:25 ` David Rientjes 2007-10-31 0:03 ` Paul Jackson 2007-10-31 0:05 ` Paul Jackson 0 siblings, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-30 23:25 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Tue, 30 Oct 2007, Paul Jackson wrote: > We've already got two Choices, one released and one in the oven. Is > there an actual, real world situation, motivating this third Choice? > Let's put Choice C into the lower oven, then. Of course there's actual and real world examples of this, because right now we're not meeting the full intent of the application. Cpusets deal with cpus and memory, they don't have anything to do with affinity to particular I/O devices; that part is left up to the creator of the cpuset to sort out correctly based on their system topology. If my application does tons of I/O on one particular device to which my memory has access, I can use MPOL_PREFERRED to prefer the memory be allocated on a node with the best affinity to my device. If cpusets change my access to that node, I'm still using an MPOL_PREFERRED policy with a remapped node that no longer has affinity to that device because nodes_remap() doesn't take that into account. My preference would be to fallback to MPOL_DEFAULT behavior, since it's certainly plausible that other cpusets share the same node, instead of unnecessarily filling up a node that I don't even prefer anymore. Same situation exists of MPOL_INTERLEAVE policies where my NUMA optimization is no longer helpful because I'm interleaving over a set of nodes that was simply remapped and their affinity (which isn't guaranteed to be uniform) wasn't even taken into account. But, with Choice C, my intent is still preserved in the mempolicy even though it's not effected because my access rights to the node has changed. 
If I get access to that node back later, and I haven't issued subsequent set_mempolicy() calls to change my policy, my MPOL_PREFERRED or MPOL_INTERLEAVE policy is again effected and I then benefit from my NUMA optimization once again. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:25 ` David Rientjes @ 2007-10-31 0:03 ` Paul Jackson 2007-10-31 0:05 ` Paul Jackson 1 sibling, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-31 0:03 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > Of course there's actual and real world examples of this, because right > now we're not meeting the full intent of the application. Please describe one, an actual one, not a hypothetical one, of which you have personal knowledge. There are many refinements we could add, an endless stream of them. Each one adds a burden to those who didn't need it. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:25 ` David Rientjes 2007-10-31 0:03 ` Paul Jackson @ 2007-10-31 0:05 ` Paul Jackson 1 sibling, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-31 0:05 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > But, with Choice C, my intent is still preserved in the mempolicy even > though it's not effected because my access rights to the node has changed. Choice B, as I'm coding it, has this property as well. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 4:47 ` David Rientjes 2007-10-29 5:45 ` Paul Jackson @ 2007-10-29 7:15 ` Paul Jackson 2007-10-30 23:12 ` David Rientjes 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-29 7:15 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > The problem that I see with immediately offering both choices is that we > don't know if anybody is actually reverting back to Choice A behavior > because libnuma, by default, would use it. That's going to making it very > painful to remove later. Yes, that's a problem. I would rather end up with both Choices forever, than breaking stuff because we changed how memory policy nodes are numbered. > We should only > support both choices if they will both be used and there's no hard > evidence to suggest that at this point. No. We could only remove Choice A if we had hard evidence that it wouldn't break things, especially for the libnuma-Oracle stack. Either way, we obviously have to decide this lacking sufficient hard evidence. Changing memory policy node numbering is just way too likely to break things, in ways that users initially find difficult to diagnose. We -can-not- inflict that on our users in a single, sudden, change. We must stage it, starting by adding the new. > You earlier insisted on an ease of documentation for the MPOL_INTERLEAVE > case and now this dual support that you're proposing is going to make the > documentation very difficult to understand for anyone who simply wants to > use mempolicies. Yup - that's a problem. But it is one that users can control. If they just continue using memory policies and libnuma as before, it continues to work as before. 
If they need to deal with situations in which applications using memory policies are being moved around between larger and smaller cpusets, and they are willing and able to modify and improve the part of their code that handles memory policies, then they can read the new section of the documentation about this improved cpuset-relative node numbering, and give it a try. Blind siding users with a unilateral change like this will leave orphaned bits gasping in agony on the computer room floor. It can sometimes take months of elapsed time and hundreds of hours of various people's time across a dozen departments in three to five corporations to track down the root cause of such a problem, from the point of the initial failure, back to the desk of someone like you or me. And then it can take tens or hundreds more hours of human effort to deliver a fix. I refuse to knowingly go down that road. I will not agree to suddenly replacing Choice A with Choice B. > And that application would need to [...] Choice A can't possibly > be what they want. People do this sort of stuff all the time; they just don't realize what all is going on beneath the surface of the various tools, libraries, scripts and magic incantations that they cobble together to meet their needs. Choice A is meeting most of our needs. Not until you brought up this case of MPOL_INTERLEAVE across the nodes of a job being moved between varying size cpusets did it prove inadequate. > I hope you still support what we earlier talked about in > terms of adding a field to struct mempolicy to remember the intended > nodemask the application asked to interleave over. Yes - that's a key element of the Choice B implementation. > I thought what we agreed upon and what you were going to implement was > adding a nodemask_t to struct mempolicy for the intended nodemask of the > memory policy and then AND it with pol->cpuset_mems_allowed. Not "AND". 
Fold - the n-th bit is set in a tasks mems_allowed iff there exists m such that (m % w) == n, and such that the m-th bit is set in the tasks mempolicy's remembered nodemask, where w is the weight (number of '1' bits) in the tasks current cpusets mems_allowed. See lib/bitmap.c:bitmap_remap(), and its wrapper nodes_remap() for the implementation. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 7:15 ` Paul Jackson @ 2007-10-30 23:12 ` David Rientjes 2007-10-30 23:44 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-30 23:12 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Mon, 29 Oct 2007, Paul Jackson wrote: > Blind siding users with a unilateral change like this will leave > orphaned bits gasping in agony on the computer room floor. It can > sometimes takes months of elapsed time and hundreds of hours of various > peoples time across a dozen departments in three to five corporations > to track down the root cause of such a problem, from the point of the > initial failure, back to the desk of someone like you or me. And then > it can take tens or hundreds more hours of human effort to deliver a > fix. I refuse to knowingly go down that road. > If your argument is that most applications are written to implement mempolicies without necessarily thinking too much about its cpuset placement or interactions with cpusets, then the requirement of remapping nodes when a cpuset changes for effected mempolicies isn't actually that important. In other words, my Choice C with AND'd behavior as opposed to remapping behavior could be introduced as a replacement for Choice A. Those applications that currently rely on the remapping are going to be broken anyway because they are unknowingly receiving different nodes than they intended, this is the objection to remapping that Lee agreed with. The remap doesn't take into account any notion of locality or affinity to physical controllers and seems to be merely a convenience of not invalidating the entire mempolicy in light of an ever-changing cpuset policy. > Not "AND". 
Fold - the n-th bit is set in a tasks mems_allowed iff > there exists m such that (m % w) == n, and such that the m-th bit is > set in the tasks mempolicy's remembered nodemask, where w is the weight > (number of '1' bits) in the tasks current cpusets mems_allowed. See > lib/bitmap.c:bitmap_remap(), and its wrapper nodes_remap() for the > implementation. > Yes, I know, and my Choice C does _not_ want that folding behavior; it wants the AND'd behavior because it fully respects the intent of the application with regard to the actual nodes that it specified in its memory policies. A node should only have one definition and policies that are effected on a set of nodes, or one node in the preferred case, should not change from beneath the application because it was not the intent of the implementation. Doing so is dangerous, regardless of whether or not it is currently the mempolicy behavior in HEAD. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:12 ` David Rientjes @ 2007-10-30 23:44 ` Paul Jackson 2007-10-30 23:53 ` David Rientjes 0 siblings, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-30 23:44 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > If your argument is that most applications are written to implement > mempolicies without necessarily thinking too much about its cpuset > placement or interactions with cpusets, then the requirement of remapping > nodes when a cpuset changes for effected mempolicies isn't actually that > important. Just because they didn't think about cpuset remapping when they coded their mempolicy calls, doesn't mean they wouldn't be broken by changes in how mempolicy numbers nodes. Often, it's the other way around: the less they thought of it, the more likely changing it would break them. > In other words, my Choice C with AND'd behavior as opposed to > remapping behavior could be introduced as a replacement for Choice A. No - I will not agree to changing the default mempolicy kernel API node numbering at this time. Period. Full stop. We can add non-default choices for now, and perhaps in the light of future experience, we may choose to do more later. > Those applications that currently rely on the remapping are going to be > broken anyway because they are unknowingly receiving different nodes than > they intended, this is the objection to remapping that Lee agreed with. No, they may or may not be broken. That depends on whether or not they had specific hardware locality or affinity needs. > The remap doesn't take into account any notion of locality or affinity to > physical controllers and seems to be merely a convenience of not > invalidating the entire mempolicy in light of an ever-changing cpuset > policy. 
If you're running apps that have specific hardware affinity requirements, then perhaps you shouldn't be moving them about in the first place ;). And if they did have such needs, aren't they just as likely to be busted by AND'ing off some of their nodes as they are by remapping those nodes? I sure wish I knew what real world, actual, not hypothetical, situations were motivating this. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:44 ` Paul Jackson @ 2007-10-30 23:53 ` David Rientjes 2007-10-31 0:29 ` Paul Jackson 0 siblings, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-30 23:53 UTC (permalink / raw) To: Paul Jackson; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel On Tue, 30 Oct 2007, Paul Jackson wrote: > > Those applications that currently rely on the remapping are going to be > > broken anyway because they are unknowingly receiving different nodes than > > they intended, this is the objection to remapping that Lee agreed with. > > No, they may or may not be broken. That depends on whether or not they had > specific hardware locality or affinity needs. > Of course they have specific affinity needs, that's why they used mempolicies. Remapping those policies to a set of nodes that resembles the original mempolicy's nodemask in terms of construction but without regard for the affinity those nodes have with respect to system topology could lead to performance degradations. > If you're running apps that have specific hardware affinity requirements, > then perhaps you shouldn't be moving them about in the first place ;). > And if they did have such needs, aren't they just as likely to be busted > by AND'ing off some of their nodes as they are by remapping those nodes? > No, because you're interleaving over the set of actual nodes you wanted to interleave over in the first place and not some pseudo-random set that your cpuset has access to. > I sure wish I knew what real world, actual, not hypothetical, situations > were motivating this. > You're defending the current remap behavior in terms of semantics of mempolicies? My position, and Choice C's position, is that you either get the exact (or partially-constructed) policy that you asked for, or you get the MPOL_DEFAULT behavior. 
What you don't get, even though it's currently how we do it, is a completely different set of nodes that you never intended to have a specific policy over. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:53 ` David Rientjes @ 2007-10-31 0:29 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-31 0:29 UTC (permalink / raw) To: David Rientjes; +Cc: clameter, Lee.Schermerhorn, akpm, ak, linux-kernel David wrote: > Of course they have specific affinity needs, that's why they used > mempolicies. No. Good grief. If they are just looking for some set of memory banks, not to other node-specific hardware, then they might not need a specific node. Consider for example a multi-threaded, compute bound, long running scientific computation that has a substantial and fussy memory layout. Remapping it from one cpuset to another having the same NUMA topology may well work fine, once its memory caches recover. Reverting it to the lowest common denominator MPOL_DEFAULT policy because (Choice C) it no longer has access to its initial nodes might devastate its performance. pj wrote: > I sure wish I knew what real world, actual, not hypothetical, situations > were motivating this. I'm still wishing ... -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 23:19 ` Paul Jackson 2007-10-28 18:19 ` David Rientjes @ 2007-10-29 16:54 ` Lee Schermerhorn 2007-10-29 19:40 ` Paul Jackson ` (3 more replies) 1 sibling, 4 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 16:54 UTC (permalink / raw) To: Paul Jackson; +Cc: David Rientjes, clameter, akpm, ak, linux-kernel On Sat, 2007-10-27 at 16:19 -0700, Paul Jackson wrote: > David wrote: > > I think there's a mixup in the flag name [MPOL_MF_RELATIVE] there > > Most likely. The discussion involving that flag name was kinda mixed up ;). > > > but I actually would recommend against any flag to effect Choice A. > > It's simply going to be too complex to describe and is going to be a > > headache to code and support. > > While I am sorely tempted to agree entirely with this, I suspect that > Christoph has a point when he cautions against breaking this kernel API. > > Especially for users of the set/get mempolicy calls coming in via > libnuma, we have to be very careful not to break the current behaviour, > whether it is documented API or just an accident of the implementation. > > There is a fairly deep and important stack of software, involving a > well known DBMS product whose name begins with 'O', sitting on that > libnuma software stack. Steering that solution stack is like steering > a giant oil tanker near shore. You take it slow and easy, and listen > closely to the advice of the ancient harbor master. The harbor masters > in this case are or were Andi Kleen and Christoph Lameter. > > > It's simply going to be too complex to describe and is going to be a > > headache to code and support. > > True, which is why I am hoping we can keep this modal flag, if such be, > from having to be used on every set/get mempolicy call. The ordinary > coder of new code using these calls directly should just see Choice B > behaviour. 
However the user of libnuma should continue to see whatever > API libnuma supports, with no change whatsoever, and various versions of > libnuma, including those already shipped years ago, must continue to > behave without any changes in node numbering. If most apps use libnuma APIs instead of directly calling the sys calls, libnuma could query something as simple as an environment variable, or a new flag to get_mempolicy(), or the value of a file in its current cpuset--but I'd like to avoid a dependency on libcpuset--to determine whether to implement "new" semantics. > > There are two decent looking ways (and some ugly ways) that I can see > to accomplish this: > > 1) One could claim that no important use of Oracle over libnuma over > these memory policy calls is happening on a system using cpusets. > There would be a fair bit of circumstantial evidence for this > claim, but I don't know it for a fact, and would not be the > expert to determine this. On systems making no use of cpusets, > these two Choices A and B are identical, and this is a non-issue. > Those systems will see no API changes whatsoever from any of this. I'd certainly like to hear from Oracle what libnuma features they use and their opinion of the changes being discussed here. > > 2) We have a per-task mode flag selecting whether Choice A or B > node numbering apply to the masks passed in to set_mempolicy. > > The kernel implementation is fairly easy. (Yeah, I know, I > too cringe everytime I read that line ;) > > If Choice A is active, then we continue to enforce the current > check, in mm/mempolicy.c: contextualize_policy(), that the passed > in mask be a subset of the current cpusets allowed memory nodes, > and we add a call to nodes_remap(), to map the passed in nodemask > from cpuset centric to system centric (if they asked for the > fourth node in their current cpuset, they get the fourth node in > the entire system.) 
> > Similarly, for masks passed back by get_mempolicy, if Choice A > is active, we use nodes_remap() to convert the mask from system > centric back to cpuset centric. > > There is a subtle change in the kernel API's here: > > In current kernels, which are Choice A, if a task is moved from > a big cpuset (many nodes) to a small cpuset and then -back- > to a big cpuset, the nodemasks returned by get_mempolicy > will still show the smaller masks (fewer set nodes) imposed > by the smaller cpuset. > > In todays kernels, once scrunched or folded down, the masks > don't recover their original size after the task is moved > back to a large cpuset. Yeah. This bothered me about policy remapping when I looked at it a while back. Worse, this behavior isn't documented as intended [or not]. I thought at the time that this could be solved by retaining the original argument nodemask, but 1) I was worried about the size when ~1K nodes are required to be supported and 2) it still doesn't solve the problem of ensuring the same locality characteristics w/o a lot of documentation about the implications of changing cpuset resources or moving tasks between cpusets in such a way to preserve the locality characteristics requested by the original mask. Again, we stumble upon the notion of "intent". If the intent is just to spread allocations to share bandwidth, it probably doesn't matter. If, on the other hand, the original mask was carefully constructed, taking into consideration the distances between the memories specified and other resources [cpus in the cpuset, other memories in the cpuset, IO adapter connection points, ...], there is a lot more to consider than just preserving the cpuset relative positions of the nodes. > > With this change, even a task asking for Choice A would, > once back on a larger cpuset, again see the larger masks from > get_mempolicy queries. 
This is a change in the kernel API's > visible to user space; but I really do not think that there > is sufficient use of Oracle over libnuma on systems actively > moving tasks between differing size cpusets for this to be > a problem. > > Indeed, if there was much such usage, I suspect they'd > be complaining that the current kernel API was borked, and > they'd be filing a request for enhancement -asking- for just > this subtle change in the kernel API's here. In other words, > this subtle API change is a feature, not a bug ;) Agreed. > > The bulk of the kernel's mempolicy code is coded for Choice B. > > If Choice B is active, we don't enforce the subset check in > contextualize_policy(), and we don't invoke nodes_remap() in either > of the set or get mempolicy code paths. > > A new option to get_mempolicy() would query the current state of > this mode flag, and a new option to set_mempolicy() would set > and clear this mode flag. Perhaps Christoph had this in mind > when he wrote in an earlier message "The alternative is to add > new set/get mempolicy functions." > > The default kernel API for each task would be Choice B (!). > > However, in deference to the needs of libnuma, if the following > call was made, this would change the mode for that task to > Choice A: > > get_mempolicy(NULL, NULL, 0, 0, 0); > > This last detail above is an admitted hack. *So far as I know* > it happens that all current infield versions of libnuma issue the > above call, as their first mempolicy query, to detemine whether > the active kernel supports mempolicy. In libnuma in numactl-1.0.2 that I recently grabbed off Andi's site, numa_available() indeed issues this call. But, I don't see any internal calls to numa_available() [comments says all other calls undefined when numa_available() returns an error] nor any other calls to get_mempolicy() with all null/0 args. So, you'd be depending on the application to call numa_available(). 
However, you could define an additional MPOL_F_* flag to get_mempolicy() that is issued in library init code to enable new behavior--again, based on some indication that new behavior is desired or not. > > The mode for each task would be inherited across fork, and reset > to Choice (B) on exec. > > If we determine that we must go with a new flag bit to be passed in > on each and every get and set mempolicy call that wants Choice B node > numbering rather than Choice A, then I will need (1) a bottle of rum, > and (2) a credible plan in place to phase out this abomination ;). > > There would be just way too many coding errors and coder frustrations > introduced by requiring such a flag on each and every mempolicy system > call that wants the alternative numbering. Only for apps that use the sys calls directly, right? This can be hidden by libnuma(), if all apps use that. The "behavior switch" flag suggested above would obviate a flag on each sys call and could also be hidden by libnuma. Any of these changes will require some better documentation than we have now... > There must be some > international treaty that prohibits landmines capable of blowing ones > foot off that would apply here. > > There are two major user level libraries sitting on top of this API, > libnuma and libcpuset. Libnuma is well known; it was written by Andi > Kleen. I wrote libcpuset, and while it is LGPL licensed, it has not > been publicized very well yet. I can speak for libcpuset: it could > adapt to the above proposal, in particular to the details in way (2), > just fine. Old versions of libcpuset running on new kernels will > have a little bit of subtle breakage, but not in areas that I expect > will cause much grief. Someone more familiar with libnuma than I would > have to examine the above proposal in way (2) to be sure that we weren't > throwing libnuma some curveball that was unnecessarily troublesome. 
> I worry more about applications that take a more physical view of the node ids and that emphasize locality more than bandwidth spreading. If libnuma explicitly enables new behavior when requested, however that might be implemented, I don't know that it would be a problem. Lee Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:54 ` Lee Schermerhorn @ 2007-10-29 19:40 ` Paul Jackson 2007-10-29 19:45 ` Paul Jackson ` (2 subsequent siblings) 3 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 19:40 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > If most apps use libnuma APIs instead of directly calling the sys calls, > libnuma could query something as simple as an environment variable, or a > new flag to get_mempolicy(), or the value of a file in it's current > cpuset--but I'd like to avoid a dependency on libcpuset--to determine > whether to implement "new" semantics. The patch I'm working has a new set of options to get_mempolicy to set and get the per-task kernel state indicating whether to use the old or new semantics. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:54 ` Lee Schermerhorn 2007-10-29 19:40 ` Paul Jackson @ 2007-10-29 19:45 ` Paul Jackson 2007-10-29 19:57 ` Paul Jackson 2007-10-29 20:02 ` Paul Jackson 3 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 19:45 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > Again, we stumble upon the notion of "intent". If the intent is just to > spread allocations to share bandwidth, it probably doesn't matter. If, > on the other hand, the original mask was carefully constructed, taking > into consideration the distances between the memories specified and > other resources [cpus in the cpuset, other memories in the cpuset, IO > adpater connection points, ...], there is a lot more to consider than > just preserving the cpuset relative positions of the nodes. Yes - as I noted in an earlier reply, the kernel just provides the mechanisms. It's up to user level code and people to decide whether moving jobs around is a worthwhile activity in their situation. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:54 ` Lee Schermerhorn 2007-10-29 19:40 ` Paul Jackson 2007-10-29 19:45 ` Paul Jackson @ 2007-10-29 19:57 ` Paul Jackson 2007-10-29 20:02 ` Paul Jackson 3 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 19:57 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > > Indeed, if there was much such usage, I suspect they'd > > be complaining that the current kernel API was borked, and > > they'd be filing a request for enhancement -asking- for just > > this subtle change in the kernel API's here. In other words, > > this subtle API change is a feature, not a bug ;) > > Agreed. Hmmm ... put your thinking hat for my next comment ... I could do one of two things in mm/mempolicy.c: B1) continue accepting nodemasks across the set_mempolicy and mbind system call APIs that are just like now (only nodes in the current tasks cpuset matter), but then remember what was passed in, so that if the tasks cpuset subsequently shrank down and then expanded again back to its original size, they would end up with the same memory policy placement they first had, or B2) accept nodemasks as if relative to the entire system, regardless of what cpuset they were in at the moment (all nodes in the system matter and can be specified.) If I did B1, then that's just a subtle change in the API, and what you agreed to above holds. If I did B2, then that's a serious change in the way that nodes are numbered in the nodemasks passed into mbind and set_mempolicy, from being only nodes that happen to be in the tasks current cpuset, to being nodes relative to all possible nodes on the system. We need B2, I think. Otherwise, if a job happens to be running in a shrunken cpuset, it can't request what memory policy placement it wants should it end up in a larger cpuset later on. 
With B1, we would continue to have the timing dependencies between when a task is moved between different size cpusets, and when it happens to issue mbind/set_mempolicy calls. But B2 is an across the board change in how we number the nodes passed into mbind and set_mempolicy. That is in no way an upward compatible change. I am strongly inclined toward B2, but it must be a non-default optional mode, at least for a while, perhaps a long while. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:54 ` Lee Schermerhorn ` (2 preceding siblings ...) 2007-10-29 19:57 ` Paul Jackson @ 2007-10-29 20:02 ` Paul Jackson 3 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 20:02 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > In libnuma in numactl-1.0.2 that I recently grabbed off Andi's site, > numa_available() indeed issues this call. But, I don't see any internal > calls to numa_available() [comments says all other calls undefined when > numa_available() returns an error] nor any other calls to > get_mempolicy() with all null/0 args. So, you'd be depending on the > application to call numa_available(). Aha - good point. It happened to be the numactl command line utility that I tested with that issued the get_mempolicy(0,0,0,0,0) call. Yup - this proposed hack, to have the kernel revert to the original memory policy nodemask numbering if it sees such a getmempolicy call is now officially dead meat. Thanks. > However, you could define an > additional MPOL_F_* flag to get_mempolicy() that is issued in library > init code to enable new behavior--again, based on some indication that > new behavior is desired or not. Yes - I am intending to define such MPOL_F_* flags, to set and get which behavior applies to the current task. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 1:07 ` Paul Jackson 2007-10-27 1:26 ` Christoph Lameter @ 2007-10-27 17:45 ` David Rientjes 2007-10-27 21:22 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-27 17:45 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > Choice A: > as it does today, the second node in the tasks cpuset or it could > mean > > Choice B: > the fourth node in the cpuset, if available, just as > it did in the case above involving a cpuset on nodes 10 and 11. > Thanks for describing the situation with MPOL_PREFERRED so thoroughly. I prefer Choice B because it does not force mempolicies to have any dependence on cpusets with regard to what nodemask is passed. [rientjes@xroads ~]$ man set_mempolicy | grep -i cpuset | wc -l 0 It would be very good to store the passed nodemask to set_mempolicy in struct mempolicy, as you've already recommended for MPOL_INTERLEAVE, so that you can try to match the intent of the application as much as possible. But since cpusets are built on top of mempolicies, I don't think there's any reason why we should respect any nodemask in terms of the current cpuset context, whether it's preferred or interleave. So if you were to pass a nodemask with only the fourth node set for an MPOL_PREFERRED mempolicy, the correct behavior would be to prefer the fourth node on the system or, if constrained by cpusets, the fourth node in the cpuset. If the cpuset has fewer than four nodes, the behavior should be undefined (probably implemented to just cycle the set of mems_allowed until you reach the fourth entry). That's the result of constraining a task to a cpuset that obviously wants access to more nodes -- it's a userspace mistake and abusing cpusets so that the task does not get what it expects. 
That concept isn't actually new: we already restrict tasks to a certain amount of memory by writing to the mems file and just because it happens to have access to more memory when unconstrained by cpusets doesn't matter. You've placed it in a cpuset that wasn't prepared to deal with what the task was asking for. At least in the MPOL_PREFERRED case you describe above, it'll be dealt with much more pleasantly by at least giving it a preferred node as opposed to OOM killing it when a task has exhausted its available cpuset-constrained memory. I'd prefer a solution where mempolicies can always be described and used without ever considering cpusets. Then, a sane implementation will configure the cpuset accordingly to accommodate its tasks' mempolicies. We don't want to get in a situation where we are denying a task to be attached to a cpuset when there are fewer nodes than the preferred node, for example, but we can fallback to better behavior by at least giving it a preferred node in the MPOL_PREFERRED case. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 17:45 ` David Rientjes @ 2007-10-27 21:22 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-27 21:22 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel David wrote: > I prefer Choice B because it does not force mempolicies to have any > dependence on cpusets with regard to what nodemask is passed. Yes, well said. > It would be very good to store the passed nodemask to set_mempolicy in > struct mempolicy, Yes - that's what I'm intending to do. > If the cpuset has fewer than four nodes, the behavior > should be undefined (probably implemented to just cycle the set of > mems_allowed until you reach the fourth entry). I do intend to implement it as you suggest. See the lib/bitmap.c routines bitmap_remap() and bitmap_bitremap(), and the nodemask wrappers for these, nodes_remap() and node_remap(). They will define the cycling, or I sometimes call it folding. I would have tended to make this folding a defined part of the API, though I will grant that the possibility of being lazy and forgetting to document it seems attractive (less to document ;). > That [running in a cpuset with fewer nodes than used in a memory policy > mask] is the result of constraining a task to a cpuset that obviously > wants access to more nodes -- it's a userspace mistake and abusing > cpusets so that the task does not get what it expects. Nah - I wouldn't put it that way. It's no mistake or abuse. It's just one more example of a kernel making too few resources look sufficient by sharing, multiplexing and virtualizing them. That's what kernels do. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 21:39 ` David Rientjes 2007-10-27 1:07 ` Paul Jackson @ 2007-10-29 15:10 ` Lee Schermerhorn 2007-10-29 18:41 ` Paul Jackson 2007-10-30 22:57 ` David Rientjes 1 sibling, 2 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 15:10 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Fri, 2007-10-26 at 14:39 -0700, David Rientjes wrote: > On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > So, you pass the subset, you don't set the flag to indicate you want > > interleaving over all available. You must be thinking of some other use > > for saving the subset mask that I'm not seeing here. Maybe restoring to > > the exact nodes requested if they're taken away and then re-added to the > > cpuset? > > > > Paul's motivation for saving the passed nodemask to set_mempolicy() is so > that the _intent_ of the application is never lost. That's the biggest > advantage that this method has and that I totally agree with. So whenever > the mems_allowed of a cpuset changes, the MPOL_INTERLEAVE nodemask of all > attached tasks becomes their intent (pol->passed_nodemask) AND'd with the > new mems_allowed. That can be done on mpol_rebind_policy() and shouldn't > be an extensive change. > > So MPOL_INTERLEAVE, and possibly other, mempolicies will always try to > accomodate the intent of the application but only as far as the task's > cpuset restriction allows them. > > David Maybe it's just me, but I think it's pretty presumptuous to think we can infer the intent of the application from the nodemask w/o additional flags such as Christoph proposed [cpuset relative]--especially for subsets of the cpuset. E.g., the application could intend the nodemask to specify memories within a certain distance of a physical resource, such as where a particular IO adapter or set thereof attach to the platform. 
And even when the intent is to preserve the cpuset relative positions of the nodes in the nodemask, this really only makes sense if the original and modified cpusets have the same physical topology w/rt multi-level NUMA interconnects. This is something that has bothered me about dynamic cpusets and current policy remapping. We don't do a good job of explaining the implications of changing cpuset topology on applications, nor do we handle it very well in the code. Paul addresses one of my concerns in a later message in this thread, so I'll comment there. Later, Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 15:10 ` Lee Schermerhorn @ 2007-10-29 18:41 ` Paul Jackson 2007-10-29 19:01 ` Lee Schermerhorn 2007-10-30 23:17 ` David Rientjes 2007-10-30 22:57 ` David Rientjes 1 sibling, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 18:41 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > Maybe it's just me, but I think it's pretty presumptuous to think we can > infer the intent of the application from the nodemask w/o additional > flags such as Christoph proposed [cpuset relative]--especially for > subsets of the cpuset. E.g., the application could intend the nodemask > to specify memories within a certain distance of a physical resource, > such as where a particular IO adapter or set thereof attach to the > platform. Well, yes, we can't presume to know whether some application can move or not. But our kernel work is not presuming that. It's providing mechanisms useful for moving apps. The people using this decide what and when and if to move. For example, the particular customers (HPC) I focus on for my job don't move jobs because they don't want to take the transient performance hit that would come from blowing out all their memory caches. I'm guessing that David's situation involves something closer what you see with a shared web hosting service, running jobs that are very independent of hardware particulars. But in any case, we (the kernel) are just providing the mechanisms. If they don't fit ones needs, don't use them ;). -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 18:41 ` Paul Jackson @ 2007-10-29 19:01 ` Lee Schermerhorn 2007-10-30 23:17 ` David Rientjes 1 sibling, 0 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 19:01 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, clameter, akpm, ak, linux-kernel On Mon, 2007-10-29 at 11:41 -0700, Paul Jackson wrote: > Lee wrote: > > Maybe it's just me, but I think it's pretty presumptuous to think we can > > infer the intent of the application from the nodemask w/o additional > > flags such as Christoph proposed [cpuset relative]--especially for > > subsets of the cpuset. E.g., the application could intend the nodemask > > to specify memories within a certain distance of a physical resource, > > such as where a particular IO adapter or set thereof attach to the > > platform. > > Well, yes, we can't presume to know whether some application can move > or not. > > But our kernel work is not presuming that. > > It's providing mechanisms useful for moving apps. > > The people using this decide what and when and if to move. > > For example, the particular customers (HPC) I focus on for my job don't > move jobs because they don't want to take the transient performance > hit that would come from blowing out all their memory caches. > > I'm guessing that David's situation involves something closer what you > see with a shared web hosting service, running jobs that are very > independent of hardware particulars. > > But in any case, we (the kernel) are just providing the mechanisms. > If they don't fit ones needs, don't use them ;). > I'm with you on this last point! I was reacting to the notion that we can infer intent from a nodemask and that preserving the cpuset relative numbering after changing cpuset resources or moving tasks preserves that intent--especially if it involves locality and distance considerations. 
I can envision sets of such transformations on HP platforms where locality and distance would be preserved by preserving cpuset-relative numbering, and many where they would not. I expect you could do the same for SGI platforms. I'm not opposed to what you're trying to do, modulo complexity concerns. And I'm not saying that the complexity is not worth it to customers. But, given that we just "providing the mechanism", I think we need to provide very good documentation on the implications of these mechanism vis a vis whatever characteristics--locality, distance, bandwidth sharing, ...--the application intends when it installs a policy. Like you, no doubt, I'm eyeballs deep in a number of things. At some point, I'll take a cut at enumerating various "intents" that different types of applications might have when using mem policies and cpusets. Others can add to that, or may even beat me to it. We can then evaluate how well these scenarios are served by the current mechanisms and by whatever changes are proposed. I should note that I really like cpusets--i.e., find them useful--and I'm painfully aware of the awkward interactions with mempolicy. On the other hand, I don't want to sacrifice mem policy capabilities to shoe horn them into cpusets. In fact, I want to add additional mechanisms that may also be awkward in cpusets. As you say, "if they don't fit your needs, don't use them." Later, Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 18:41 ` Paul Jackson 2007-10-29 19:01 ` Lee Schermerhorn @ 2007-10-30 23:17 ` David Rientjes 2007-10-31 0:03 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-30 23:17 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee Schermerhorn, clameter, akpm, ak, linux-kernel On Mon, 29 Oct 2007, Paul Jackson wrote: > But in any case, we (the kernel) are just providing the mechanisms. > If they don't fit ones needs, don't use them ;). > The kernel is providing the mechanism to interleave over a set of nodes or prefer a single node for allocations, but it also provides for remapping those to different nodes, without regard to locality or affinity to specific hardware, when the cpuset changes. That's what Choice C is intended to replace: a node means a node so either you get an effected mempolicy over the nodemask you asked for, or MPOL_DEFAULT is used because you lack sufficient access. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 23:17 ` David Rientjes @ 2007-10-31 0:03 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-31 0:03 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel David wrote: > That's what Choice C is intended to replace Yes, one remaps nodes it can't provide, and the other removes nodes it can't provide. Yup - that's a logical difference. So ... I would think that the only solution that would be satisfactory to apps that require specific hardware nodes would be to simply not move them in the first place. If you do that, then none of these Choices matter in the slightest. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 15:10 ` Lee Schermerhorn 2007-10-29 18:41 ` Paul Jackson @ 2007-10-30 22:57 ` David Rientjes 2007-10-30 23:46 ` Paul Jackson 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-30 22:57 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: Paul Jackson, clameter, akpm, ak, linux-kernel On Mon, 29 Oct 2007, Lee Schermerhorn wrote: > And even when the intent is to preserve the cpuset relative positions of > the nodes in the nodemask, this really only makes sense if the original > and modified cpusets have the same physical topology w/rt multi-level > NUMA interconnects. This is something that has bothered me about > dynamic cpusets and current policy remapping. We don't do a good job of > explaining the implications of changing cpuset topology on applications, > nor do we handle it very well in the code. Paul addresses one of my > concerns in a later message in this thread, so I'll comment there. > I agree with your assessment of our current policy remapping with respect to the passed nodemask, I think it's troublesome. Whether we can change that now is another question, but the remap certainly doesn't help respect the intent of the application and the mempolicies they have set up when influenced by an outside entity such as cpusets. See my new Choice C alternative. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 22:57 ` David Rientjes @ 2007-10-30 23:46 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-30 23:46 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel David wrote: > but the remap certainly doesn't help respect the intent of the > application and the mempolicies they have set up when influenced > by an outside entity such as cpusets. ... guess that depends on the intent, doesn't it? -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 18:46 ` David Rientjes 2007-10-26 19:00 ` Paul Jackson @ 2007-10-26 20:43 ` Lee Schermerhorn 1 sibling, 0 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 20:43 UTC (permalink / raw) To: David Rientjes; +Cc: Paul Jackson, Christoph Lameter, akpm, ak, linux-kernel On Fri, 2007-10-26 at 11:46 -0700, David Rientjes wrote: > On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > > Actually, my patch doesn't change the set_mempolicy() API at all, it > > just co-opts a currently unused/illegal value for the nodemask to > > indicate "all allowed nodes". Again, I need to provide a libnuma API to > > request this. Soon come, mon... > > > > If something that was previously unaccepted is now allowed with a > newly-introduced semantic, that's an API change. Well, it's an extension for sure, but a backward compatible one. It should not affect any correct existing application--i.e., one that checks it's return status--except maybe the odd test program that needs to be updated to handle the new semantics. We're allowed to extend APIs as long as we don't break correct applications, right? I mean, it's not like it's a new argument or such. Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 0:28 ` Christoph Lameter 2007-10-26 1:55 ` Paul Jackson @ 2007-10-26 15:18 ` Lee Schermerhorn 2007-10-26 17:36 ` Christoph Lameter 2007-10-26 18:45 ` David Rientjes 1 sibling, 2 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-26 15:18 UTC (permalink / raw) To: Christoph Lameter Cc: David Rientjes, Andrew Morton, Andi Kleen, Paul Jackson, linux-kernel On Thu, 2007-10-25 at 17:28 -0700, Christoph Lameter wrote: > On Thu, 25 Oct 2007, David Rientjes wrote: > > > The problem occurs when you add cpusets into the mix and permit the > > allowed nodes to change without knowledge to the application. Right now, > > a simple remap is done so if the cardinality of the set of nodes > > decreases, you're interleaving over a smaller number of nodes. If the > > cardinality increases, your interleaved nodemask isn't expanded. That's > > the problem that we're facing. The remap itself is troublesome because it > > doesn't take into account the user's desire for a custom nodemask to be > > used anyway; it could remap an interleaved policy over several nodes that > > will already be contended with one another. > > Right. So I think we are fine if the application cannot setup boundaries > for interleave. > > > > Normally, MPOL_INTERLEAVE is used to reduce bus contention to improve the > > throughput of the application. If you remap the number of nodes to > > interleave over, which is currently how it's done when mems_allowed > > changes, you could actually be increasing latency because you're > > interleaving over the same bus. > > Well you may hit some nodes more than others so a slight performance > degradataion. > > > This isn't a memory policy problem because all it does is effect a > > specific policy over a set of nodes. With my change, cpusets are required > > to update the interleaved nodemask if the user specified that they desire > > the feature with interleave_over_allowed. 
Cpusets are, after all, the > > ones that changed the mems_allowed in the first place and invalidated our > > custom interleave policy. We simply can't make inferences about what we > > should do, so we allow the creator of the cpuset to specify it for us. So > > the proper place to modify an interleaved policy is in cpusets and not > > mempolicy itself. > > With that MPOL_INTERLEAVE would be context dependent and no longer > needs translation. Lee had similar ideas. Lee: Could we make > MPOL_INTERLEAVE generally cpuset context dependent? > That's what my "cpuset-independent interleave" patch does. David doesn't like the "null node mask" interface because it doesn't work with libnuma. I plan to fix that, but I'm chasing other issues. I should get back to the mempol work after today. What I like about the cpuset independent interleave is that the "policy remap" when cpusets are changed is a NO-OP--no need to change the policy. Just as "preferred local" policy chooses the node where the allocation occurs, my cpuset independent interleave patch interleaves across the set of nodes available at the time of the allocation. The application has to specifically ask for this behavior by the null/empty nodemask or the TBD libnuma API. IMO, this is the only reasonable interleave policy for apps running in dynamic cpusets. An aside: if David et al [at google] are using cpusets on fake numa for resource management [I don't know this is the case, but saw some discussions way back that indicate it might be?], then maybe this becomes less of an issue when control groups [a.k.a. containers] and memory resource controls come to fruition? Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 15:18 ` Lee Schermerhorn @ 2007-10-26 17:36 ` Christoph Lameter 2007-10-26 18:45 ` David Rientjes 1 sibling, 0 replies; 98+ messages in thread From: Christoph Lameter @ 2007-10-26 17:36 UTC (permalink / raw) To: Lee Schermerhorn Cc: David Rientjes, Andrew Morton, Andi Kleen, Paul Jackson, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > > With that MPOL_INTERLEAVE would be context dependent and no longer > > needs translation. Lee had similar ideas. Lee: Could we make > > MPOL_INTERLEAVE generally cpuset context dependent? > > > > That's what my "cpuset-independent interleave" patch does. David > doesn't like the "null node mask" interface because it doesn't work with > libnuma. I plan to fix that, but I'm chasing other issues. I should > get back to the mempol work after today. But this makes it cpuset dependent. The set of nodes is dependent on the cpuset. If it would be independent then interleave could allow any nodes outside of the cpuset. > What I like about the cpuset independent interleave is that the "policy > remap" when cpusets are changed is a NO-OP--no need to change the > policy. Just as "preferred local" policy chooses the node where the > allocation occurs, my cpuset independent interleave patch interleaves > across the set of nodes available at the time of the allocation. The > application has to specifically ask for this behavior by the null/empty > nodemask or the TBD libnuma API. IMO, this is the only reasonable > interleave policy for apps running in dynamic cpusets. Hmmm.. But its an API change and requires more special casing. > An aside: if David et al [at google] are using cpusets on fake numa for > resource management [I don't know this is the case, but saw some > discussions way back that indicate it might be?], then maybe this > becomes less of an issue when control groups [a.k.a. containers] and > memory resource controls come to fruition? 
Yes very likely. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 15:18 ` Lee Schermerhorn 2007-10-26 17:36 ` Christoph Lameter @ 2007-10-26 18:45 ` David Rientjes 2007-10-26 19:02 ` Paul Jackson 2007-10-27 19:16 ` David Rientjes 1 sibling, 2 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 18:45 UTC (permalink / raw) To: Lee Schermerhorn Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson, linux-kernel On Fri, 26 Oct 2007, Lee Schermerhorn wrote: > That's what my "cpuset-independent interleave" patch does. David > doesn't like the "null node mask" interface because it doesn't work with > libnuma. I plan to fix that, but I'm chasing other issues. I should > get back to the mempol work after today. > Hacking and requiring an updated version of libnuma to allow empty nodemasks to be passed is a poor solution; if mempolicies are supposed to be independent from cpusets, then what semantics does an empty nodemask actually imply when using MPOL_INTERLEAVE? To me, it means the entire set_mempolicy() should be a no-op, and that's exactly how mainline currently treats it _as_well_ as libnuma. So justifying this change in the man page is respectable, but passing an empty nodemask just doesn't make sense. > What I like about the cpuset independent interleave is that the "policy > remap" when cpusets are changed is a NO-OP--no need to change the > policy. Just as "preferred local" policy chooses the node where the > allocation occurs, my cpuset independent interleave patch interleaves > across the set of nodes available at the time of the allocation. The > application has to specifically ask for this behavior by the null/empty > nodemask or the TBD libnuma API. IMO, this is the only reasonable > interleave policy for apps running in dynamic cpusets. > Passing empty nodemasks with MPOL_INTERLEAVE to set_mempolicy() is the only reasonable way of specifying you want, at all times, to interleave over all available nodes? I doubt it. 
I personally prefer an approach where cpusets take the responsibility for determining how policies change (they use set_mempolicy() anyway to effect their mems boundaries) because it's cpusets that has changed the available nodemask out from beneath the application. So instead of trying to create a solution where cpusets impact mempolicies and mempolicies impact cpusets, it should only be in a single direction. Cpusets change the set of available nodes and should update the attached tasks' mempolicies at the same time. That's the same as saying that cpusets should be built on top of mempolicies, which they are, and shouldn't have any reverse dependency. > An aside: if David et al [at google] are using cpusets on fake numa for > resource management [I don't know this is the case, but saw some > discussions way back that indicate it might be?], then maybe this > becomes less of an issue when control groups [a.k.a. containers] and > memory resource controls come to fruition? > Completely irrelevant; I care about the interaction between cpusets and mempolicies in mainline Linux. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 18:45 ` David Rientjes @ 2007-10-26 19:02 ` Paul Jackson 2007-10-27 19:16 ` David Rientjes 1 sibling, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-26 19:02 UTC (permalink / raw) To: David Rientjes; +Cc: Lee.Schermerhorn, clameter, akpm, ak, linux-kernel David wrote: > I personally prefer an approach where cpusets take the responsibility for > determining how policies change (they use set_mempolicy() anyway to effect > their mems boundaries) because it's cpusets that has changed the available > nodemask out from beneath the application. Agreed. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 18:45 ` David Rientjes 2007-10-26 19:02 ` Paul Jackson @ 2007-10-27 19:16 ` David Rientjes 2007-10-29 16:23 ` Lee Schermerhorn 1 sibling, 1 reply; 98+ messages in thread From: David Rientjes @ 2007-10-27 19:16 UTC (permalink / raw) To: Lee Schermerhorn Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson, linux-kernel On Fri, 26 Oct 2007, David Rientjes wrote: > Hacking and requiring an updated version of libnuma to allow empty > nodemasks to be passed is a poor solution; if mempolicy's are supposed to > be independent from cpusets, then what semantics does an empty nodemask > actually imply when using MPOL_INTERLEAVE? To me, it means the entire > set_mempolicy() should be a no-op, and that's exactly how mainline > currently treats it _as_well_ as libnuma. So justifying this change in > the man page is respectible, but passing an empty nodemask just doesn't > make sense. > Another reason that passing an empty nodemask to set_mempolicy() doesn't make sense is that libnuma uses numa_set_interleave_mask(&numa_no_nodes) to disable interleaving completely. David ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-27 19:16 ` David Rientjes @ 2007-10-29 16:23 ` Lee Schermerhorn 2007-10-29 17:35 ` Andi Kleen 2007-10-29 19:35 ` Paul Jackson 0 siblings, 2 replies; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-29 16:23 UTC (permalink / raw) To: David Rientjes Cc: Christoph Lameter, Andrew Morton, Andi Kleen, Paul Jackson, linux-kernel On Sat, 2007-10-27 at 12:16 -0700, David Rientjes wrote: > On Fri, 26 Oct 2007, David Rientjes wrote: > > > Hacking and requiring an updated version of libnuma to allow empty > > nodemasks to be passed is a poor solution; if mempolicy's are supposed to > > be independent from cpusets, then what semantics does an empty nodemask > > actually imply when using MPOL_INTERLEAVE? To me, it means the entire > > set_mempolicy() should be a no-op, and that's exactly how mainline > > currently treats it _as_well_ as libnuma. So justifying this change in > > the man page is respectible, but passing an empty nodemask just doesn't > > make sense. > > > > Another reason that passing an empty nodemask to set_mempolicy() doesn't > make sense is that libnuma uses numa_set_interleave_mask(&numa_no_nodes) > to disable interleaving completely. > David: as we discussed when you contacted me off-list about this, the libnuma API and the system call interface are two quite different APIs. For example, numa_set_interleave_mask(&numa_no_nodes) does not pass MPOL_INTERLEAVE with an empty mask to set_mempolicy(). Rather it "installs" an MPOL_DEFAULT policy which internally just deletes the task's mempolicy, allowing fallback to system default policy. I would not propose to change this behavior, nor break libnuma in any way. For others, who weren't involved in the off-list exchange, here's an excerpt from my response to David: [ At the libnuma level, I think we need an explicit "numa_set_interleave_allowed()"--analogous to "numa_set_localalloc()". 
The current "numa_alloc_interleaved()" should, I think, allocate on all *allowed* nodes, rather than all nodes. It can do this using the sys call interface as defined. Independent of cpuset-independent interleave, an application needs to pass a valid subset of the current mems allowed to "numa_alloc_interleaved_subset()". An application can now obtain the mems_allowed using the MPOL_F_MEMS_ALLOWED flag that I added, but we need a libnuma wrapper for this as well. [Yeah, this info can change at any time, but that's always been the case....] "numa_interleave_memory()" is essentially mbind(), I think [not looking at the libnuma source code at this moment]. Maybe provide "numa_interleave_memory_allowed(void *mem, size_t size)" ??? Finally, I think we need to add a query function: "nodemask_t numa_get_mems_allowed()" to return the mask of valid nodes in the current context [cpuset]. This would just be a wrapper around get_mempolicy() with the MPOL_F_MEMS_ALLOWED flag. ] Couple of comments on the above: 1. "the sys call interface as defined" in the 2nd paragraph of the except refers to my patch that uses null/empty nodemask to indicate "all allowed". 2. As this thread progresses, you've discussed relaxing the requirement that applications pass a valid subset of mems_allowed. I.e., something that was illegal becomes legal. An API change, I think. But, a backward compatible one, so that's OK, right? :-) 3. If we do change the semantics of the mempolicy system calls to allow nodes outside of the cpuset, then maybe we don't need to query the mems allowed. I still find it useful, but not absolutely necessary--e.g., to construct a nodemask that will be acceptable in the current cpuset. 4. I looked at libnuma source. numa_interleave_memory() does use mbind() which, again, does not complain about nodemasks that include non-allowed nodes. Another thing occurs to me: perhaps numactl would need an additional 'nodes' specifier such as 'allowed'. 
Alternatively, 'all' could be redefined to mean 'all allowed'. This is independent of how you specify 'all allowed' to the system call. Regards, Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:23 ` Lee Schermerhorn @ 2007-10-29 17:35 ` Andi Kleen 2007-10-29 19:35 ` Paul Jackson 1 sibling, 0 replies; 98+ messages in thread From: Andi Kleen @ 2007-10-29 17:35 UTC (permalink / raw) To: Lee Schermerhorn Cc: David Rientjes, Christoph Lameter, Andrew Morton, Paul Jackson, linux-kernel > > Another thing occurs to me: perhaps numactl would need an additional > 'nodes' specifier such as 'allowed'. Alternatively, 'all' could be > redefined to me 'all allowed'. This is independent of how you specify > 'all allowed' to the system call. cpuset support in libnuma/numactl is still incomplete. I'm also not sure what the best way to handle this is. Probably there should be a switch for both. -Andi ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 16:23 ` Lee Schermerhorn 2007-10-29 17:35 ` Andi Kleen @ 2007-10-29 19:35 ` Paul Jackson 2007-10-29 20:36 ` Christoph Lameter 2007-10-29 21:08 ` Andi Kleen 1 sibling, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 19:35 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, clameter, akpm, ak, linux-kernel Lee wrote: > 2. As this thread progresses, you've discussed relaxing the requirement > that applications pass a valid subset of mems_allowed. I.e., something > that was illegal becomes legal. An API change, I think. But, a > backward compatible one, so that's OK, right? :-) The more I have stared at this, the more certain I've become that we need to make the mbind/mempolicy calls modal -- the default mode continues to interpret node numbers and masks just as these calls do now, and the alternative mode provides the so called "Choice B", which takes node numbers and masks as if the task owned the entire system, and then the kernel internally and automatically scrunches those masks down to whatever happens to be the current cpuset of the task. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 19:35 ` Paul Jackson @ 2007-10-29 20:36 ` Christoph Lameter 2007-10-29 21:08 ` Andi Kleen 1 sibling, 0 replies; 98+ messages in thread From: Christoph Lameter @ 2007-10-29 20:36 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee Schermerhorn, rientjes, akpm, ak, linux-kernel On Mon, 29 Oct 2007, Paul Jackson wrote: > The more I have stared at this, the more certain I've become that we > need to make the mbind/mempolicy calls modal -- the default mode > continues to interpret node numbers and masks just as these calls do > now, and the alternative mode provides the so called "Choice B", > which takes node numbers and masks as if the task owned the entire > system, and then the kernel internally and automatically scrunches > those masks down to whatever happens to be the current cpuset of > the task. Ack. ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 19:35 ` Paul Jackson 2007-10-29 20:36 ` Christoph Lameter @ 2007-10-29 21:08 ` Andi Kleen 2007-10-29 22:48 ` Paul Jackson 2007-10-30 19:47 ` Paul Jackson 1 sibling, 2 replies; 98+ messages in thread From: Andi Kleen @ 2007-10-29 21:08 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee Schermerhorn, rientjes, clameter, akpm, linux-kernel On Monday 29 October 2007 20:35:58 Paul Jackson wrote: > Lee wrote: > > 2. As this thread progresses, you've discussed relaxing the requirement > > that applications pass a valid subset of mems_allowed. I.e., something > > that was illegal becomes legal. An API change, I think. But, a > > backward compatible one, so that's OK, right? :-) > > The more I have stared at this, the more certain I've become that we > need to make the mbind/mempolicy calls modal -- the default mode > continues to interpret node numbers and masks just as these calls do > now, and the alternative mode provides the so called "Choice B", > which takes node numbers and masks as if the task owned the entire > system, and then the kernel internally and automatically scrunches > those masks down to whatever happens to be the current cpuset of > the task. So the user space asks for 8 nodes because it knows the machine has that many from /sys and it only gets 4 if a cpuset says so? That's just bad semantics. And is not likely to make the user programs happy. I don't think you'll get around to teaching user space (or rather libnuma) about cpusets and let it handle it. >From the libnuma perspective the machine size would be essentially current cpuset size. On the syscall level I don't think it makes much sense to change though. The alternative would be to throw out the complete cpuset concept and go for virtual nodes inside containers with virtualized /sys. -Andi ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 21:08 ` Andi Kleen @ 2007-10-29 22:48 ` Paul Jackson 2007-10-30 19:47 ` Paul Jackson 1 sibling, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-29 22:48 UTC (permalink / raw) To: Andi Kleen; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel > So the user space asks for 8 nodes because it knows the machine > has that many from /sys and it only gets 4 if a cpuset says so? That's > just bad semantics. And is not likely to make the user programs happy. That's no different than what can happen today -- if a task actually is in an 8 node cpuset, sets up its mempolicies accordingly, and then gets shoe horned into a 4 node cpuset. It's not good or bad; it's just interactions between two mechanisms. If your app doesn't run well in a small cpuset, don't run it there (or do run it there, poorly ;). -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-29 21:08 ` Andi Kleen 2007-10-29 22:48 ` Paul Jackson @ 2007-10-30 19:47 ` Paul Jackson 2007-10-30 20:20 ` Lee Schermerhorn 2007-10-30 20:27 ` Andi Kleen 1 sibling, 2 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-30 19:47 UTC (permalink / raw) To: Andi Kleen; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel Andi, Christoph, or whomever: Are there any good regression tests of mempolicy functionality? This patch I'm coding is delicate enough that I probably broke something. It would be nice to catch it sooner rather than later. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 19:47 ` Paul Jackson @ 2007-10-30 20:20 ` Lee Schermerhorn 2007-10-30 20:26 ` Paul Jackson 2007-10-30 20:27 ` Andi Kleen 1 sibling, 1 reply; 98+ messages in thread From: Lee Schermerhorn @ 2007-10-30 20:20 UTC (permalink / raw) To: Paul Jackson; +Cc: Andi Kleen, rientjes, clameter, akpm, linux-kernel On Tue, 2007-10-30 at 12:47 -0700, Paul Jackson wrote: > Andi, Christoph, or whomever: > > Are there any good regression tests of mempolicy functionality? Paul: Andi has a regression test in the numactl source package. Try: http://freshmeat.net/redir/numactl/62210/url_tgz/numactl-1.0.2.tar.gz Lee ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 20:20 ` Lee Schermerhorn @ 2007-10-30 20:26 ` Paul Jackson 0 siblings, 0 replies; 98+ messages in thread From: Paul Jackson @ 2007-10-30 20:26 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: ak, rientjes, clameter, akpm, linux-kernel Lee wrote: > Paul: Andi has a regression test in the numactl source package. Good - thanks. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-30 19:47 ` Paul Jackson 2007-10-30 20:20 ` Lee Schermerhorn @ 2007-10-30 20:27 ` Andi Kleen 1 sibling, 0 replies; 98+ messages in thread From: Andi Kleen @ 2007-10-30 20:27 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee.Schermerhorn, rientjes, clameter, akpm, linux-kernel On Tuesday 30 October 2007 20:47:51 Paul Jackson wrote: > Andi, Christoph, or whomever: > > Are there any good regression tests of mempolicy functionality? numactl has some basic tests (make test). I think newer LTP also has some but i haven't looked at them. And there is Lee's memtoy which does some things; but I don't think it's very automated. -Andi ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes 2007-10-25 23:37 ` Christoph Lameter @ 2007-10-26 1:13 ` Paul Jackson 2007-10-26 1:30 ` David Rientjes 1 sibling, 1 reply; 98+ messages in thread From: Paul Jackson @ 2007-10-26 1:13 UTC (permalink / raw) To: David Rientjes; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel I'm probably going to be ok with this ... after a bit. 1) First concern - my primary issue: One thing I really want to change, the name of the per-cpuset file that controls this option. You call it "interleave_over_allowed". Take a look at the existing per-cpuset file names: $ grep 'name = "' kernel/cpuset.c .name = "cpuset", .name = "cpus", .name = "mems", .name = "cpu_exclusive", .name = "mem_exclusive", .name = "sched_load_balance", .name = "memory_migrate", .name = "memory_pressure_enabled", .name = "memory_pressure", .name = "memory_spread_page", .name = "memory_spread_slab", .name = "cpuset", The name of every memory related option starts with "mem" or "memory", and the name of every memory interleave related option starts with "memory_spread_*". Can we call this "memory_spread_user" instead, or something else matching "memory_spread_*" ? The names of things in the public API's are a big issue of mine. 2) Second concern - lessor code clarity issue: The logic surrounding current_cpuset_interleaved_mems() seems a tad opaque to me. It appears on the surface as if the memory policy code, in mm/mempolicy.c, is getting a nodemask from the cpuset code by calling this routine, as if there were an independent per-cpuset nodemask stating over what nodes to interleave for MPOL_INTERLEAVE. But all that is returned is either (1) an empty node mask or (2) the current tasks allowed cpu mask. If an empty mask is returned, this tells the MPOL_INTERLEAVE code to use the mask the user specified in an earlier set_mempolicy MPOL_INTERLEAVE call. 
If a non-empty mask is returned, then the previous user specified mask is ignored and that non-empty mask (just all the current cpusets allowed nodes) is used instead. Restating this in pseudo code, from your patch, the mempolicy.c MPOL_INTERLEAVE code to rebind memory policies after a cpuset changes reads: tmp = current_cpuset_interleaved_mems(); if tmp empty: rebind over tmp (all the cpusets allowed nodes) break; rebind over the set_mempolicy MPOL_INTERLEAVE specified mask break; The above code is asymmetric, and the returning of a nodemask is an illusion, suggesting that cpusets might have an interleaved nodemask separate from the allowed memory nodemask. How about instead of your current_cpuset_interleaved_mems() routine that returns a nodemask, rather have a routine that returns a Boolean, indicating whether this new flag is set, used as in: if (cpuset_is_memory_spread_user()) tmp = cpuset_current_mems_allowed(); else nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; I'll wager this saves a few bytes of kernel text space as well. 3) Maybe I haven't had enough caffeine yet third issue: The existing kernel code for mm/mempolicy.c:mpol_rebind_policy() looks buggy to me. The node_remap() call for the MPOL_INTERLEAVE case seems like it should come before, not after, updating mpolmask to the newmask. Fixing that, and consolidating the multiple lines doing "*mpolmask = *newmask" for each case, into a single such line at the end of the switch(){} statement, results in the following patch. Could you confirm my suspicions and push this one too? It should be a part of your patch set, so we don't waste Andrew's time resolving the inevitable patch collisions we'll see otherwise. 
--- 2.6.23-mm1.orig/mm/mempolicy.c 2007-10-16 18:55:34.745039423 -0700 +++ 2.6.23-mm1/mm/mempolicy.c 2007-10-25 18:06:08.474742762 -0700 @@ -1741,14 +1741,12 @@ static void mpol_rebind_policy(struct me case MPOL_INTERLEAVE: nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - *mpolmask = *newmask; current->il_next = node_remap(current->il_next, *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, *mpolmask, *newmask); - *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1773,13 +1771,14 @@ static void mpol_rebind_policy(struct me kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } - *mpolmask = *newmask; break; } default: BUG(); break; } + + *mpolmask = *newmask; } /* Thanks. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 98+ messages in thread
* Re: [patch 2/2] cpusets: add interleave_over_allowed option 2007-10-26 1:13 ` Paul Jackson @ 2007-10-26 1:30 ` David Rientjes 0 siblings, 0 replies; 98+ messages in thread From: David Rientjes @ 2007-10-26 1:30 UTC (permalink / raw) To: Paul Jackson; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Paul Jackson wrote: > Can we call this "memory_spread_user" instead, or something else > matching "memory_spread_*" ? > Sounds better. I was hoping somebody was going to come forward with an alternative that sounded better than interleave_over_allowed. > How about instead of your current_cpuset_interleaved_mems() routine > that returns a nodemask, rather have a routine that returns a Boolean, > indicating whether this new flag is set, used as in: > if (cpuset_is_memory_spread_user()) > tmp = cpuset_current_mems_allowed(); > else > nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); > pol->v.nodes = tmp; > That sounds reasonable, it will simply be a wrapper around is_interleave_over_allowed() or what we're now calling is_spread_user(). > The existing kernel code for mm/mempolicy.c:mpol_rebind_policy() > looks buggy to me. The node_remap() call for the MPOL_INTERLEAVE > case seems like it should come before, not after, updating mpolmask > to the newmask. Fixing that, and consolidating the multiple lines > doing "*mpolmask = *newmask" for each case, into a single such line > at the end of the switch(){} statement, results in the following > patch. Could you confirm my suspicions and push this one too. > It should be a part of your patch set, so we don't waste Andrew's > time resolving the inevitable patch collisions we'll see otherwise. > For setting current->il_next, both cases work but yours will be better balanced for the next interleaved allocation. I'll apply it to my patchset. Thanks for the review. David ^ permalink raw reply [flat|nested] 98+ messages in thread
end of thread, other threads:[~2007-10-31 0:29 UTC | newest] Thread overview: 98+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2007-10-25 22:54 [patch 1/2] cpusets: extract mmarray loading from update_nodemask David Rientjes 2007-10-25 22:54 ` [patch 2/2] cpusets: add interleave_over_allowed option David Rientjes 2007-10-25 23:37 ` Christoph Lameter 2007-10-25 23:56 ` David Rientjes 2007-10-26 0:28 ` Christoph Lameter 2007-10-26 1:55 ` Paul Jackson 2007-10-26 2:11 ` David Rientjes 2007-10-26 2:29 ` Paul Jackson 2007-10-26 2:45 ` David Rientjes 2007-10-26 3:14 ` Paul Jackson 2007-10-26 3:58 ` David Rientjes 2007-10-26 4:34 ` Paul Jackson 2007-10-26 15:37 ` Lee Schermerhorn 2007-10-26 17:04 ` Paul Jackson 2007-10-26 17:28 ` Lee Schermerhorn 2007-10-26 20:21 ` Michael Kerrisk 2007-10-26 20:25 ` Paul Jackson 2007-10-26 20:33 ` Michael Kerrisk 2007-10-26 15:30 ` Lee Schermerhorn 2007-10-26 18:46 ` David Rientjes 2007-10-26 19:00 ` Paul Jackson 2007-10-26 20:45 ` David Rientjes 2007-10-26 21:05 ` Christoph Lameter 2007-10-26 21:08 ` David Rientjes 2007-10-26 21:12 ` Christoph Lameter 2007-10-26 21:15 ` David Rientjes 2007-10-26 21:13 ` Lee Schermerhorn 2007-10-26 21:17 ` Christoph Lameter 2007-10-26 21:26 ` Lee Schermerhorn 2007-10-26 21:37 ` Christoph Lameter 2007-10-29 15:00 ` Lee Schermerhorn 2007-10-29 17:33 ` Paul Jackson 2007-10-29 17:46 ` Lee Schermerhorn 2007-10-29 20:35 ` Christoph Lameter 2007-10-26 21:18 ` David Rientjes 2007-10-26 21:31 ` Lee Schermerhorn 2007-10-26 21:39 ` David Rientjes 2007-10-27 1:07 ` Paul Jackson 2007-10-27 1:26 ` Christoph Lameter 2007-10-27 2:41 ` Paul Jackson 2007-10-27 2:50 ` Christoph Lameter 2007-10-27 5:16 ` Paul Jackson 2007-10-27 6:07 ` Christoph Lameter 2007-10-27 8:36 ` Paul Jackson 2007-10-27 17:47 ` Christoph Lameter 2007-10-27 20:59 ` Paul Jackson 2007-10-27 17:50 ` David Rientjes 2007-10-27 23:19 ` Paul Jackson 2007-10-28 18:19 ` David Rientjes 2007-10-28 23:46 ` Paul 
Jackson 2007-10-29 1:04 ` David Rientjes 2007-10-29 4:27 ` Paul Jackson 2007-10-29 4:47 ` David Rientjes 2007-10-29 5:45 ` Paul Jackson 2007-10-29 7:00 ` David Rientjes 2007-10-29 7:26 ` Paul Jackson 2007-10-30 22:53 ` David Rientjes 2007-10-30 23:17 ` Paul Jackson 2007-10-30 23:25 ` David Rientjes 2007-10-31 0:03 ` Paul Jackson 2007-10-31 0:05 ` Paul Jackson 2007-10-29 7:15 ` Paul Jackson 2007-10-30 23:12 ` David Rientjes 2007-10-30 23:44 ` Paul Jackson 2007-10-30 23:53 ` David Rientjes 2007-10-31 0:29 ` Paul Jackson 2007-10-29 16:54 ` Lee Schermerhorn 2007-10-29 19:40 ` Paul Jackson 2007-10-29 19:45 ` Paul Jackson 2007-10-29 19:57 ` Paul Jackson 2007-10-29 20:02 ` Paul Jackson 2007-10-27 17:45 ` David Rientjes 2007-10-27 21:22 ` Paul Jackson 2007-10-29 15:10 ` Lee Schermerhorn 2007-10-29 18:41 ` Paul Jackson 2007-10-29 19:01 ` Lee Schermerhorn 2007-10-30 23:17 ` David Rientjes 2007-10-31 0:03 ` Paul Jackson 2007-10-30 22:57 ` David Rientjes 2007-10-30 23:46 ` Paul Jackson 2007-10-26 20:43 ` Lee Schermerhorn 2007-10-26 15:18 ` Lee Schermerhorn 2007-10-26 17:36 ` Christoph Lameter 2007-10-26 18:45 ` David Rientjes 2007-10-26 19:02 ` Paul Jackson 2007-10-27 19:16 ` David Rientjes 2007-10-29 16:23 ` Lee Schermerhorn 2007-10-29 17:35 ` Andi Kleen 2007-10-29 19:35 ` Paul Jackson 2007-10-29 20:36 ` Christoph Lameter 2007-10-29 21:08 ` Andi Kleen 2007-10-29 22:48 ` Paul Jackson 2007-10-30 19:47 ` Paul Jackson 2007-10-30 20:20 ` Lee Schermerhorn 2007-10-30 20:26 ` Paul Jackson 2007-10-30 20:27 ` Andi Kleen 2007-10-26 1:13 ` Paul Jackson 2007-10-26 1:30 ` David Rientjes
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox