* [patch 1/3] cpusets: extract mmarray loading from update_nodemask
@ 2007-10-26 2:14 David Rientjes
2007-10-26 2:14 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup David Rientjes
0 siblings, 1 reply; 15+ messages in thread
From: David Rientjes @ 2007-10-26 2:14 UTC (permalink / raw)
To: Andrew Morton
Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn,
linux-kernel
Extract a helper function from update_nodemask() to load an array of
mm_struct pointers with references to each task's mm_struct that is
currently attached to a given cpuset.
This will be used later for other purposes where memory policies need to
be rebound for each task attached to a cpuset.
Cc: Andi Kleen <ak@suse.de>
Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: David Rientjes <rientjes@google.com>
---
kernel/cpuset.c | 130 ++++++++++++++++++++++++++++++++++---------------------
1 files changed, 81 insertions(+), 49 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -702,6 +702,79 @@ done:
/* Don't kfree(doms) -- partition_sched_domains() does that. */
}
+/*
+ * Loads mmarray with pointers to all the mm_struct's of tasks attached to
+ * cpuset cs.
+ *
+ * The reference count to each mm is incremented before loading it into the
+ * array, so put_cpuset_mm_array() must be called after this function to
+ * decrement each reference count and free the memory allocated for mmarray
+ * via this function.
+ */
+static struct mm_struct **get_cpuset_mm_array(const struct cpuset *cs,
+ int *ntasks)
+{
+ struct mm_struct **mmarray;
+ struct task_struct *p;
+ struct cgroup_iter it;
+ int count;
+ int fudge;
+
+ *ntasks = 0;
+ fudge = 10; /* spare mmarray[] slots */
+ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+ /*
+ * Allocate mmarray[] to hold mm reference for each task in cpuset cs.
+ * Can't kmalloc GFP_KERNEL while holding tasklist_lock. We could use
+ * GFP_ATOMIC, but with a few more lines of code, we can retry until
+ * we get a big enough mmarray[] w/o using GFP_ATOMIC.
+ */
+ while (1) {
+ count = cgroup_task_count(cs->css.cgroup); /* guess */
+ count += fudge;
+ mmarray = kmalloc(count * sizeof(*mmarray), GFP_KERNEL);
+ if (!mmarray)
+ return NULL;
+ read_lock(&tasklist_lock); /* block fork */
+ if (cgroup_task_count(cs->css.cgroup) <= count)
+ break; /* got enough */
+ read_unlock(&tasklist_lock); /* try again */
+ kfree(mmarray);
+ }
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+ cgroup_iter_start(cs->css.cgroup, &it);
+ while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
+ struct mm_struct *mm;
+
+ if (*ntasks >= count) {
+ printk(KERN_WARNING
+ "Cpuset mempolicy rebind incomplete.\n");
+ break;
+ }
+ mm = get_task_mm(p);
+ if (!mm)
+ continue;
+ mmarray[(*ntasks)++] = mm;
+ }
+ cgroup_iter_end(cs->css.cgroup, &it);
+ read_unlock(&tasklist_lock);
+ return mmarray;
+}
+
+/*
+ * Decrements the reference count to each mm in mmarray and frees the memory
+ * allocated for mmarray.
+ *
+ * To be used in conjunction with get_cpuset_mm_array().
+ */
+static void put_cpuset_mm_array(struct mm_struct **mmarray, int ntasks)
+{
+ while (ntasks-- > 0)
+ mmput(mmarray[ntasks]);
+ kfree(mmarray);
+}
+
static inline int started_after_time(struct task_struct *t1,
struct timespec *time,
struct task_struct *t2)
@@ -915,13 +988,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
nodemask_t oldmem;
- struct task_struct *p;
struct mm_struct **mmarray;
- int i, n, ntasks;
+ int i, n;
int migrate;
- int fudge;
int retval;
- struct cgroup_iter it;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -963,50 +1033,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mutex_unlock(&callback_mutex);
cpuset_being_rebound = cs; /* causes mpol_copy() rebind */
-
- fudge = 10; /* spare mmarray[] slots */
- fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
retval = -ENOMEM;
-
- /*
- * Allocate mmarray[] to hold mm reference for each task
- * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
- * tasklist_lock. We could use GFP_ATOMIC, but with a
- * few more lines of code, we can retry until we get a big
- * enough mmarray[] w/o using GFP_ATOMIC.
- */
- while (1) {
- ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
- ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
- if (!mmarray)
- goto done;
- read_lock(&tasklist_lock); /* block fork */
- if (cgroup_task_count(cs->css.cgroup) <= ntasks)
- break; /* got enough */
- read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
- }
-
- n = 0;
-
- /* Load up mmarray[] with mm reference for each task in cpuset. */
- cgroup_iter_start(cs->css.cgroup, &it);
- while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
- struct mm_struct *mm;
-
- if (n >= ntasks) {
- printk(KERN_WARNING
- "Cpuset mempolicy rebind incomplete.\n");
- break;
- }
- mm = get_task_mm(p);
- if (!mm)
- continue;
- mmarray[n++] = mm;
- }
- cgroup_iter_end(cs->css.cgroup, &it);
- read_unlock(&tasklist_lock);
+ mmarray = get_cpuset_mm_array(cs, &n);
+ if (!mmarray)
+ goto done;
+ if (!n)
+ goto done_success;
/*
* Now that we've dropped the tasklist spinlock, we can
@@ -1028,12 +1060,12 @@ static int update_nodemask(struct cpuset *cs, char *buf)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
- mmput(mm);
}
/* We're done rebinding vma's to this cpusets new mems_allowed. */
- kfree(mmarray);
cpuset_being_rebound = NULL;
+done_success:
+ put_cpuset_mm_array(mmarray, n);
retval = 0;
done:
return retval;
^ permalink raw reply [flat|nested] 15+ messages in thread* [patch 2/3] mempolicy: mpol_rebind_policy cleanup 2007-10-26 2:14 [patch 1/3] cpusets: extract mmarray loading from update_nodemask David Rientjes @ 2007-10-26 2:14 ` David Rientjes 2007-10-26 2:14 ` [patch 3/3] cpusets: add memory_spread_user option David Rientjes 2007-10-26 2:46 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup Paul Jackson 0 siblings, 2 replies; 15+ messages in thread From: David Rientjes @ 2007-10-26 2:14 UTC (permalink / raw) To: Andrew Morton Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn, linux-kernel Set the memory policy nodemask to the new nodemask on rebind only in one place. The only memory policy that does not need an associated mpolmask is MPOL_DEFAULT. Cc: Andi Kleen <ak@suse.de> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: David Rientjes <rientjes@google.com> --- mm/mempolicy.c | 5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1741,14 +1741,12 @@ static void mpol_rebind_policy(struct mempolicy *pol, case MPOL_INTERLEAVE: nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - *mpolmask = *newmask; current->il_next = node_remap(current->il_next, *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, *mpolmask, *newmask); - *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1773,13 +1771,14 @@ static void mpol_rebind_policy(struct mempolicy *pol, kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } - *mpolmask = *newmask; break; } default: BUG(); break; } + if (pol->policy != MPOL_DEFAULT) + *mpolmask = *newmask; } /* ^ permalink raw reply [flat|nested] 15+ messages in thread
* [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 2:14 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup David Rientjes @ 2007-10-26 2:14 ` David Rientjes 2007-10-26 6:04 ` Paul Jackson 2007-10-26 2:46 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup Paul Jackson 1 sibling, 1 reply; 15+ messages in thread From: David Rientjes @ 2007-10-26 2:14 UTC (permalink / raw) To: Andrew Morton Cc: Andi Kleen, Paul Jackson, Christoph Lameter, Lee Schermerhorn, linux-kernel Adds a new 'memory_spread_user' option to cpusets. When a task with an MPOL_INTERLEAVE memory policy is attached to a cpuset with this option set, the interleaved nodemask becomes the cpuset's mems_allowed. When the cpuset's mems_allowed changes, the interleaved nodemask for all tasks with MPOL_INTERLEAVE memory policies is also updated to be the new mems_allowed nodemask. This allows applications to specify that they want to interleave over all nodes that they are allowed to access. This set of nodes can be changed at any time via the cpuset interface and each individual memory policy is updated to reflect the changes for all attached tasks when this option is set. Cc: Andi Kleen <ak@suse.de> Cc: Paul Jackson <pj@sgi.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com> Signed-off-by: David Rientjes <rientjes@google.com> --- Documentation/cpusets.txt | 29 +++++++++++++------- include/linux/cpuset.h | 6 ++++ kernel/cpuset.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 5 +++- 4 files changed, 91 insertions(+), 11 deletions(-) diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -292,10 +292,10 @@ times 1000. 1.6 What is memory spread ? --------------------------- -There are two boolean flag files per cpuset that control where the -kernel allocates pages for the file system buffers and related in -kernel data structures. 
They are called 'memory_spread_page' and -'memory_spread_slab'. +There are three boolean flag files per cpuset that control where the +kernel allocates pages for the file system buffers, kernel data +structures, and user memory. They are called 'memory_spread_page', +'memory_spread_slab', and 'memory_spread_user'. If the per-cpuset boolean flag file 'memory_spread_page' is set, then the kernel will spread the file system buffers (page cache) evenly @@ -308,8 +308,13 @@ such as for inodes and dentries evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. -The setting of these flags does not affect anonymous data segment or -stack segment pages of a task. +If the per-cpuset boolean flag file 'memory_spread_user' is set, then +all tasks with MPOL_INTERLEAVE memory policies that are attached to +the cpuset will receive interleaved nodemasks equal to the cpuset's +mems_allowed, regardless of what nodemask was passed to +set_mempolicy(). This nodemask will reflect the cpuset's mems_allowed +whenever a task is attached to the cpuset or the 'mems' file changes +for a cpuset with this boolean flag set. By default, both kinds of memory spreading are off, and memory pages are allocated on the node local to where the task is running, @@ -327,10 +332,10 @@ their containing tasks memory spread settings. If memory spreading is turned off, then the currently specified NUMA mempolicy once again applies to memory page allocations. -Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag -files. By default they contain "0", meaning that the feature is off -for that cpuset. If a "1" is written to that file, then that turns -the named feature on. +'memory_spread_page', 'memory_spread_slab', and 'memory_spread_user' +are boolean flag files. By default they contain "0", meaning that +the feature is off for that cpuset. 
If a "1" is written to that +file, then that turns the named feature on. The implementation is simple. @@ -345,6 +350,10 @@ Similarly, setting 'memory_spread_cache' turns on the flag PF_SPREAD_SLAB, and appropriately marked slab caches will allocate pages from the node returned by cpuset_mem_spread_node(). +Setting 'memory_spread_user' rebinds an MPOL_INTERLEAVE mempolicy to +the cpuset's mems_allowed whenever a task is attached to the cpuset +or the 'mems' file changes. + The cpuset_mem_spread_node() routine is also simple. It uses the value of a per-task rotor cpuset_mem_spread_rotor to select the next node in the current tasks mems_allowed to prefer for the allocation. diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ static inline int cpuset_do_slab_mem_spread(void) extern void cpuset_track_online_nodes(void); extern int current_cpuset_is_being_rebound(void); +extern int current_cpuset_is_spread_user(void); #else /* !CONFIG_CPUSETS */ @@ -157,6 +158,11 @@ static inline int current_cpuset_is_being_rebound(void) return 0; } +static inline int current_cpuset_is_spread_user(void) +{ + return 0; +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -121,6 +121,7 @@ typedef enum { CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_SPREAD_USER, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -154,6 +155,11 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_spread_user(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_USER, &cs->flags); +} + /* * Increment this integer everytime any cpuset changes its * mems_allowed value. 
Users of cpusets can track this generation @@ -1089,6 +1095,44 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +/* Rebinds the memory policies of all tasks attached to cs. + * + * Call with cgroup_mutex held. + */ +static int update_spread_user(struct cpuset *cs, char *buf) +{ + struct mm_struct **mmarray; + int ntasks; + int i; + + if (!simple_strtoul(buf, NULL, 10)) { + clear_bit(CS_SPREAD_USER, &cs->flags); + return 0; + } + + mmarray = get_cpuset_mm_array(cs, &ntasks); + if (!mmarray) + return -ENOMEM; + if (!ntasks) + goto done; + + for (i = 0; i < ntasks; i++) + mpol_rebind_mm(mmarray[i], &cs->mems_allowed); +done: + put_cpuset_mm_array(mmarray, ntasks); + set_bit(CS_SPREAD_USER, &cs->flags); + return 0; +} + +int current_cpuset_is_spread_user(void) +{ + int ret; + mutex_lock(&callback_mutex); + ret = is_spread_user(task_cs(current)); + mutex_unlock(&callback_mutex); + return ret; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1283,6 +1327,7 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, + FILE_SPREAD_USER, } cpuset_filetype_t; static ssize_t cpuset_common_file_write(struct cgroup *cont, @@ -1350,6 +1395,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, retval = update_flag(CS_SPREAD_SLAB, cs, buffer); cs->mems_generation = cpuset_mems_generation++; break; + case FILE_SPREAD_USER: + retval = update_spread_user(cs, buffer); + break; default: retval = -EINVAL; goto out2; @@ -1446,6 +1494,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SPREAD_SLAB: *s++ = is_spread_slab(cs) ? '1' : '0'; break; + case FILE_SPREAD_USER: + *s++ = is_spread_user(cs) ? 
'1' : '0'; + break; default: retval = -EINVAL; goto out; @@ -1536,6 +1587,13 @@ static struct cftype cft_spread_slab = { .private = FILE_SPREAD_SLAB, }; +static struct cftype cft_spread_user = { + .name = "memory_spread_user", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SPREAD_USER, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1558,6 +1616,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, &cft_spread_user)) < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (err == 0 && !cont->parent) err = cgroup_add_file(cont, ss, @@ -1633,6 +1693,8 @@ static struct cgroup_subsys_state *cpuset_create( set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + if (is_spread_user(parent)) + set_bit(CS_SPREAD_USER, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cs->cpus_allowed = CPU_MASK_NONE; cs->mems_allowed = NODE_MASK_NONE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,7 +1739,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); + if (current_cpuset_is_spread_user()) + tmp = cpuset_mems_allowed(current); + else + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; current->il_next = node_remap(current->il_next, *mpolmask, *newmask); ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 2:14 ` [patch 3/3] cpusets: add memory_spread_user option David Rientjes @ 2007-10-26 6:04 ` Paul Jackson 2007-10-26 9:23 ` David Rientjes 0 siblings, 1 reply; 15+ messages in thread From: Paul Jackson @ 2007-10-26 6:04 UTC (permalink / raw) To: David Rientjes; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel Hmmm ... another doubt crosses my mind, something perhaps along the lines of Christoph's earlier concerns of more interactions between cpusets and mempolicy. I'm figuring that when someone looks at the cpuset flag: memory_spread_user they will expect that turning it on will cause user space memory to be spread over the nodes of the cpuset. Sure makes sense that it would mean that. But, for the most part, it doesn't. Only tasks that have previously called set_mempolicy(MPOL_INTERLEAVE), and only after the 'mems' of the cpuset are subsequently changed, will have their user memory forced to be spread across the cpuset as a result of this flag setting. That's weird. We really should not surprise users like that. This is a dangerous situation -- the obvious meaning of a flag is not what it means. Any chance, David, that you could have this flag mean: Spread user memory allocations over the cpuset, period, anytime it is set, regardless of what mempolicy calls the task has made and regardless of whether or not or when the cpusets 'mems' were last changed. In your previous reply, you wrote: > This option gives the most power to applications. Most power, or excessive confusion? Straight forward consistency and simple predictability are far more important in almost all cases. The usual exception is when you have a serious use case requiring something that can only be done in a more obscure fashion. There is always a price paid for supporting such complexities in an API however, the price being increased confusion, frustration, errors and bugs on the part of most users of the API. ... 
Now most likely you will claim you have such a use case, and when I ask for it, I will be frustrated at the lack of compelling detail of what is going on in user space - what sorts of users, apps and systems involved. Ok, no biggie. If this goes down that path, then perhaps at least I need to reconsider the name: memory_spread_user This is -not- like the other memory spread flags. It only spreads user memory across the cpuset if previously, in order, (1) the task did set_mempolicy(MPOL_INTERLEAVE), then (2) someone modified the cpusets 'mems'. Perhaps the following ugly name better warns users of such odd behaviour: memory_spread_mpol_interleave -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 6:04 ` Paul Jackson @ 2007-10-26 9:23 ` David Rientjes 2007-10-26 9:56 ` Paul Jackson 0 siblings, 1 reply; 15+ messages in thread From: David Rientjes @ 2007-10-26 9:23 UTC (permalink / raw) To: Paul Jackson; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel On Thu, 25 Oct 2007, Paul Jackson wrote: > I'm figuring that when someone looks at the cpuset flag: > > memory_spread_user > > they will expect that turning it on will cause user space memory to be > spread over the nodes of the cpuset. Sure makes sense that it would > mean that. > > But, for the most part, it doesn't. Only tasks that have > previously called set_mempolicy(MPOL_INTERLEAVE), and only > after the 'mems' of the cpuset are subsequently changed, > will have their user memory forced to be spread across the > cpuset as a result of this flag setting. > And also tasks which are attached to a cpuset that has memory_spread_user enabled and had preexisting MPOL_INTERLEAVE memory policies. cpuset_attach() rebinds memory policies; if memory_spread_user is set, the interleaved nodemask for the task's mempolicy will be set to the cpuset's mems_allowed. > Any chance, David, that you could have this flag mean: > > Spread user memory allocations over the cpuset, > period, anytime it is set, regardless of what > mempolicy calls the task has made and regardless > of whether or not or when the cpusets 'mems' were > last changed. > This would override the custom mempolicies of all tasks attached to the cpuset. All tasks would have MPOL_INTERLEAVE memory policies with a nodemask of the cpuset's mems_allowed and could not be changed. 
With my current proposal, tasks receive the full interleaved nodemask of the cpuset's mems_allowed when they have preexisting MPOL_INTERLEAVE memory policies and: - the 'mems' change in a cpuset enabled with memory_spread_user, - it is attached to a cpuset enabled with memory_spread_user, or - memory_spread_page is enabled for its cpuset. We respect the changes that tasks make to their mempolicies with MPOL_INTERLEAVE after any of those three scenarios. Your change would force all tasks in a memory_spread_user cpuset to have MPOL_INTERLEAVE with a nodemask of mems_allowed. That's very easy to define but is going to require additional cpusets to be created with duplicate settings (excluding memory_spread_user) if you want different behavior for its tasks. I won't argue against that. > Most power, or excessive confusion? Straight forward consistency and > simple predictability are far more important in almost all cases. The > usual exception is when you have a serious use case requiring > something that can only be done in a more obscure fashion. > I don't think cpuset files such as memory_spread_{page,slab,user} or memory_pressure, etc, are completely and accurately descriptive of what they do anyway. That's why anybody who is going to use cpusets is going to refer to Documentation/cpusets.txt where the semantics are explicitly written. > There is always a price paid for supporting such complexities in an API > however, the price being increased confusion, frustration, errors and > bugs on the part of most users of the API. > We can certainly code it the way you suggested: memory_spread_user requires all tasks to be MPOL_INTERLEAVE with a static nodemask of mems_allowed. If this use-case is so narrow that any sane implementation would never want several different tasks with different contextualized interleaved memory policies in a dynamically changing cpuset, that's the perfect way to code it. > ... 
Now most likely you will claim you have such a use case, and when > I ask for it, I will be frustrated at the lack of compelling detail of > what is going on in user space - what sorts of users, apps and systems > involved. Ok, no biggie. If this goes down that path, then perhaps > at least I need to reconsider the name: > > memory_spread_user > I don't actually have any use-cases where I want two different MPOL_INTERLEAVE tasks sharing the same cpuset and only one of them adjusted on a mems change and the other to remain static unless I explicitly call set_mempolicy(). It all comes down to the decision of whether we want to permit set_mempolicy() calls for tasks and respect the nodemask passed in a memory_spread_user cpuset. If so, we must do it my way where the set_mempolicy() occurs after attachment or the setting of the flag. There's just no other time where you can allow them to differ from the memory_spread_user behavior that the cpuset is configured with. So if you don't have any issue with a hard and fast rule of requiring tasks in memory_spread_user cpusets to have MPOL_INTERLEAVE policies with the nodemask of mems_allowed and not giving them the option of changing it, I have no objection. I simply coded it so that you could work around the cpuset flag through the mempolicy interface. I don't have any express need for it. David ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 9:23 ` David Rientjes @ 2007-10-26 9:56 ` Paul Jackson 2007-10-26 17:18 ` Paul Jackson 2007-10-26 20:41 ` David Rientjes 0 siblings, 2 replies; 15+ messages in thread From: Paul Jackson @ 2007-10-26 9:56 UTC (permalink / raw) To: David Rientjes; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel David wrote: > It all comes down to the decision of whether we want to permit > set_mempolicy() calls for tasks and respect the nodemask passed in a > memory_spread_user cpuset. Well said. My current inclination is to say: 1) If you want the current behaviour, where set_mempolicy(MPOL_INTERLEAVE) calls mean what they say, and cpusets tries as best it can (imperfectly) to honor those memory policy calls, even in the face of changing cpusets, then leave memory_spread_user turned off (the default, right?) 2) If you want MPOL_INTERLEAVE tasks to interleave user memory across all nodes in the cpuset, whatever that might be, then enable memory_spread_user. This is admittedly less flexible than your patch provided, but I am attracted to the simpler API - easier to explain. This does beg yet another question: shouldn't memory_spread_user force interleaving of user memory -regardless- of mempolicy. And yet another question: what about the MPOL_BIND mempolicy? It too, to a lesser extent, has the same problems with cpusets that shrink and then expand. Several tasks in a cpuset with multiple nodes could carefully bind to a separate node each, but then end up collapsed all onto the same node if the cpuset was shrunk to one node and then expanded again. I should sleep on this, and hopefully respond again, within this day. On a different point, we could, if it was worth the extra bit of code, improve the current code's handling of mempolicy rebinding when the cpuset adds memory nodes. 
If we kept both the original cpusets mems_allowed, and the original MPOL_INTERLEAVE nodemask requested by the user in a call to set_mempolicy, then we could rebind (nodes_remap) the currently active policy v.nodes using that pair of saved masks to guide the rebinding. This way, if say a cpuset shrunk, then regrew back to its original size (original number of nodes) we would end up replicating the original MPOL_INTERLEAVE request, cpuset relative. This would provide a more accurate cpuset relative translation of such memory policies with-out- changing the set_mempolicy API. Hmmm ... this might meet your needs entirely, so that we did not need -any- added flags to the API. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 9:56 ` Paul Jackson @ 2007-10-26 17:18 ` Paul Jackson 2007-10-26 17:39 ` Christoph Lameter 2007-10-26 17:43 ` Lee Schermerhorn 2007-10-26 20:41 ` David Rientjes 1 sibling, 2 replies; 15+ messages in thread From: Paul Jackson @ 2007-10-26 17:18 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, akpm, ak, clameter, Lee.Schermerhorn, linux-kernel pj wrote: > On a different point, we could, if it was worth the extra bit of code, > improve the current code's handling of mempolicy rebinding when the > cpuset adds memory nodes. If we kept both the original cpusets > mems_allowed, and the original MPOL_INTERLEAVE nodemask requested by > the user in a call to set_mempolicy, then we could rebind (nodes_remap) > the currently active policy v.nodes using that pair of saved masks to > guide the rebinding. This way, if say a cpuset shrunk, then regrew back > to its original size (original number of nodes) we would end up > replicating the original MPOL_INTERLEAVE request, cpuset relative. > This would provide a more accurate cpuset relative translation of such > memory policies with-out- changing the set_mempolicy API. Hmmm ... this > might meet your needs entirely, so that we did not need -any- added > flags to the API. Thinking about this some more ... there's daylight here! I'll see if I can code up a patch for this now, but the idea is to allow user code to specify any nodemask to a set_mempolicy MPOL_INTERLEAVE call, even including nodes not in their cpuset, and then (1) use nodes_remap() to fold that mask down to whatever is their current cpuset (2) remember what they passed in and use it again with nodes_remap() to re-fold that mask down, anytime the cpuset changes. For example, if they pass in a mask with all bits sets, then they get interleave over all the nodes in their current cpuset, even as that cpuset changes. 
If they pass in a mask with say just two bits set, then they will get interleave over just two nodes anytime they are in a cpuset with two or more nodes (when in a single node cpuset, they will of course get no interleave, for lack of anything to interleave over.) This should replace the patches that David is proposing here. It should replace what Lee is proposing. It should work with libnuma and be fully upward compatible with current code (except perhaps code that depends on getting an error from requesting MPOL_INTERLEAVE on a node not allowed.) And instead of just covering the special case of "interleave over all available nodes" it should cover the more general case of interleaving over any subset of nodes, folded or replicated to handle being in any cpuset. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:18 ` Paul Jackson @ 2007-10-26 17:39 ` Christoph Lameter 2007-10-26 17:43 ` Paul Jackson 2007-10-26 17:43 ` Lee Schermerhorn 1 sibling, 1 reply; 15+ messages in thread From: Christoph Lameter @ 2007-10-26 17:39 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, akpm, ak, Lee.Schermerhorn, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > For example, if they pass in a mask with all bits sets, then they > get interleave over all the nodes in their current cpuset, even as > that cpuset changes. If they pass in a mask with say just two > bits set, then they will get interleave over just two nodes anytime > they are in a cpuset with two or more nodes (when in a single node > cpuset, they will of course get no interleave, for lack of anything > to interleave over.) Yuck. I really want cpuset relative nodemasks. Nodes need to reference to available nodes. If you set all of them then it will use all available nodes and ignore the ones not allowed. If you just set the first two bits then it will use the first and second node of a cpuset. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:39 ` Christoph Lameter @ 2007-10-26 17:43 ` Paul Jackson 0 siblings, 0 replies; 15+ messages in thread From: Paul Jackson @ 2007-10-26 17:43 UTC (permalink / raw) To: Christoph Lameter; +Cc: rientjes, akpm, ak, Lee.Schermerhorn, linux-kernel > I really want cpuset relative nodemasks. Nodes need to reference > to available nodes. If you set all of them then it will use all available > nodes and ignore the ones not allowed. If you just set the first two bits > then it will use the first and second node of a cpuset. Exactly. I think I am ending up with exactly this. I'm just getting there by a strange way of thinking (as usual ;). -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:18 ` Paul Jackson 2007-10-26 17:39 ` Christoph Lameter @ 2007-10-26 17:43 ` Lee Schermerhorn 2007-10-26 17:54 ` Paul Jackson 1 sibling, 1 reply; 15+ messages in thread From: Lee Schermerhorn @ 2007-10-26 17:43 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, akpm, ak, clameter, linux-kernel On Fri, 2007-10-26 at 10:18 -0700, Paul Jackson wrote: > pj wrote: > > On a different point, we could, if it was worth the extra bit of code, > > improve the current code's handling of mempolicy rebinding when the > > cpuset adds memory nodes. If we kept both the original cpusets > > mems_allowed, and the original MPOL_INTERLEAVE nodemask requested by > > the user in a call to set_mempolicy, then we could rebind (nodes_remap) > > the currently active policy v.nodes using that pair of saved masks to > > guide the rebinding. This way, if say a cpuset shrunk, then regrew back > > to its original size (original number of nodes) we would end up > > replicating the original MPOL_INTERLEAVE request, cpuset relative. > > This would provide a more accurate cpuset relative translation of such > > memory policies with-out- changing the set_mempolicy API. Hmmm ... this > > might meet your needs entirely, so that we did not need -any- added > > flags to the API. > > Thinking about this some more ... there's daylight here! > > I'll see if I can code up a patch for this now, but the idea is > to allow user code to specify any nodemask to a set_mempolicy > MPOL_INTERLEAVE call, even including nodes not in their cpuset, > and then (1) use nodes_remap() to fold that mask down to whatever > is their current cpuset (2) remember what they passed in and use > it again with nodes_remap() to re-fold that mask down, anytime > the cpuset changes. > > For example, if they pass in a mask with all bits sets, then they > get interleave over all the nodes in their current cpuset, even as > that cpuset changes. 
If they pass in a mask with say just two > bits set, then they will get interleave over just two nodes anytime > they are in a cpuset with two or more nodes (when in a single node > cpuset, they will of course get no interleave, for lack of anything > to interleave over.) > > This should replace the patches that David is proposing here. It should > replace what Lee is proposing. It should work with libnuma and be > fully upward compatible with current code (except perhaps code that > depends on getting an error from requesting MPOL_INTERLEAVE on a node > not allowed.) > > And instead of just covering the special case of "interleave over all > available nodes" it should cover the more general case of interleaving > over any subset of nodes, folded or replicated to handle being in any > cpuset. Will it handle the case of MPOL_INTERLEAVE policy on a shm segment that is mapped by tasks in different, possibly disjoint, cpusets. Local allocation does, and my patch does. That was one of the primary goals--to address an issue that Christoph has with shared policies. cpusets really muck these up! Lee > ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:43 ` Lee Schermerhorn @ 2007-10-26 17:54 ` Paul Jackson 2007-10-26 18:00 ` Christoph Lameter 2007-10-26 20:39 ` Lee Schermerhorn 0 siblings, 2 replies; 15+ messages in thread From: Paul Jackson @ 2007-10-26 17:54 UTC (permalink / raw) To: Lee Schermerhorn; +Cc: rientjes, akpm, ak, clameter, linux-kernel > Will it handle the case of MPOL_INTERLEAVE policy on a shm segment that > is mapped by tasks in different, possibly disjoint, cpusets. Local > allocation does, and my patch does. That was one of the primary > goals--to address an issue that Christoph has with shared policies. > cpusets really muck these up! It probably won't handle that. I don't get along too well with shmem. Can you to an anti-shmem bigot how MPOL_INTERLEAVE should work with shmem segments mapped in diverse ways by different tasks in different cpusets? What would be the key attribute(s) of a proper solution? Maybe if we keep it simple enough, I can avoid mucking it up too much this time around. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:54 ` Paul Jackson @ 2007-10-26 18:00 ` Christoph Lameter 2007-10-26 20:39 ` Lee Schermerhorn 1 sibling, 0 replies; 15+ messages in thread From: Christoph Lameter @ 2007-10-26 18:00 UTC (permalink / raw) To: Paul Jackson; +Cc: Lee Schermerhorn, rientjes, akpm, ak, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > > Will it handle the case of MPOL_INTERLEAVE policy on a shm segment that > > is mapped by tasks in different, possibly disjoint, cpusets. Local > > allocation does, and my patch does. That was one of the primary > > goals--to address an issue that Christoph has with shared policies. > > cpusets really muck these up! > > It probably won't handle that. I don't get along too well with shmem. IMHO shmem policy support is pretty much messed up (seems that we introduced new races by trying to fix the refcounts). I tend to ignore the stuff unless it impacts regular shared or regular memory. Before we do any of this fancy stuff lets at least get the refcount handling right? > Can you to an anti-shmem bigot how MPOL_INTERLEAVE should work with > shmem segments mapped in diverse ways by different tasks in different > cpusets? What would be the key attribute(s) of a proper solution? > Maybe if we keep it simple enough, I can avoid mucking it up too much > this time around. With relative nodemasks we could have a MPOL_INTERLEAVE policy working in multiple cpuset contexts. If nodes 0 and 1 are set in a nodemask then the first two nodes of the current cpuset are interleaved through. Nodes that do not exist are ignored. So if there is no second node then MPOL_INTERLEAVE becomes a noop. ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 17:54 ` Paul Jackson 2007-10-26 18:00 ` Christoph Lameter @ 2007-10-26 20:39 ` Lee Schermerhorn 1 sibling, 0 replies; 15+ messages in thread From: Lee Schermerhorn @ 2007-10-26 20:39 UTC (permalink / raw) To: Paul Jackson; +Cc: rientjes, akpm, ak, clameter, linux-kernel On Fri, 2007-10-26 at 10:54 -0700, Paul Jackson wrote: > > Will it handle the case of MPOL_INTERLEAVE policy on a shm segment that > > is mapped by tasks in different, possibly disjoint, cpusets. Local > > allocation does, and my patch does. That was one of the primary > > goals--to address an issue that Christoph has with shared policies. > > cpusets really muck these up! > > It probably won't handle that. I don't get along too well with shmem. Not surprising :). shmem doesn't get along too well with cpusets. > > Can you to an anti-shmem bigot how MPOL_INTERLEAVE should work with ^ explain ? > shmem segments mapped in diverse ways by different tasks in different > cpusets? What would be the key attribute(s) of a proper solution? > Maybe if we keep it simple enough, I can avoid mucking it up too much > this time around. Personally, I'm of the opinion "if it hurts when you do that, don't do that". I have uses for shared memory and mempolicies on the same, but they don't involve sharing shmem [nor mapped files] between cpusets nor dynamically changing cpusets. So, my approach would be to document the issues clearly [another reason I'd like to see cpuset man pages] and make sure that folks can't accidentally trip over them. But, I suppose all the documentation in the world won't stop some people from hurting themselves. As my grandmother used to tell me, "children and fools shouldn't play with sharp tools." [Then she'd always ask me, "Which one are you?" I guess time has answered that question...] Lee ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 3/3] cpusets: add memory_spread_user option 2007-10-26 9:56 ` Paul Jackson 2007-10-26 17:18 ` Paul Jackson @ 2007-10-26 20:41 ` David Rientjes 1 sibling, 0 replies; 15+ messages in thread From: David Rientjes @ 2007-10-26 20:41 UTC (permalink / raw) To: Paul Jackson; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel On Fri, 26 Oct 2007, Paul Jackson wrote: > 1) If you want the current behaviour, where set_mempolicy(MPOL_INTERLEAVE) > calls mean what they say, and cpusets tries as best it can (imperfectly) > to honor those memory policy calls, even in the face of changing cpusets, > then leave memory_spread_user turned off (the default, right?) > 2) If you want MPOL_INTERLEAVE tasks to interleave user memory across all > nodes in the cpuset, whatever that might be, then enable memory_spread_user. > > This is admittedly less flexible than your patch provided, but I am > attracted to the simpler API - easier to explain. > That seems to follow the convention better with respect to memory_spread_page and memory_spread_slab anyway. Either all the tasks attached to the cpuset get that behavior or none of them do. We can do the same for memory_spread_user. Sounds good. > This does beg yet another question: shouldn't memory_spread_user force > interleaving of user memory -regardless- of mempolicy. > Sure, I don't see any compelling reason why it shouldn't. > And yet another question: what about the MPOL_BIND mempolicy? It too, > to a lesser extent, has the same problems with cpusets that shrink and > then expand. Several tasks in a cpuset with multiple nodes could carefully > bind to a separate node each, but then end up collapsed all onto the same > node if the cpuset was shrunk to one node and then expanded again. > Hmm. 
At some point we're going to have to just say that if you use mempolicies such as MPOL_BIND in your application and then you insanely take those nodes away from your application via cpusets, that you are actually getting exactly what you asked for. There's two ways to fix that: try to remap the MPOL_BIND nodes onto the new set of mems_allowed regardless of the cardinality of the two sets, or refuse to update the nodemask of the cpuset if you're taking one or more nodes away from an attached task that has such a policy. I favor the former because, in conjunction with a sane memory_migrate setting, it shouldn't actually matter that much. The memory you previously allocated will still be on the removed nodes; only your future allocations will actually respect the new nodemask. The MPOL_INTERLEAVE case is more interesting because we're trying to reduce bus contention and decrease our latency with quicker memory access. So, using the true definition of a node as a premise, we should get better results in terms of performance if we expand the nodemask as much as possible. That's exactly what we've been trying to address: when an application's mems_allowed is expanded to allow more nodes, the application is unaware of the change and can't take advantage of it (without the get_mempolicy() - set_mempolicy() loop). That's my whole case for why cpusets should be modifying MPOL_INTERLEAVE policies in the first place: because they are the ones that allowed access to more memory. > On a different point, we could, if it was worth the extra bit of code, > improve the current code's handling of mempolicy rebinding when the > cpuset adds memory nodes. If we kept both the original cpusets > mems_allowed, and the original MPOL_INTERLEAVE nodemask requested by > the user in a call to set_mempolicy, then we could rebind (nodes_remap) > the currently active policy v.nodes using that pair of saved masks to > guide the rebinding. 
This way, if say a cpuset shrunk, then regrew back > to its original size (original number of nodes) we would end up > replicating the original MPOL_INTERLEAVE request, cpuset relative. Keeping a copy of the nodemask passed to set_mempolicy() in struct mempolicy is an interesting idea and could, with the logic you describe, help guide the remapping as the set of allowed nodes changes. We'd have two interleaved nodemasks, the actual (pol->v.nodes) and the requested (something like pol->passed_nodemask). get_mempolicy() would always return the actual nodemask so the application can be aware of what it has access to and what it doesn't. I like it. David ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [patch 2/3] mempolicy: mpol_rebind_policy cleanup 2007-10-26 2:14 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup David Rientjes 2007-10-26 2:14 ` [patch 3/3] cpusets: add memory_spread_user option David Rientjes @ 2007-10-26 2:46 ` Paul Jackson 1 sibling, 0 replies; 15+ messages in thread From: Paul Jackson @ 2007-10-26 2:46 UTC (permalink / raw) To: David Rientjes; +Cc: akpm, ak, clameter, Lee.Schermerhorn, linux-kernel David wrote: > + if (pol->policy != MPOL_DEFAULT) Good catch - thanks. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <pj@sgi.com> 1.925.600.0401 ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2007-10-26 20:42 UTC | newest] Thread overview: 15+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2007-10-26 2:14 [patch 1/3] cpusets: extract mmarray loading from update_nodemask David Rientjes 2007-10-26 2:14 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup David Rientjes 2007-10-26 2:14 ` [patch 3/3] cpusets: add memory_spread_user option David Rientjes 2007-10-26 6:04 ` Paul Jackson 2007-10-26 9:23 ` David Rientjes 2007-10-26 9:56 ` Paul Jackson 2007-10-26 17:18 ` Paul Jackson 2007-10-26 17:39 ` Christoph Lameter 2007-10-26 17:43 ` Paul Jackson 2007-10-26 17:43 ` Lee Schermerhorn 2007-10-26 17:54 ` Paul Jackson 2007-10-26 18:00 ` Christoph Lameter 2007-10-26 20:39 ` Lee Schermerhorn 2007-10-26 20:41 ` David Rientjes 2007-10-26 2:46 ` [patch 2/3] mempolicy: mpol_rebind_policy cleanup Paul Jackson
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox