* [RFC] cpuset relative memory policies - second choice
@ 2007-10-31 6:17 Paul Jackson
2007-10-31 18:59 ` Christoph Lameter
0 siblings, 1 reply; 18+ messages in thread
From: Paul Jackson @ 2007-10-31 6:17 UTC (permalink / raw)
To: David Rientjes
Cc: Lee.Schermerhorn, linux-kernel, Paul Jackson, Christoph Lameter,
Andi Kleen
From: Paul Jackson <pj@sgi.com>
RFC only so far - has been built and booted, but has received
almost no testing.
Add a second choice for how node numbers are interpreted and returned
by the NUMA memory policy system calls mbind, set_mempolicy and
get_mempolicy.
The original choice remains the default, for compatibility.
The second choice overcomes some limitations of the first choice in
the interaction between cpusets and these memory policy calls, that
show up when tasks using these calls are also being moved between
different cpusets, especially between cpusets with varying numbers
of allowed nodes in the cpuset 'mems' file.
A new per-task mode, managed using added get_mempolicy calls, controls
which mode applies to subsequently created memory policies.
See the Documentation/vm/numa_memory_policy.txt section MEMORY
POLICIES AND CPUSETS for an explanation of how both these choices
for node numbering work and interact with cpusets.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Lee.Schermerhorn@hp.com
Cc: Andi Kleen <ak@suse.de>
---
Documentation/vm/numa_memory_policy.txt | 140 +++++++++++++++++++++++++-------
include/linux/mempolicy.h | 15 +++
include/linux/sched.h | 1
mm/mempolicy.c | 123 +++++++++++++++++++++++-----
4 files changed, 229 insertions(+), 50 deletions(-)
--- 2.6.23-mm1.orig/include/linux/mempolicy.h 2007-10-30 18:04:11.000000000 -0700
+++ 2.6.23-mm1/include/linux/mempolicy.h 2007-10-30 18:11:07.000000000 -0700
@@ -21,6 +21,10 @@
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
+#define MPOL_F_MODE_DEFAULT (1<<3) /* set cpuset confined nodemask mode */
+#define MPOL_F_MODE_SYS_WIDE (1<<4) /* set system-wide nodemask mode */
+#define MPOL_F_MODE_GET (1<<5) /* return nodemask mode: MPOL_F_MODE_DEFAULT or MPOL_F_MODE_SYS_WIDE */
+
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
@@ -28,6 +32,10 @@
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
+/* Internal values for mpol_nodemask_mode (just reuse get_mempolicy values) */
+#define MPOL_MODE_DEFAULT MPOL_F_MODE_DEFAULT /* relative to this cpuset */
+#define MPOL_MODE_SYS_WIDE MPOL_F_MODE_SYS_WIDE /* original input mask */
+
#ifdef __KERNEL__
#include <linux/mmzone.h>
@@ -64,13 +72,18 @@ struct mm_struct;
struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
+ char mpol_nodemask_mode; /* See MPOL_MODE_* above; union c below */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave */
/* undefined for default */
} v;
- nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
+ /* Cpuset interface: Documentation/vm/numa_memory_policy.txt */
+ union {
+ nodemask_t cpuset_mems_allowed; /* if MPOL_MODE_DEFAULT */
+ nodemask_t original_nodes; /* if MPOL_MODE_SYS_WIDE */
+ } c;
};
/*
--- 2.6.23-mm1.orig/mm/mempolicy.c 2007-10-30 18:04:11.000000000 -0700
+++ 2.6.23-mm1/mm/mempolicy.c 2007-10-30 20:02:10.000000000 -0700
@@ -175,6 +175,7 @@ static struct zonelist *bind_zonelist(no
static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
{
struct mempolicy *policy;
+ nodemask_t cpuset_centric_nodes;
pr_debug("setting mode %d nodes[0] %lx\n",
mode, nodes ? nodes_addr(*nodes)[0] : -1);
@@ -185,9 +186,27 @@ static struct mempolicy *mpol_new(int mo
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
+
+ policy->mpol_nodemask_mode = current->mpol_nodemask_mode;
+ {
+ char m = current->mpol_nodemask_mode;
+ if (m != MPOL_MODE_DEFAULT && m != MPOL_MODE_SYS_WIDE)
+ printk(KERN_WARNING
+ "mempolicy mpol_new unset mode: %d\n", m);
+ }
+ if (policy->mpol_nodemask_mode == MPOL_MODE_SYS_WIDE) {
+ policy->c.original_nodes = *nodes;
+ nodes_remap(cpuset_centric_nodes, *nodes,
+ node_possible_map,
+ cpuset_current_mems_allowed);
+ } else /* MPOL_MODE_DEFAULT */ {
+ policy->c.cpuset_mems_allowed = cpuset_current_mems_allowed;
+ cpuset_centric_nodes = *nodes;
+ }
+
switch (mode) {
case MPOL_INTERLEAVE:
- policy->v.nodes = *nodes;
+ policy->v.nodes = cpuset_centric_nodes;
nodes_and(policy->v.nodes, policy->v.nodes,
node_states[N_HIGH_MEMORY]);
if (nodes_weight(policy->v.nodes) == 0) {
@@ -196,12 +215,12 @@ static struct mempolicy *mpol_new(int mo
}
break;
case MPOL_PREFERRED:
- policy->v.preferred_node = first_node(*nodes);
+ policy->v.preferred_node = first_node(cpuset_centric_nodes);
if (policy->v.preferred_node >= MAX_NUMNODES)
policy->v.preferred_node = -1;
break;
case MPOL_BIND:
- policy->v.zonelist = bind_zonelist(nodes);
+ policy->v.zonelist = bind_zonelist(&cpuset_centric_nodes);
if (IS_ERR(policy->v.zonelist)) {
void *error_code = policy->v.zonelist;
kmem_cache_free(policy_cache, policy);
@@ -210,7 +229,6 @@ static struct mempolicy *mpol_new(int mo
break;
}
policy->policy = mode;
- policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
@@ -427,8 +445,10 @@ static int contextualize_policy(int mode
return 0;
cpuset_update_task_memory_state();
- if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
- return -EINVAL;
+ if (current->mpol_nodemask_mode == MPOL_MODE_DEFAULT) {
+ if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
+ return -EINVAL;
+ }
return mpol_check_policy(mode, nodes);
}
@@ -533,6 +553,22 @@ static long do_get_mempolicy(int *policy
struct mempolicy *pol = current->mempolicy;
cpuset_update_task_memory_state();
+
+ switch (flags) {
+ case MPOL_F_MODE_DEFAULT:
+ current->mpol_nodemask_mode = MPOL_MODE_DEFAULT;
+ return 0;
+ case MPOL_F_MODE_SYS_WIDE:
+ current->mpol_nodemask_mode = MPOL_MODE_SYS_WIDE;
+ return 0;
+ case MPOL_F_MODE_GET:
+ if (current->mpol_nodemask_mode == MPOL_MODE_DEFAULT)
+ *policy = MPOL_F_MODE_DEFAULT;
+ else
+ *policy = MPOL_F_MODE_SYS_WIDE;
+ return 0;
+ }
+
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
@@ -570,7 +606,12 @@ static long do_get_mempolicy(int *policy
*policy = err;
} else if (pol == current->mempolicy &&
pol->policy == MPOL_INTERLEAVE) {
- *policy = current->il_next;
+ if (current->mpol_nodemask_mode == MPOL_MODE_DEFAULT)
+ *policy = current->il_next;
+ else
+ *policy = node_remap(current->il_next,
+ node_possible_map,
+ cpuset_current_mems_allowed);
} else {
err = -EINVAL;
goto out;
@@ -584,8 +625,12 @@ static long do_get_mempolicy(int *policy
}
err = 0;
- if (nmask)
- get_zonemask(pol, nmask);
+ if (nmask) {
+ if (current->mpol_nodemask_mode == MPOL_MODE_DEFAULT)
+ get_zonemask(pol, nmask);
+ else
+ *nmask = pol->c.original_nodes;
+ }
out:
if (vma)
@@ -901,7 +946,10 @@ asmlinkage long sys_mbind(unsigned long
if (err)
return err;
/* Restrict the nodes to the allowed nodes in the cpuset */
- nodes_and(nodes, nodes, cpuset_current_mems_allowed);
+ /* XXX this is inconsistent: mbind silently discards extra nodes, */
+ /* XXX but set_mempolicy rejects them -EINVAL. */
+ if (current->mpol_nodemask_mode == MPOL_MODE_DEFAULT)
+ nodes_and(nodes, nodes, cpuset_current_mems_allowed);
return do_mbind(start, len, mode, &nodes, flags);
}
@@ -1712,6 +1760,7 @@ void __init numa_policy_init(void)
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
+ current->mpol_nodemask_mode = MPOL_MODE_DEFAULT;
if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}
@@ -1722,33 +1771,27 @@ void numa_default_policy(void)
do_set_mempolicy(MPOL_DEFAULT, NULL);
}
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
+/* Migrate a policy to a different set of nodes: MPOL_MODE_DEFAULT */
+static void mpol_rebind_policy_default(struct mempolicy *pol,
const nodemask_t *newmask)
{
nodemask_t *mpolmask;
nodemask_t tmp;
- if (!pol)
- return;
- mpolmask = &pol->cpuset_mems_allowed;
+ mpolmask = &pol->c.cpuset_mems_allowed;
if (nodes_equal(*mpolmask, *newmask))
return;
switch (pol->policy) {
- case MPOL_DEFAULT:
- break;
case MPOL_INTERLEAVE:
nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
pol->v.nodes = tmp;
current->il_next = node_remap(current->il_next,
*mpolmask, *newmask);
- *mpolmask = *newmask;
break;
case MPOL_PREFERRED:
pol->v.preferred_node = node_remap(pol->v.preferred_node,
*mpolmask, *newmask);
- *mpolmask = *newmask;
break;
case MPOL_BIND: {
nodemask_t nodes;
@@ -1773,13 +1816,53 @@ static void mpol_rebind_policy(struct me
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
}
- *mpolmask = *newmask;
break;
}
default:
BUG();
break;
}
+ *mpolmask = *newmask;
+}
+
+/* Migrate a policy to a different set of nodes: MPOL_MODE_SYS_WIDE */
+static void mpol_rebind_policy_sys_wide(struct mempolicy *pol,
+ const nodemask_t *newmask)
+{
+ nodemask_t cpuset_centric_nodes;
+ struct zonelist *zonelist;
+
+ nodes_remap(cpuset_centric_nodes, pol->c.original_nodes,
+ node_possible_map,
+ *newmask);
+ switch (pol->policy) {
+ case MPOL_INTERLEAVE:
+ pol->v.nodes = cpuset_centric_nodes;
+ current->il_next = first_node(pol->v.nodes);
+ break;
+ case MPOL_PREFERRED:
+ pol->v.preferred_node = first_node(cpuset_centric_nodes);
+ break;
+ case MPOL_BIND:
+ zonelist = bind_zonelist(&cpuset_centric_nodes);
+ if (!IS_ERR(zonelist)) {
+ kfree(pol->v.zonelist);
+ pol->v.zonelist = zonelist;
+ }
+ break;
+ }
+}
+
+/* Migrate a policy to a different set of nodes */
+static void mpol_rebind_policy(struct mempolicy *pol,
+ const nodemask_t *newmask)
+{
+ if (!pol || pol->policy == MPOL_DEFAULT)
+ return;
+ if (pol->mpol_nodemask_mode == MPOL_MODE_DEFAULT)
+ mpol_rebind_policy_default(pol, newmask);
+ else
+ mpol_rebind_policy_sys_wide(pol, newmask);
}
/*
--- 2.6.23-mm1.orig/include/linux/sched.h 2007-10-30 18:04:11.000000000 -0700
+++ 2.6.23-mm1/include/linux/sched.h 2007-10-30 18:11:07.000000000 -0700
@@ -1112,6 +1112,7 @@ struct task_struct {
#ifdef CONFIG_NUMA
struct mempolicy *mempolicy;
short il_next;
+ char mpol_nodemask_mode; /* new mem policies will get this mode */
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed;
--- 2.6.23-mm1.orig/Documentation/vm/numa_memory_policy.txt 2007-10-30 18:04:11.000000000 -0700
+++ 2.6.23-mm1/Documentation/vm/numa_memory_policy.txt 2007-10-30 22:52:38.000000000 -0700
@@ -300,32 +300,114 @@ package.
MEMORY POLICIES AND CPUSETS
-Memory policies work within cpusets as described above. For memory policies
-that require a node or set of nodes, the nodes are restricted to the set of
-nodes whose memories are allowed by the cpuset constraints. If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset, or
-the intersection of the set of nodes specified for the policy and the set of
-nodes with memory is the empty set, the policy is considered invalid
-and cannot be installed.
-
-The interaction of memory policies and cpusets can be problematic for a
-couple of reasons:
-
-1) the memory policy APIs take physical node id's as arguments. As mentioned
- above, it is illegal to specify nodes that are not allowed in the cpuset.
- The application must query the allowed nodes using the get_mempolicy()
- API with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes and
- restrict itself to those nodes. However, the resources available to a
- cpuset can be changed by the system administrator, or a workload manager
- application, at any time. So, a task may still get errors attempting to
- specify policy nodes, and must query the allowed memories again.
-
-2) when tasks in two cpusets share access to a memory region, such as shared
- memory segments created by shmget() of mmap() with the MAP_ANONYMOUS and
- MAP_SHARED flags, and any of the tasks install shared policy on the region,
- only nodes whose memories are allowed in both cpusets may be used in the
- policies. Obtaining this information requires "stepping outside" the
- memory policy APIs to use the cpuset information and requires that one
- know in what cpusets other task might be attaching to the shared region.
- Furthermore, if the cpusets' allowed memory sets are disjoint, "local"
- allocation is the only valid policy.
+There are two different modes for how the node numbers and nodemasks
+passed to or from the get_mempolicy(), set_mempolicy(), and mbind()
+system calls are interpreted by the kernel. If the per-task
+task_struct flag mpol_nodemask_mode is MPOL_MODE_DEFAULT, then
+these nodes and nodemasks should only include nodes allowed in the
+current task's cpuset. If mpol_nodemask_mode is MPOL_MODE_SYS_WIDE,
+then these nodes and nodemasks may include any nodes in the system.
+
+Either way, the kernel will subsequently do the best it can to
+automatically adapt a task's memory policy to changes in that task's
+cpuset, so as to (try to) keep that policy's node numbers (for
+MPOL_BIND and MPOL_INTERLEAVE) and node masks (for MPOL_INTERLEAVE)
+unchanged relative to the nodes allowed to that task by its cpuset.
+
+Calling get_mempolicy() with MPOL_F_MODE_DEFAULT selects
+MPOL_MODE_DEFAULT, and calling it with MPOL_F_MODE_SYS_WIDE selects
+MPOL_MODE_SYS_WIDE. Calling get_mempolicy() with MPOL_F_MODE_GET
+returns the current mode, MPOL_F_MODE_DEFAULT or MPOL_F_MODE_SYS_WIDE,
+in the policy argument to get_mempolicy.
+
+A tasks current mode, MPOL_F_MODE_DEFAULT or MPOL_F_MODE_SYS_WIDE,
+determines how subsequent node numbers and nodemasks passed to
+subsequent mbind and set_mempolicy calls will be interpreted, and
+how node numbers and nodemasks will be returned by get_mempolicy.
+Existing memory policies are not affected by changes in this mode,
+except as presented by get_mempolicy.
+
+The MPOL_MODE_DEFAULT mode has some limitations, but for historical
+reasons (it was the first and for a while the only mode available) it
+remains the default mode (hence its name.)
+
+The limitations of the MPOL_MODE_DEFAULT mode include:
+
+ 1) Because the node numbers and masks passed into the mbind() and
+ set_mempolicy() system calls are taken relative to the tasks
+ current cpuset, and because that cpuset could change at the
+ same time, there is a small race condition. The node numbers
+ and masks might end up being interpreted by the kernel relative
+ to a different cpuset placement than the application used while
+ preparing them, if the tasks cpuset was moved in the interim.
+
+ The application may query the allowed nodes using get_mempolicy()
+ with the MPOL_F_MEMS_ALLOWED flag to determine the allowed nodes
+ and restrict itself to those nodes. However, the resources
+ available to a cpuset can be changed by the system administrator,
+ or a workload manager application, at any time. So, a task
+ may still get errors attempting to specify policy nodes to
+ set_mempolicy(), and must query the allowed memories again.
+
+ 2) Because only node numbers valid in the tasks current cpuset are
+ considered, a task can not specify which nodes should be added
+ to its memory policies if the task is subsequently moved to
+ a larger cpuset. Similarly, if a task sets a memory policy,
+ then is later moved to a smaller cpuset (fewer memory nodes)
+ and then moved back to its first cpuset or one of the same size,
+ some nodes in its memory policy may be lost (no longer allowed
+ by that policy.)
+
+ This can result in a task not getting the memory policy node
+ placement that it requested. Furthermore, the tasks memory policy
+ might fallback to MPOL_DEFAULT if it ends up with no remaining
+ nodes in its requested memory policy (see the "FALL THROUGH"
+ comments in mm/mempolicy.c.)
+
+ 3) When tasks in two cpusets share access to a memory region, such
+ as shared memory segments created by shmget() or mmap() with the
+ MAP_ANONYMOUS and MAP_SHARED flags, and any of the tasks install
+ shared policy on the region, only nodes whose memories are allowed
+ in both cpusets may be used in the policies. Obtaining this
+ information requires "stepping outside" the memory policy APIs
+ to use the cpuset information and requires that one know in
+ what cpusets other task might be attaching to the shared region.
+ Furthermore, if the cpusets' allowed memory sets are disjoint,
+ "local" allocation is the only valid policy. [This limitation
+ may apply in some respects to the MPOL_F_MODE_SYS_WIDE mode as
+ well - this author doesn't know.]
+
+Depending on the situation, either of these two modes may be best
+suited to an application's needs. Applications dealing with specific
+hardware node numbers, such as certain nodes having different i/o
+devices or more memory or faster processors or a particular NUMA
+topology, may be better expressed using the MPOL_MODE_DEFAULT mode.
+Applications dealing with nodes as more virtual and interchangeable
+entities, that are more concerned with being coded to support being
+moved by cpuset changes, without the above listed limitations, may
+be better expressed using the MPOL_F_MODE_SYS_WIDE mode.
+
+The MPOL_F_MODE_SYS_WIDE mode essentially virtualizes the node numbers
+passed back and forth across the memory policy system calls, as if
+the task was always in a cpuset containing all possible nodes in the
+system. Then the kernel automatically folds the memory policy down
+to whatever memory nodes are in the tasks current cpuset. This is
+useful to tasks that want to specify memory policies independently of
+what cpuset constraints or placement apply at the moment. This is not
+so useful for tasks that have requirements for placement on specific
+hardware memory nodes.
+
+In the kernel's internal mm/mempolicy.c code, when a struct
+mempolicy has mpol_nodemask_mode == MPOL_MODE_DEFAULT, it keeps the
+nodemask of the cpuset to which it was most recently bound in
+the policy's c.cpuset_mems_allowed field. When the mempolicy
+field mpol_nodemask_mode == MPOL_MODE_SYS_WIDE, it keeps the
+original nodemask from the set_mempolicy call that created that
+mempolicy in the policy's c.original_nodes field. The per-task
+task_struct keeps the mode to be used in future set_mempolicy and
+mbind calls in its mpol_nodemask_mode field. The get_mempolicy
+options MPOL_F_MODE_DEFAULT and MPOL_F_MODE_SYS_WIDE set the current
+tasks mpol_nodemask_mode to MPOL_MODE_DEFAULT or MPOL_MODE_SYS_WIDE,
+respectively, and the get_mempolicy option MPOL_F_MODE_GET returns
+the current tasks mpol_nodemask_mode in the policy argument.
+
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.650.933.1373
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC] cpuset relative memory policies - second choice
2007-10-31 6:17 [RFC] cpuset relative memory policies - second choice Paul Jackson
@ 2007-10-31 18:59 ` Christoph Lameter
2007-10-31 20:19 ` Paul Jackson
0 siblings, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2007-10-31 18:59 UTC (permalink / raw)
To: Paul Jackson; +Cc: David Rientjes, Lee.Schermerhorn, linux-kernel, Andi Kleen
On Tue, 30 Oct 2007, Paul Jackson wrote:
> #include <linux/mmzone.h>
> @@ -64,13 +72,18 @@ struct mm_struct;
> struct mempolicy {
> atomic_t refcnt;
> short policy; /* See MPOL_* above */
> + char mpol_nodemask_mode; /* See MPOL_MODE_* above; union c below */
Make both policy and the mode char? Could we shorten the mpol_nodemask_mode
to mode?
> --- 2.6.23-mm1.orig/include/linux/sched.h 2007-10-30 18:04:11.000000000 -0700
> +++ 2.6.23-mm1/include/linux/sched.h 2007-10-30 18:11:07.000000000 -0700
> @@ -1112,6 +1112,7 @@ struct task_struct {
> #ifdef CONFIG_NUMA
> struct mempolicy *mempolicy;
> short il_next;
> + char mpol_nodemask_mode; /* new mem policies will get this mode */
Hmmm... I would rather have numactl manage this flag and specify it in
calls to set memory policies.
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC] cpuset relative memory policies - second choice
2007-10-31 18:59 ` Christoph Lameter
@ 2007-10-31 20:19 ` Paul Jackson
2007-10-31 21:05 ` Christoph Lameter
2007-11-01 18:52 ` David Rientjes
0 siblings, 2 replies; 18+ messages in thread
From: Paul Jackson @ 2007-10-31 20:19 UTC (permalink / raw)
To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
Christoph wrote:
> > short policy; /* See MPOL_* above */
> > + char mpol_nodemask_mode; /* See MPOL_MODE_* above; union c below */
>
> Make both policy and the mode char?
Well, the mpol_nodemask_mode already is char. So I guess you're
asking if we should change 'policy' to type char as well.
Changing 'policy' from a short to a char would reduce the sizeof
(struct mempolicy) on 16-bit systems, as both chars would fit in one
word. But this code doesn't run on 16-bit systems.
> Could we shorten the mpol_nodemask_mode to mode?
Huh - I don't know what "shorten ... to mode" means. If it means
"shorten ... to char", then see my previous comment above.
> Hmmm... I would rather have numactl manage this flag
Well, numactl is a command line utility. It doesn't manage this flag
as an -alternative- to some sort of changes to the mbind, set_mempolicy
and get_mempolicy calls, but rather layers on top of changes to those
system calls. So I'm not sure what you mean, but I guess it is not
relevant to this patch discussion.
> and specify it in calls to set memory policies.
This suggestion I do think I understand - good. However I disagree.
Here's what I think you meant, and why I disagree.
My current patch adds a new per-task modal state, that is manipulated
by additional get_mempolicy calls and options.
I think you are stating a preference for passing an additional flag on
each mbind, set_mempolicy and get_mempolicy call, to be used when you
want to use this new way (so called "Choice B") of numbering nodes.
The question is:
Should this mode be per-task, or per-system call?
The basic reason that I went with an additional per-task modal
state, rather than a modal flag for each mbind, set_mempolicy and
get_mempolicy call was to reduce the likely rate of bugs in user
level C code using this API.
Programs that code to this API in C usually first spend some number
of lines of code preparing bitmasks that represent sets of nodes,
which they will then pass into these mempolicy system calls.
The bits are numbered differently between Choices A and B.
If a piece of code adopts Choice B numbering, because it is
better suited to that code's needs, and if we used your suggested
per-system call flag, then if they miss passing in the new special
flag on -even-one- mbind, set_mempolicy or get_mempolicy system
call, they have an obscure code bug. They will be passing in
a bitmask that they calculated using Choice B node numbering,
but (implicitly) telling the kernel to interpret that bitmask
using Choice A semantics.
Essentially, a per-task modal flag, rather than a per-system call
modal flag, is a better fit for the typical C code usage of this
Choice, because it will be an entire chunk of user level code
manipulating these bitmasks that either has to be all Choice A,
or all Choice B (except in special cases, carefully coded.)
Why do you prefer a per-system call modal flag, rather than a per-task
modal flag?
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC] cpuset relative memory policies - second choice
2007-10-31 20:19 ` Paul Jackson
@ 2007-10-31 21:05 ` Christoph Lameter
2007-11-01 4:47 ` Paul Jackson
2007-11-01 18:52 ` David Rientjes
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2007-10-31 21:05 UTC (permalink / raw)
To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> > Make both policy and the mode char?
>
> Well, the mpol_nodemask_mode already is char. So I guess you're
> asking if we should change 'policy' to type char as well.
Right.
> > Could we shorten the mpol_nodemask_mode to mode?
>
> Huh - I don't know what "shorten ... to mode" means. If it means
> "shorten ... to char", then see my previous comment above.
No, it means shorten the identifier, which is a bit long for a structure
member.
> > Hmmm... I would rather have numactl manage this flag
>
> Well, numactl is a command line utility. It doesn't manage this flag
> as an -alternative- to some sort of changes to the mbind, set_mempolicy
> and get_mempolicy calls, but rather layers on top of changes to those
> system calls. So I'm not sure what you mean, but I guess it is not
> relevant to this patch discussion.
s/numactl/libnuma/
> The question is:
>
> Should this mode be per-task, or per-system call?
>
> The basic reason that I went with an additional per-task modal
> state, rather than a modal flag for each mbind, set_mempolicy and
> get_mempolicy call was to reduce the likely rate of bugs in user
> level C code using this API.
What bugs?
> Programs that code to this API in C usually first spend some number
> of lines of code preparing bitmasks that represent sets of nodes,
> which they will then pass into these mempolicy system calls.
>
> The bits are numbered differently between Choices A and B.
Right.
> If a piece of code adapts Choice B numbering, because it is
> better suited to that codes needs, and if we used your suggested
> per-system call flag, then if they miss passing in the new special
> flag on -even-one- mbind, set_mempolicy or get_mempolicy system
> call, they have an obscure code bug. They will be passing in
> a bitmask that they calculated using Choice B node numbering,
> but (implicitly) telling the kernel to interpret that bitmask
> using Choice A semantics.
But that is only done in libnuma. User code does not call this directly.
> Why do you prefer a per-system call modal flag, rather than a per-task
> modal flag?
It is a change of behavior of the function call. If some mysterious flag
somewhere influences that behavior then we have difficulties finding bugs.
The presence of the flag makes it obvious to the reviewer that we do
something special here.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-10-31 21:05 ` Christoph Lameter
@ 2007-11-01 4:47 ` Paul Jackson
2007-11-01 5:25 ` Christoph Lameter
2007-11-01 19:00 ` David Rientjes
0 siblings, 2 replies; 18+ messages in thread
From: Paul Jackson @ 2007-11-01 4:47 UTC (permalink / raw)
To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
Christoph, replying to pj:
> > Well, the mpol_nodemask_mode already is char. So I guess you're
> > asking if we should change 'policy' to type char as well.
>
> Right.
Ok - but why?
I don't see that it matters whether policy is short or char.
> > > Could we shorten the mpol_nodemask_mode to mode?
> >
> > Huh - I don't know what "shorten ... to mode" means. If it means
> > "shorten ... to char", then see my previous comment above.
>
> No it means shorted the identifier which is a bit long for a structure
> member.
Well, I called it "mpol_nodemask_mode" in the mempolicy struct because
I called it that same thing in the task_struct struct, where the longer
name is quite useful (the task struct covers a lot of ground; a brief
'mode' field name for this purpose would be too vague and short).
I guess the 'mpol_' prefix part of this struct mempolicy field name is
unnecessary. It's needed in the task struct, but not in the mempolicy
struct. So I can imagine shortening this mempolicy struct name from
"mpol_nodemask_mode" down to "nodemask_mode". But it really isn't a
generic "mode" field, so I'd be reluctant to make it that short.
Whether names should be long or short depends more to me on how they
are used in the code. I want the code to be easy to read. Sometimes
I use 30+ char names; sometimes 1 char names. There is a tendency for
external function names to be longer than local variable names or
structure field names, but that's not the only criteria.
> > > Hmmm... I would rather have numactl manage this flag
> > ..
>
> s/numactl/libnuma/
So you would rather have libnuma manage this flag?
As opposed to what else managing it? I'm still a tad confused on
what you're suggesting on this one.
> What bugs?
The ones I described further on in that message, involving computing
bitmasks using one Choice then making some of the mbind/set_mempolicy
calls with flags indicating the other Choice.
> But that is only done in libnuma. User code does not call this directly.
I disagree.
There are several mempolicy interfaces in use, including at least:
1) libnuma, which in turn is based on (5), below
2) the numactl command line utility (based on libnuma)
3) libcpuset (which has its own modest mempolicy interface)
4) the glibc mbind/set_mempolicy/get_mempolicy wrappers, in C code
5) roll your own mbind/*_mempolicy system call wrappers, in C code
> > Why do you prefer a per-system call modal flag, rather than a per-task
> > modal flag?
>
> It is a change of behavior of the function call.
Hmmm ... yes ... that's a problem. Either way seems to be a problem.
With the mode bit as in my patch, there are fewer places in the user
code that have to be gotten just right. With your way, each and
every mbind and *_mempolicy call has to be hacked with the new flag
if one is going to use the new nodemask bit numbering. Some of these
calls might be inside other routines or libraries that aren't readily
available to be examined or changed. If you miss one, or forget to
add one when adding more mbind or *_mempolicy calls in the future,
then you have a nasty lurking hidden performance bug due to misplaced
pages in certain configurations. That too is a serious maintenance
nightmare, as I've already tried to describe.
Looking at past history and prior examples, the closest I can think of
are the wait/wait2/wait3/wait4, mmap/mmap2, umount/umount2 and dup/dup2
calls. When these system calls changed, a new variant was introduced,
with a 2, 3, 4, ... suffix.
Perhaps that's the way to go:
mbind2, get_mempolicy2, and set_mempolicy2.
Then I could look at proposing a change to another detail of this
interface that has long seemed suboptimal to me, as described in:
http://lkml.org/lkml/2004/8/10/14
Subject: Re: [PATCH] get_nodes mask miscalculation
http://lkml.org/lkml/2004/9/12/250
Subject: more numa maxnode confusions
===
Unfortunately, I have a personal conflict with this work. I need to
start a three week vacation. Well, it should have started a few hours
ago.
So far as I can tell, there is no pressing emergency that this patch,
or its ancestor is addressing:
http://lkml.org/lkml/2007/10/25/449
Subject: [patch 2/2] cpusets: add interleave_over_allowed option
My access will be intermittent -- I'll be unplugging my setup in
California, and driving to Denton, Texas to work from there, still
for SGI. This internet thing that Mr. Gore invented is cool -- one
can work from most anywhere with power and a network connection.
I was hoping to put this effort to bed before this time off, but it
has gotten complicated enough that I won't be able to do that. Sorry.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 4:47 ` Paul Jackson
@ 2007-11-01 5:25 ` Christoph Lameter
2007-11-01 6:33 ` Paul Jackson
2007-11-01 19:00 ` David Rientjes
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2007-11-01 5:25 UTC (permalink / raw)
To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> Christoph, replying to pj:
> > > Well, the mpol_nodemask_mode already is char. So I guess you're
> > > asking if we should change 'policy' to type char as well.
> >
> > Right.
>
> Ok - but why?
>
> I don't see that it matters whether policy is short or char.
It looks strange if they have different types. Maybe use u8 for both?
> > s/numactl/libnuma/
>
> So you would rather have libnuma manage this flag?
>
> As opposed to what else managing it? I'm still a tad confused on
> what you're suggesting on this one.
You are managing it in the task struct. No need to. libnuma can handle it.
> > But that is only done in libnuma. User code does not call this directly.
>
> I disagree.
>
> There are several mempolicy interfaces in use, including at least:
> 1) libnuma, which in turn is based on (5), below
> 2) the numactl command line utility (based on libnuma)
> 3) libcpuset (which has its own modest mempolicy interface)
These are no problem: 1/2 are libnuma. No current version of libcpuset
is available.
> 4) the glibc mbind/set_mempolicy/get_mempolicy wrappers, in C code
> 5) roll your own mbind/*_mempolicy system call wrappers, in C code
Those exist? Never seen that.
> With the mode bit as in my patch, there are fewer places in the user
> code that have to be gotten just right. With your way, each and
> every mbind and *_mempolicy call has to be hacked with the new flag
> if one is going to use the new nodemask bit numbering. Some of these
Yes and that makes sure it is thought through and done right.
> Perhaps that's the way to go:
> mbind2, get_mempolicy2, and set_mempolicy2.
That would be okay from the backwards compat standpoint.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 5:25 ` Christoph Lameter
@ 2007-11-01 6:33 ` Paul Jackson
2007-11-01 8:03 ` David Rientjes
2007-11-01 13:07 ` Christoph Lameter
0 siblings, 2 replies; 18+ messages in thread
From: Paul Jackson @ 2007-11-01 6:33 UTC (permalink / raw)
To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
> You are managing it in the task struct. No need to. libnuma can handle it.
No - as noted, not all mempolicy system calls go via libnuma.
> No current version of libcpuset is available.
Wrong. It has not received wide publication yet, but it has been
provided to various others under LGPL license.
> Those exist? Never seen that.
I've seen such go by.
David R - is your use of the mbind and *_mempolicy system calls
via libnuma, or direct system calls?
I've fielded questions from others on how to use these system
calls (directly, not via libnuma) for several years now, both
from both inside and outside of SGI.
The glibc folks have spent some effort on refining the system
call wrappers they provide for these calls, which are not used by
libnuma (libnuma doesn't use the glibc wrappers for these calls.)
I presume that glibc work has at least some actual users.
Andi has encouraged everyone to use libnuma, but there is no
serious reason why others have to. It is not like it takes
thousands of lines of elaborate code to perform basic operations
using these calls.
A search of some old SGI release software sitting on an internal
server just now suggests that products with names including histx,
gru, libmpi and pcp might be directly invoking these system calls
... I didn't actually examine the source to determine whether
they really use these direct calls -- just got a grep hit.
I would not be surprised if some of the big batch schedulers
directly invoke these system calls, but don't know for sure.
In any event, we must assume that some use these system calls
directly. They have been available in Linux for several years.
> > With the mode bit as in my patch, there are fewer places in the user
> > code that have to be gotten just right. With your way, each and
> > every mbind and *_mempolicy call has to be hacked with the new flag
> > if one is going to use the new nodemask bit numbering. Some of these
>
> Yes and that makes sure it is thought through and done right.
Maybe for you. Not for the rest of us error prone mere mortals.
Forcing coders to specify the same detail in multiple places, when
there is no way to validate their consistency, doesn't force them
to think or do it right. It increases the error rate due to
inconsistencies. If you have to select a binary mode once, you've got
a 50-50 chance of getting it correct with just a random try, and a
half-way decent chance of noticing an error, since it would affect all
such calls. If you have to say it ten times scattered about your code
and the code of others you are calling and working with, that is being
maintained by various people coming and going over many years, and all
must be consistent, and there is -no- feedback on errors short of subtle
performance issues that only show up on certain non-trivial
configurations doing the particular sequence of calls that hit
the necessary miscoded places, then you've got about one chance in
2**10 of getting them all the same and correct, given random poking,
and close to zero chance of noticing that error in a timely fashion
when it does occur. Admittedly, humans aren't random; but we're no
model of perfection either.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 6:33 ` Paul Jackson
@ 2007-11-01 8:03 ` David Rientjes
2007-11-01 13:07 ` Christoph Lameter
1 sibling, 0 replies; 18+ messages in thread
From: David Rientjes @ 2007-11-01 8:03 UTC (permalink / raw)
To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> David R - is your use of the mbind and *_mempolicy system calls
> via libnuma, or direct system calls?
>
I hope to be able to use libnuma exclusively once your fix is in place so
that the interleaving behaves the way we want while attached to a changing
cpuset. (And it would be preferable to not have to use a special version
of it that is hacked to support the fix.) It's just simpler to use as an
interface, but I wouldn't consider using the system calls directly an
error.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 6:33 ` Paul Jackson
2007-11-01 8:03 ` David Rientjes
@ 2007-11-01 13:07 ` Christoph Lameter
2007-11-01 16:06 ` Paul Jackson
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2007-11-01 13:07 UTC (permalink / raw)
To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> > You are managing it in the task struct. No need to. libnuma can handle it.
>
> No - as noted, not all mempolicy system calls go via libnuma.
Well then show me.
> > No current version of libcpuset is available.
>
> Wrong. It has not received wide publication yet, but it has been
> provided to various others under LGPL license.
The last version that I remember was for 2.4.x.
> A search of some old SGI release software sitting on an internal
> server just now suggests that products with names including histx,
> gru, libmpi and pcp might be directly invoking these system calls
> ... I didn't actually examine the source to determine whether
> they really use these direct calls -- just got a grep hit.
A good argument to leave the API unchanged and not create magic task
flags.
> > > With the mode bit as in my patch, there are fewer places in the user
> > > code that have to be gotten just right. With your way, each and
> > > every mbind and *_mempolicy call has to be hacked with the new flag
> > > if one is going to use the new nodemask bit numbering. Some of these
> >
> > Yes and that makes sure it is thought through and done right.
>
> Maybe for you. Not for the rest of us error prone mere mortals.
> Forcing coders to specify the same detail in multiple places, when
> there is no way to validate their consistency, doesn't force them
> to think or do it right. It increases the error rate due to
There are always wrappers for system calls. The flags will be set in
these.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 13:07 ` Christoph Lameter
@ 2007-11-01 16:06 ` Paul Jackson
2007-11-01 17:07 ` Christoph Lameter
2007-11-01 19:03 ` David Rientjes
0 siblings, 2 replies; 18+ messages in thread
From: Paul Jackson @ 2007-11-01 16:06 UTC (permalink / raw)
To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
Christoph, replying to pj:
> > Wrong. It has not received wide publication yet, but it has been
> > provided to various others under LGPL license.
>
> The last version that I remember was for 2.4.x.
You might be recalling something called libcpumemset, which is about
five years old. The library known as libcpuset is SGI's current LGPL
library for use with Linux 2.6 cpusets.
> > Forcing coders to specify the same detail in multiple places, when
> > there is no way to validate their consistency, doesn't force them
> > to think or do it right. It increases the error rate due to
>
> There are always wrappers for system calls. The flags will be set in
> these.
We were discussing libnuma here, not glibc. The system call wrappers
are in glibc. System call wrappers should not be setting optional
flags. They should just make the system call -- do whatever magic it
takes to get the provided arguments into the right registers or other
conventionally determined places, and invoke the necessary machine
instruction to trap into the kernel.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 16:06 ` Paul Jackson
@ 2007-11-01 17:07 ` Christoph Lameter
2007-11-01 17:26 ` Paul Jackson
2007-11-01 19:03 ` David Rientjes
1 sibling, 1 reply; 18+ messages in thread
From: Christoph Lameter @ 2007-11-01 17:07 UTC (permalink / raw)
To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
On Thu, 1 Nov 2007, Paul Jackson wrote:
> > > Forcing coders to specify the same detail in multiple places, when
> > > there is no way to validate their consistency, doesn't force them
> > > to think or do it right. It increases the error rate due to
> >
> > There are always wrappers for system calls. The flags will be set in
> > these.
>
> We were discussing libnuma here, not glibc. The system call wrappers
> are in glibc. System call wrappers should not be setting optional
> flags. They should just make the system call -- do whatever magic it
> takes to get the provided arguments into the right registers or other
> conventionally determined places, and invoke the necessary machine
> instruction to trap into the kernel.
The library interface can set flags to modify behavior.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 17:07 ` Christoph Lameter
@ 2007-11-01 17:26 ` Paul Jackson
2007-11-01 17:38 ` Lee Schermerhorn
` (2 more replies)
0 siblings, 3 replies; 18+ messages in thread
From: Paul Jackson @ 2007-11-01 17:26 UTC (permalink / raw)
To: Christoph Lameter; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
Christoph wrote:
> The library interface can set flags to modify behavior.
A library such as libnuma can set them, yes, but not everyone uses
libnuma. Basically everyone uses the standard C library, glibc, which
has the system call wrappers, but these wrappers should not be setting
optional flags.
We're going around in circles here, Christoph.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 17:26 ` Paul Jackson
@ 2007-11-01 17:38 ` Lee Schermerhorn
2007-11-01 17:44 ` Christoph Lameter
2007-11-01 19:06 ` David Rientjes
2 siblings, 0 replies; 18+ messages in thread
From: Lee Schermerhorn @ 2007-11-01 17:38 UTC (permalink / raw)
To: Paul Jackson
Cc: Christoph Lameter, rientjes, linux-kernel, ak, Michael Kerrisk
On Thu, 2007-11-01 at 10:26 -0700, Paul Jackson wrote:
> Christoph wrote:
> > The library interface can set flags to modify behavior.
>
> A library such as libnuma can set them, yes, but not everyone uses
> libnuma. Basically everyone uses the standard C library, glibc, which
> has the system call wrappers, but these wrappers should not be setting
> optional flags.
>
> We're going around in circles here, Christoph.
I think that the syscall man pages can document the behavior mode flag
for folks who want to use the "raw" interface. I think we already
recommend the use of libnuma APIs. [If not we can make it so, if folks
agree.]
So, we default to old behavior in the raw syscall APIs--we MUST, right?
"no breaky user APIs..."--and let new version of the library/ies enable
new behavior when appropriate. Even a "new syscall", such as the
set_mempolicy2(), et al that you suggested, could be just wrappers over
the existing ones with the behavior mod flag. Or vice versa.
Lee
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 17:26 ` Paul Jackson
2007-11-01 17:38 ` Lee Schermerhorn
@ 2007-11-01 17:44 ` Christoph Lameter
2007-11-01 19:06 ` David Rientjes
2 siblings, 0 replies; 18+ messages in thread
From: Christoph Lameter @ 2007-11-01 17:44 UTC (permalink / raw)
To: Paul Jackson; +Cc: rientjes, Lee.Schermerhorn, linux-kernel, ak
On Thu, 1 Nov 2007, Paul Jackson wrote:
> Christoph wrote:
> > The library interface can set flags to modify behavior.
>
> A library such as libnuma can set them, yes, but not everyone uses
> libnuma. Basically everyone uses the standard C library, glibc, which
> has the system call wrappers, but these wrappers should not be setting
> optional flags.
>
> We're going around in circles here, Christoph.
Yes and you keep missing the point focusing on stuff that is not relevant.
Where did you get the rule that libraries should not be setting flags?
Libraries do all sorts of conversion before calling the kernel API.
Look, the libraries are a strong argument against the method of setting
task flags. If you are using multiple libraries some of which have been
updated and some of those which have not then setting a task flag can
have bad consequences. You want clean syscall behavior.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 17:26 ` Paul Jackson
2007-11-01 17:38 ` Lee Schermerhorn
2007-11-01 17:44 ` Christoph Lameter
@ 2007-11-01 19:06 ` David Rientjes
2 siblings, 0 replies; 18+ messages in thread
From: David Rientjes @ 2007-11-01 19:06 UTC (permalink / raw)
To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, linux-kernel, ak
On Thu, 1 Nov 2007, Paul Jackson wrote:
> A library such as libnuma can set them, yes, but not everyone uses
> libnuma. Basically everyone uses the standard C library, glibc, which
> has the system call wrappers, but these wrappers should not be setting
> optional flags.
>
I think what would end up happening is that additional functions would be
added to libnuma that would effect the system-wide nodemask numbering by
simply OR'ing the MPOL_F_MODE_SYS_WIDE flag as part of the mode actual.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 16:06 ` Paul Jackson
2007-11-01 17:07 ` Christoph Lameter
@ 2007-11-01 19:03 ` David Rientjes
1 sibling, 0 replies; 18+ messages in thread
From: David Rientjes @ 2007-11-01 19:03 UTC (permalink / raw)
To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, linux-kernel, ak
On Thu, 1 Nov 2007, Paul Jackson wrote:
> We were discussing libnuma here, not glibc. The system call wrappers
> are in glibc. System call wrappers should not be setting optional
> flags. They should just make the system call -- do whatever magic it
> takes to get the provided arguments into the right registers or other
> conventionally determined places, and invoke the necessary machine
> instruction to trap into the kernel.
>
So there is no problem with allowing modal flags to be passed to
set_mempolicy() because glibc will respect the mode actual and pass it
right along to the kernel. That gives the power to the programmer to
specify whether he or she, based on the updated documentation, wants the
nodemask to be interpreted in terms of the system or cpuset.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-11-01 4:47 ` Paul Jackson
2007-11-01 5:25 ` Christoph Lameter
@ 2007-11-01 19:00 ` David Rientjes
1 sibling, 0 replies; 18+ messages in thread
From: David Rientjes @ 2007-11-01 19:00 UTC (permalink / raw)
To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> With the mode bit as in my patch, there are fewer places in the user
> code that have to be gotten just right. With your way, each and
> every mbind and *_mempolicy call has to be hacked with the new flag
> if one is going to use the new nodemask bit numbering.
That code is going to have to be hacked anyway to use the new nodemask
semantics, so it can easily add a flag to the set_mempolicy() call for the
system-wide node numbering. This then only requires a small addition to
the documentation: use MPOL_F_MODE_SYS_WIDE for system-wide node numbering
that isn't constrained to your cpuset.
> Some of these
> calls might be inside other routines or libraries that aren't readily
> available to be examined or changed. If you miss one, or forget to
> add one when adding more mbind or *_mempolicy calls in the future,
> then you have a nasty lurking hidden performance bug due to misplaced
> pages in certain configurations. That too is a serious maintenance
> nightmare, as I've already tried to describe.
>
That's a very legitimate concern, but those libraries will eventually need
to be made to support this new extension anyway. They will be modified
just like we're modifying the kernel once people want to start using the
different nodemask semantics. As mempolicy modes become more popular,
those libraries are going to start accepting custom mode flags to pass to
their set_mempolicy() wrappers that will get OR'd with the mempolicy mode
that is used. It will be the natural progression of how mempolicies are
supported in userspace.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] cpuset relative memory policies - second choice
2007-10-31 20:19 ` Paul Jackson
2007-10-31 21:05 ` Christoph Lameter
@ 2007-11-01 18:52 ` David Rientjes
1 sibling, 0 replies; 18+ messages in thread
From: David Rientjes @ 2007-11-01 18:52 UTC (permalink / raw)
To: Paul Jackson; +Cc: Christoph Lameter, Lee.Schermerhorn, linux-kernel, ak
On Wed, 31 Oct 2007, Paul Jackson wrote:
> The basic reason that I went with an additional per-task modal
> state, rather than a modal flag for each mbind, set_mempolicy and
> get_mempolicy call was to reduce the likely rate of bugs in user
> level C code using this API.
>
I think it may be more error prone to accidentally leave off the
get_mempolicy() system call to use the system-wide numbering or for
user-level code to forget that the mode was already set (and now stored in
their task_struct) without a subsequent get_mempolicy() to revert back to
the default behavior. Both of these problems can be addressed by checking
the *policy returned by get_mempolicy(), as you've coded it, but many
applications will probably ignore that overhead.
Allowing the mode to be passed on each set_mempolicy() system call seems
better, this is where the nodemask is passed anyway so it's legitimate for
the caller to specify how that nodemask should be interpreted (either
system-wide or cpuset-wide). This keeps all semantics of the nodemask to
a single invocation of a system call instead of setting policy modes with
get_mempolicy() and confusing the matter later.
I think get_mempolicy() can remain unchanged because it will simply return
the contextualized nodemask in either case and would not require a mode to
be passed.
David
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2007-11-01 19:06 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-10-31 6:17 [RFC] cpuset relative memory policies - second choice Paul Jackson
2007-10-31 18:59 ` Christoph Lameter
2007-10-31 20:19 ` Paul Jackson
2007-10-31 21:05 ` Christoph Lameter
2007-11-01 4:47 ` Paul Jackson
2007-11-01 5:25 ` Christoph Lameter
2007-11-01 6:33 ` Paul Jackson
2007-11-01 8:03 ` David Rientjes
2007-11-01 13:07 ` Christoph Lameter
2007-11-01 16:06 ` Paul Jackson
2007-11-01 17:07 ` Christoph Lameter
2007-11-01 17:26 ` Paul Jackson
2007-11-01 17:38 ` Lee Schermerhorn
2007-11-01 17:44 ` Christoph Lameter
2007-11-01 19:06 ` David Rientjes
2007-11-01 19:03 ` David Rientjes
2007-11-01 19:00 ` David Rientjes
2007-11-01 18:52 ` David Rientjes
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox