* [PATCH 01/10] cpuset: remove redundant check in cpuset_cpus_allowed_fallback()
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
@ 2013-04-19 12:25 ` Li Zefan
2013-04-19 12:25 ` [PATCH 02/10] cpuset: cleanup guarantee_online_{cpus|mems}() Li Zefan
` (8 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:25 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
task_cs() will never return NULL.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 866d78e..4a62dcc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2259,8 +2259,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs = task_cs(tsk);
- if (cs)
- do_set_cpus_allowed(tsk, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 02/10] cpuset: cleanup guarantee_online_{cpus|mems}()
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-04-19 12:25 ` [PATCH 01/10] cpuset: remove redundant check in cpuset_cpus_allowed_fallback() Li Zefan
@ 2013-04-19 12:25 ` Li Zefan
2013-04-19 12:26 ` [PATCH 03/10] cpuset: remove unnecessary variable in cpuset_attach() Li Zefan
` (7 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:25 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
- We never pass a NULL @cs to these functions.
- The top cpuset always has some online cpus/mems.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 29 +++++++----------------------
1 file changed, 7 insertions(+), 22 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4a62dcc..6f9d5e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -304,53 +304,38 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_mask.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
* Call with callback_mutex held.
*/
-
static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask)
{
- while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
cs = parent_cs(cs);
- if (cs)
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
- else
- cpumask_copy(pmask, cpu_online_mask);
- BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+ cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
}
/*
* Return in *pmask the portion of a cpusets's mems_allowed that
* are online, with memory. If none are online with memory, walk
* up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_MEMORY].
+ * online mems. The top cpuset always has some mems online.
*
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
* Call with callback_mutex held.
*/
-
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
- while (cs && !nodes_intersects(cs->mems_allowed,
- node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
cs = parent_cs(cs);
- if (cs)
- nodes_and(*pmask, cs->mems_allowed,
- node_states[N_MEMORY]);
- else
- *pmask = node_states[N_MEMORY];
- BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
+ nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
}
/*
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 03/10] cpuset: remove unnecessary variable in cpuset_attach()
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-04-19 12:25 ` [PATCH 01/10] cpuset: remove redundant check in cpuset_cpus_allowed_fallback() Li Zefan
2013-04-19 12:25 ` [PATCH 02/10] cpuset: cleanup guarantee_online_{cpus|mems}() Li Zefan
@ 2013-04-19 12:26 ` Li Zefan
2013-04-19 12:26 ` [PATCH 04/10] cpuset: remove cpuset_test_cpumask() Li Zefan
` (6 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:26 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
We can just use oldcs->mems_allowed.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6f9d5e0..66302a9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1406,8 +1406,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
- /* static bufs protected by cpuset_mutex */
- static nodemask_t cpuset_attach_nodemask_from;
+ /* static buf protected by cpuset_mutex */
static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm;
struct task_struct *task;
@@ -1441,13 +1440,12 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_from = oldcs->mems_allowed;
cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed,
&cpuset_attach_nodemask_to);
mmput(mm);
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 04/10] cpuset: remove cpuset_test_cpumask()
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (2 preceding siblings ...)
2013-04-19 12:26 ` [PATCH 03/10] cpuset: remove unnecessary variable in cpuset_attach() Li Zefan
@ 2013-04-19 12:26 ` Li Zefan
[not found] ` <517137E7.20505-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
2013-04-19 12:27 ` [PATCH 05/10] cpuset: don't update tasks' cpumask and nodemask in an empty cpuset Li Zefan
` (5 subsequent siblings)
9 siblings, 1 reply; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:26 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
The test is done in set_cpus_allowed_ptr(), so it's redundant.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 66302a9..8b0e433 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -783,23 +783,6 @@ void rebuild_sched_domains(void)
}
/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
- *
- * Call with cpuset_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
- */
-static int cpuset_test_cpumask(struct task_struct *tsk,
- struct cgroup_scanner *scan)
-{
- return !cpumask_equal(&tsk->cpus_allowed,
- (cgroup_cs(scan->cg))->cpus_allowed);
-}
-
-/**
* cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
* @tsk: task to test
* @scan: struct cgroup_scanner containing the cgroup of the task
@@ -834,7 +817,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
struct cgroup_scanner scan;
scan.cg = cs->css.cgroup;
- scan.test_task = cpuset_test_cpumask;
+ scan.test_task = NULL;
scan.process_task = cpuset_change_cpumask;
scan.heap = heap;
cgroup_scan_tasks(&scan);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 05/10] cpuset: don't update tasks' cpumask and nodemask in an empty cpuset
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (3 preceding siblings ...)
2013-04-19 12:26 ` [PATCH 04/10] cpuset: remove cpuset_test_cpumask() Li Zefan
@ 2013-04-19 12:27 ` Li Zefan
2013-04-19 12:27 ` [PATCH 06/10] cpuset: re-structure update_cpumask() a bit Li Zefan
` (4 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:27 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
I think this was introduced unintentionally when cpuset hotplug was
made asynchronous. Fortunately it does no harm, as updating tasks'
cpumask will just return failure and there's a guarantee_online_mems()
when updating nodemask, and then the tasks will be moved to an ancestor
cpuset.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8b0e433..2e4c5af 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2013,7 +2013,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask(cs, NULL);
+
+ if (!cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs, NULL);
}
/* remove offline mems from @cs */
@@ -2022,7 +2024,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, &tmp_mems, NULL);
+
+ if (!nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs, &tmp_mems, NULL);
}
is_empty = cpumask_empty(cs->cpus_allowed) ||
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 06/10] cpuset: re-structure update_cpumask() a bit
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (4 preceding siblings ...)
2013-04-19 12:27 ` [PATCH 05/10] cpuset: don't update tasks' cpumask and nodemask in an empty cpuset Li Zefan
@ 2013-04-19 12:27 ` Li Zefan
2013-04-19 12:28 ` [PATCH 07/10] cpuset: record old_mems_allowed in struct cpuset Li Zefan
` (3 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:27 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
Check if cpus_allowed is to be changed before calling validate_change().
This won't change any behavior, but later it will allow us to do this:
# mkdir /cpuset/child
# echo $$ > /cpuset/child/tasks /* empty cpuset */
# echo > /cpuset/child/cpuset.cpus /* do nothing, won't fail */
Without this patch, the last operation will fail.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2e4c5af..63a5343 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -855,14 +855,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
return -EINVAL;
}
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
+ retval = validate_change(cs, trialcs);
+ if (retval < 0)
+ return retval;
+
retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
if (retval)
return retval;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 07/10] cpuset: record old_mems_allowed in struct cpuset
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (5 preceding siblings ...)
2013-04-19 12:27 ` [PATCH 06/10] cpuset: re-structure update_cpumask() a bit Li Zefan
@ 2013-04-19 12:28 ` Li Zefan
2013-04-19 12:28 ` [PATCH 08/10] cpuset: introduce effective_{cpumask|nodemask}_cpuset() Li Zefan
` (2 subsequent siblings)
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:28 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
When we update a cpuset's mems_allowed and thus update tasks'
mems_allowed, it's required to pass the old mems_allowed and new
mems_allowed to cpuset_migrate_mm().
Currently we save old mems_allowed in a temp local variable before
changing cpuset->mems_allowed. This patch changes it by saving
old mems_allowed in cpuset->old_mems_allowed.
This currently won't change any behavior, but it will later allow
us to keep tasks in empty cpusets.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 62 +++++++++++++++++++++++++++++++++------------------------
1 file changed, 36 insertions(+), 26 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 63a5343..c4f9ebd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,18 @@ struct cpuset {
cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+ /*
+ * This is old Memory Nodes tasks took on.
+ *
+ * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+ * - A new cpuset's old_mems_allowed is initialized when some
+ * task is moved into it.
+ * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+ * cpuset.mems_allowed and have tasks' nodemask updated, and
+ * then old_mems_allowed is updated to mems_allowed.
+ */
+ nodemask_t old_mems_allowed;
+
struct fmeter fmeter; /* memory_pressure filter */
/*
@@ -975,16 +987,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
{
+ struct cpuset *cs = cgroup_cs(scan->cg);
struct mm_struct *mm;
- struct cpuset *cs;
int migrate;
- const nodemask_t *oldmem = scan->data;
- static nodemask_t newmems; /* protected by cpuset_mutex */
-
- cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, &newmems);
+ nodemask_t *newmems = scan->data;
- cpuset_change_task_nodemask(p, &newmems);
+ cpuset_change_task_nodemask(p, newmems);
mm = get_task_mm(p);
if (!mm)
@@ -994,7 +1002,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+ cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
mmput(mm);
}
@@ -1003,25 +1011,26 @@ static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
-static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
- struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
{
+ static nodemask_t newmems; /* protected by cpuset_mutex */
struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
+ guarantee_online_mems(cs, &newmems);
+
scan.cg = cs->css.cgroup;
scan.test_task = NULL;
scan.process_task = cpuset_change_nodemask;
scan.heap = heap;
- scan.data = (nodemask_t *)oldmem;
+ scan.data = &newmems;
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1035,6 +1044,12 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
*/
cgroup_scan_tasks(&scan);
+ /*
+ * All the tasks' nodemasks have been updated, update
+ * cs->old_mems_allowed.
+ */
+ cs->old_mems_allowed = newmems;
+
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
}
@@ -1055,13 +1070,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
int retval;
struct ptr_heap heap;
- if (!oldmem)
- return -ENOMEM;
-
/*
* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
* it's read-only
@@ -1090,8 +1101,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
}
}
- *oldmem = cs->mems_allowed;
- if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
+
+ if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
@@ -1107,11 +1118,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, oldmem, &heap);
+ update_tasks_nodemask(cs, &heap);
heap_free(&heap);
done:
- NODEMASK_FREE(oldmem);
return retval;
}
@@ -1424,7 +1434,6 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
@@ -1434,6 +1443,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
mmput(mm);
}
+ cs->old_mems_allowed = cpuset_attach_nodemask_to;
+
cs->attach_in_progress--;
/*
@@ -2000,7 +2011,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
static cpumask_t off_cpus;
- static nodemask_t off_mems, tmp_mems;
+ static nodemask_t off_mems;
struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
bool is_empty;
@@ -2021,13 +2032,12 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
/* remove offline mems from @cs */
if (!nodes_empty(off_mems)) {
- tmp_mems = cs->mems_allowed;
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
mutex_unlock(&callback_mutex);
if (!nodes_empty(cs->mems_allowed))
- update_tasks_nodemask(cs, &tmp_mems, NULL);
+ update_tasks_nodemask(cs, NULL);
}
is_empty = cpumask_empty(cs->cpus_allowed) ||
@@ -2119,11 +2129,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
- tmp_mems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+ update_tasks_nodemask(&top_cpuset, NULL);
}
/* if cpus or mems went down, we need to propagate to descendants */
@@ -2196,6 +2205,7 @@ void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
+ top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 08/10] cpuset: introduce effective_{cpumask|nodemask}_cpuset()
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (6 preceding siblings ...)
2013-04-19 12:28 ` [PATCH 07/10] cpuset: record old_mems_allowed in struct cpuset Li Zefan
@ 2013-04-19 12:28 ` Li Zefan
2013-04-19 12:29 ` [PATCH 09/10] cpuset: allow to keep tasks in empty cpusets Li Zefan
2013-04-19 12:29 ` [PATCH 10/10] cpuset: allow to move tasks to " Li Zefan
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:28 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
effective_cpumask_cpuset() returns an ancestor cpuset which has
non-empty cpumask.
If a cpuset is empty and the tasks in it need to update their
cpus_allowed, they take on the ancestor cpuset's cpumask.
This currently won't change any behavior, but it will later allow us
to keep tasks in empty cpusets.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 65 insertions(+), 11 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c4f9ebd..741e652 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -794,6 +794,45 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
+ *
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ * if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
+ *
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
+ */
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
+{
+ while (cpumask_empty(cs->cpus_allowed))
+ cs = parent_cs(cs);
+ return cs;
+}
+
+/*
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
+ *
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty memss. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ * if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
+ *
+ * Called with cpuset_mutex held.
+ */
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
+{
+ while (nodes_empty(cs->mems_allowed))
+ cs = parent_cs(cs);
+ return cs;
+}
+
/**
* cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
* @tsk: task to test
@@ -808,7 +847,10 @@ void rebuild_sched_domains(void)
static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
- set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+ struct cpuset *cpus_cs;
+
+ cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
+ set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
}
/**
@@ -923,12 +965,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
+ struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
- guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &tsk->mems_allowed);
}
/*
@@ -1021,10 +1065,11 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
struct cgroup_scanner scan;
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(cs, &newmems);
+ guarantee_online_mems(mems_cs, &newmems);
scan.cg = cs->css.cgroup;
scan.test_task = NULL;
@@ -1408,6 +1453,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp);
+ struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+ struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
mutex_lock(&cpuset_mutex);
@@ -1415,9 +1462,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cs, cpus_attach);
+ guarantee_online_cpus(cpus_cs, cpus_attach);
- guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, cgrp, tset) {
/*
@@ -1436,9 +1483,11 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
*/
mm = get_task_mm(leader);
if (mm) {
+ struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed,
+ cpuset_migrate_mm(mm, &mems_oldcs->mems_allowed,
&cpuset_attach_nodemask_to);
mmput(mm);
}
@@ -2227,20 +2276,23 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
+ struct cpuset *cpus_cs;
+
mutex_lock(&callback_mutex);
task_lock(tsk);
- guarantee_online_cpus(task_cs(tsk), pmask);
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ guarantee_online_cpus(cpus_cs, pmask);
task_unlock(tsk);
mutex_unlock(&callback_mutex);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- const struct cpuset *cs;
+ const struct cpuset *cpus_cs;
rcu_read_lock();
- cs = task_cs(tsk);
- do_set_cpus_allowed(tsk, cs->cpus_allowed);
+ cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+ do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2279,11 +2331,13 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
+ struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
task_lock(tsk);
- guarantee_online_mems(task_cs(tsk), &mask);
+ mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+ guarantee_online_mems(mems_cs, &mask);
task_unlock(tsk);
mutex_unlock(&callback_mutex);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 09/10] cpuset: allow to keep tasks in empty cpusets
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (7 preceding siblings ...)
2013-04-19 12:28 ` [PATCH 08/10] cpuset: introduce effective_{cpumask|nodemask}_cpuset() Li Zefan
@ 2013-04-19 12:29 ` Li Zefan
2013-04-19 12:29 ` [PATCH 10/10] cpuset: allow to move tasks to " Li Zefan
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:29 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
To achieve this:
- We call update_tasks_cpumask/nodemask() for empty cpusets when
hotplug happens, instead of moving tasks out of them.
- When a cpuset's masks are changed by writing cpuset.cpus/mems,
we also update tasks in child cpusets which are empty.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 146 insertions(+), 25 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 741e652..95e9394 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,9 @@ struct cpuset {
*/
nodemask_t old_mems_allowed;
+ /* used in cpuset_update_nodemask_workfn() */
+ struct ptr_heap *heap;
+
struct fmeter fmeter; /* memory_pressure filter */
/*
@@ -114,6 +117,7 @@ struct cpuset {
int relax_domain_level;
struct work_struct hotplug_work;
+ struct work_struct update_nodemask_work;
};
/* Retrieve the cpuset for a cgroup */
@@ -276,6 +280,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
+static struct workqueue_struct *cpuset_update_nodemask_wq;
+
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -877,6 +883,39 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
cgroup_scan_tasks(&scan);
}
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+ bool update_root, struct ptr_heap *heap)
+{
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
+
+ if (update_root)
+ update_tasks_cpumask(root_cs, heap);
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!cpumask_empty(cp->cpus_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ continue;
+ }
+
+ update_tasks_cpumask(cp, heap);
+ }
+ rcu_read_unlock();
+}
+
/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
@@ -928,11 +967,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- /*
- * Scan tasks in the cpuset, and update the cpumasks of any
- * that need an update.
- */
- update_tasks_cpumask(cs, &heap);
+ update_tasks_cpumask_hier(cs, true, &heap);
heap_free(&heap);
@@ -1099,6 +1134,78 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
cpuset_being_rebound = NULL;
}
+static void cpuset_update_nodemask_workfn(struct work_struct *work)
+{
+ struct cpuset *cs = container_of(work, struct cpuset,
+ update_nodemask_work);
+
+ update_tasks_nodemask(cs, cs->heap);
+ css_put(&cs->css);
+}
+
+static void schedule_update_tasks_nodemask(struct cpuset *cs,
+ struct ptr_heap *heap)
+{
+ bool queued;
+
+ /* Will be released when the work item finishes executing. */
+ if (!css_tryget(&cs->css))
+ return;
+
+ /*
+ * The caller will flush the workqueue with cpuset_mutex held,
+ * so it's not possible a work item was already queued, and
+ * we're sure cs->heap is valid.
+ */
+ cs->heap = heap;
+ queued = queue_work(cpuset_update_nodemask_wq,
+ &cs->update_nodemask_work);
+ if (queued) {
+ WARN_ON(1);
+ css_put(&cs->css);
+ }
+}
+
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+ bool update_root, struct ptr_heap *heap)
+{
+ struct cpuset *cp;
+ struct cgroup *pos_cgrp;
+
+ if (update_root)
+ update_tasks_nodemask(root_cs, heap);
+
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+ /* skip the whole subtree if @cp have some CPU */
+ if (!nodes_empty(cp->mems_allowed)) {
+ pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+ continue;
+ }
+
+ schedule_update_tasks_nodemask(cp, heap);
+ }
+ rcu_read_unlock();
+
+ /*
+ * The only reason we use workqueue is update_tasks_nodemask() can't
+ * be called in rcu_read_lock(). Flush the workqueue to make sure
+ * all the updates are done before we return.
+ */
+ flush_workqueue(cpuset_update_nodemask_wq);
+}
+
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
@@ -1163,7 +1270,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(cs, &heap);
+ update_tasks_nodemask_hier(cs, true, &heap);
heap_free(&heap);
done:
@@ -1888,6 +1995,7 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter);
INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
+ INIT_WORK(&cs->update_nodemask_work, cpuset_update_nodemask_workfn);
cs->relax_domain_level = -1;
return &cs->css;
@@ -2063,31 +2171,36 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
static nodemask_t off_mems;
struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
bool is_empty;
+ bool sane = cgroup_sane_behavior(cs->css.cgroup);
mutex_lock(&cpuset_mutex);
cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
- /* remove offline cpus from @cs */
- if (!cpumask_empty(&off_cpus)) {
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
+ mutex_lock(&callback_mutex);
+ cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+ mutex_unlock(&callback_mutex);
- if (!cpumask_empty(cs->cpus_allowed))
- update_tasks_cpumask(cs, NULL);
- }
+ /*
+ * If sane_behavior flag is set, we need to update tasks' cpumask
+ * for empy cpuset to take on ancestor's cpumask
+ */
+ if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+ (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
+ update_tasks_cpumask(cs, NULL);
- /* remove offline mems from @cs */
- if (!nodes_empty(off_mems)) {
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
+ mutex_lock(&callback_mutex);
+ nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+ mutex_unlock(&callback_mutex);
- if (!nodes_empty(cs->mems_allowed))
- update_tasks_nodemask(cs, NULL);
- }
+ /*
+ * If sane_behavior flag is set, we need to update tasks' nodemask
+ * for empy cpuset to take on ancestor's nodemask
+ */
+ if ((sane && nodes_empty(cs->mems_allowed)) ||
+ (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+ update_tasks_nodemask(cs, NULL);
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
@@ -2095,11 +2208,13 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
mutex_unlock(&cpuset_mutex);
/*
- * If @cs became empty, move tasks to the nearest ancestor with
- * execution resources. This is full cgroup operation which will
+ * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+ *
+ * Otherwise move tasks to the nearest ancestor with execution
+ * resources. This is full cgroup operation which will
* also call back into cpuset. Should be done outside any lock.
*/
- if (is_empty)
+ if (!sane && is_empty)
remove_tasks_in_empty_cpuset(cs);
/* the following may free @cs, should be the last operation */
@@ -2174,6 +2289,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
+ update_tasks_cpumask_hier(&top_cpuset, false, NULL);
}
/* synchronize mems_allowed to N_MEMORY */
@@ -2182,6 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset, NULL);
+ update_tasks_nodemask_hier(&top_cpuset, false, NULL);
}
/* if cpus or mems went down, we need to propagate to descendants */
@@ -2261,6 +2378,10 @@ void __init cpuset_init_smp(void)
cpuset_propagate_hotplug_wq =
alloc_ordered_workqueue("cpuset_hotplug", 0);
BUG_ON(!cpuset_propagate_hotplug_wq);
+
+ cpuset_update_nodemask_wq =
+ create_workqueue("cpuset_update_nodemask");
+ BUG_ON(!cpuset_update_nodemask_wq);
}
/**
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread* [PATCH 10/10] cpuset: allow to move tasks to empty cpusets
[not found] ` <517137A7.4020605-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
` (8 preceding siblings ...)
2013-04-19 12:29 ` [PATCH 09/10] cpuset: allow to keep tasks in empty cpusets Li Zefan
@ 2013-04-19 12:29 ` Li Zefan
9 siblings, 0 replies; 17+ messages in thread
From: Li Zefan @ 2013-04-19 12:29 UTC (permalink / raw)
To: Tejun Heo; +Cc: Cgroups, Containers, LKML
Currently some cpuset behaviors are not friendly when cpuset is co-mounted
with other cgroup controllers.
Now with this patchset if cpuset is mounted with sane_behavior option,
it behaves differently:
- Tasks will be kept in empty cpusets when hotplug happens and take
masks of ancestors with non-empty cpus/mems, instead of being moved to
an ancestor.
- A task can be moved into an empty cpuset, and again it takes masks of
ancestors, so we drop a task into a newly created cgroup without the
user having to do anything for it.
As tasks can reside in empy cpusets, here're some rules:
- They can be moved to another cpuset, regardless it's empty or not.
- Though it takes masks from ancestors, it takes other configs from the
empty cpuset.
- If the ancestors' masks are changed, those tasks will also be updated
to take new masks.
Signed-off-by: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
---
kernel/cpuset.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 95e9394..646c794 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -489,7 +489,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
*/
ret = -ENOSPC;
if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
- (cpumask_empty(trial->cpus_allowed) ||
+ (cpumask_empty(trial->cpus_allowed) &&
nodes_empty(trial->mems_allowed)))
goto out;
@@ -1502,8 +1502,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
mutex_lock(&cpuset_mutex);
+ /*
+ * We allow to move tasks into an empty cpuset if sane_behavior
+ * flag is set.
+ */
ret = -ENOSPC;
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ if (!cgroup_sane_behavior(cgrp) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1608,7 +1613,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* propagation if @cs doesn't have any CPU or memory. It will move
* the newly added tasks to the nearest parent which can execute.
*/
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ if (!cgroup_sane_behavior(cgrp) &&
+ (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
schedule_cpuset_propagate_hotplug(cs);
mutex_unlock(&cpuset_mutex);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 17+ messages in thread