* [PATCH 0/4] cpuset: cleanups and fixes
@ 2009-02-26 7:02 Li Zefan
2009-02-26 7:02 ` [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner Li Zefan
` (4 more replies)
0 siblings, 5 replies; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:02 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, LKML
This patchset mainly simplifies update_tasks_nodemask() and makes it
return void (won't fail), thus fix a case that writing to cpuset.mems
returns -ENOMEM but its mems_allowed has been changed.
[PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner
[PATCH 2/4] cpuset: rewrite update_tasks_nodemask()
[PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned
[PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner
---
include/linux/cgroup.h | 1
kernel/cpuset.c | 151 +++++++++++++++++++------------------------------
2 files changed, 62 insertions(+), 90 deletions(-)
---
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
@ 2009-02-26 7:02 ` Li Zefan
2009-02-26 7:03 ` [PATCH 2/4] cpuset: rewrite update_tasks_nodemask() Li Zefan
` (3 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:02 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, LKML
We need to pass some data to test_task() or process_task() in some cases.
Will be used later.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
include/linux/cgroup.h | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9a40932..6ad1989 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -332,6 +332,7 @@ struct cgroup_scanner {
void (*process_task)(struct task_struct *p,
struct cgroup_scanner *scan);
struct ptr_heap *heap;
+ void *data;
};
/*
-- 1.5.4.rc3
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 2/4] cpuset: rewrite update_tasks_nodemask()
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
2009-02-26 7:02 ` [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner Li Zefan
@ 2009-02-26 7:03 ` Li Zefan
2009-02-26 7:04 ` [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned Li Zefan
` (2 subsequent siblings)
4 siblings, 0 replies; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:03 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, LKML
This patch uses cgroup_scan_tasks() to rebind tasks' vmas to new cpuset's
mems_allowed.
Not only simplify the code largely, but also avoid allocating an array
to hold mm pointers of all the tasks in the cpuset. This array can be big
(size > PAGESIZE) if we have lots of tasks in that cpuset, thus has a
chance to fail the allocation when under memory stress.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
kernel/cpuset.c | 109 ++++++++++++++++++++-----------------------------------
1 files changed, 39 insertions(+), 70 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 20c2db2..01a111f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
mutex_unlock(&callback_mutex);
}
+/*
+ * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+ struct cgroup_scanner *scan)
+{
+ struct mm_struct *mm;
+ struct cpuset *cs;
+ int migrate;
+ const nodemask_t *oldmem = scan->data;
+
+ mm = get_task_mm(p);
+ if (!mm)
+ return;
+
+ cs = cgroup_cs(scan->cg);
+ migrate = is_memory_migrate(cs);
+
+ mpol_rebind_mm(mm, &cs->mems_allowed);
+ if (migrate)
+ cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+ mmput(mm);
+}
+
static void *cpuset_being_rebound;
/**
@@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
*/
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
- struct task_struct *p;
- struct mm_struct **mmarray;
- int i, n, ntasks;
- int migrate;
- int fudge;
- struct cgroup_iter it;
int retval;
+ struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- fudge = 10; /* spare mmarray[] slots */
- fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
- retval = -ENOMEM;
-
- /*
- * Allocate mmarray[] to hold mm reference for each task
- * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
- * tasklist_lock. We could use GFP_ATOMIC, but with a
- * few more lines of code, we can retry until we get a big
- * enough mmarray[] w/o using GFP_ATOMIC.
- */
- while (1) {
- ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
- ntasks += fudge;
- mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
- if (!mmarray)
- goto done;
- read_lock(&tasklist_lock); /* block fork */
- if (cgroup_task_count(cs->css.cgroup) <= ntasks)
- break; /* got enough */
- read_unlock(&tasklist_lock); /* try again */
- kfree(mmarray);
- }
-
- n = 0;
-
- /* Load up mmarray[] with mm reference for each task in cpuset. */
- cgroup_iter_start(cs->css.cgroup, &it);
- while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
- struct mm_struct *mm;
-
- if (n >= ntasks) {
- printk(KERN_WARNING
- "Cpuset mempolicy rebind incomplete.\n");
- break;
- }
- mm = get_task_mm(p);
- if (!mm)
- continue;
- mmarray[n++] = mm;
- }
- cgroup_iter_end(cs->css.cgroup, &it);
- read_unlock(&tasklist_lock);
+ scan.cg = cs->css.cgroup;
+ scan.test_task = NULL;
+ scan.process_task = cpuset_change_nodemask;
+ scan.heap = NULL;
+ scan.data = (nodemask_t *)oldmem;
/*
- * Now that we've dropped the tasklist spinlock, we can
- * rebind the vma mempolicies of each mm in mmarray[] to their
- * new cpuset, and release that mm. The mpol_rebind_mm()
- * call takes mmap_sem, which we couldn't take while holding
- * tasklist_lock. Forks can happen again now - the mpol_dup()
- * cpuset_being_rebound check will catch such forks, and rebind
- * their vma mempolicies too. Because we still hold the global
- * cgroup_mutex, we know that no other rebind effort will
- * be contending for the global variable cpuset_being_rebound.
+ * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * take while holding tasklist_lock. Forks can happen - the
+ * mpol_dup() cpuset_being_rebound check will catch such forks,
+ * and rebind their vma mempolicies too. Because we still hold
+ * the global cgroup_mutex, we know that no other rebind effort
+ * will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- migrate = is_memory_migrate(cs);
- for (i = 0; i < n; i++) {
- struct mm_struct *mm = mmarray[i];
-
- mpol_rebind_mm(mm, &cs->mems_allowed);
- if (migrate)
- cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
- mmput(mm);
- }
+ retval = cgroup_scan_tasks(&scan);
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
- kfree(mmarray);
cpuset_being_rebound = NULL;
- retval = 0;
-done:
+
return retval;
}
-- 1.5.4.rc3
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
2009-02-26 7:02 ` [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner Li Zefan
2009-02-26 7:03 ` [PATCH 2/4] cpuset: rewrite update_tasks_nodemask() Li Zefan
@ 2009-02-26 7:04 ` Li Zefan
2009-02-26 7:04 ` [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner Li Zefan
2009-02-26 7:39 ` [PATCH 0/4] cpuset: cleanups and fixes KAMEZAWA Hiroyuki
4 siblings, 0 replies; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:04 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, LKML
When writing to cpuset.mems, cpuset has to update its mems_allowed before
calling update_tasks_nodemask(), but this function might return -ENOMEM.
To avoid this rare case, we allocate the memory before changing mems_allowed,
and then pass to update_tasks_nodemask(). Similar to what update_cpumask()
does.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
kernel/cpuset.c | 25 ++++++++++++++++---------
1 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 01a111f..3202364 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1057,13 +1057,15 @@ static void *cpuset_being_rebound;
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
* @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
*/
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+ struct ptr_heap *heap)
{
- int retval;
struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
@@ -1071,7 +1073,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
scan.cg = cs->css.cgroup;
scan.test_task = NULL;
scan.process_task = cpuset_change_nodemask;
- scan.heap = NULL;
+ scan.heap = heap;
scan.data = (nodemask_t *)oldmem;
/*
@@ -1084,12 +1086,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- retval = cgroup_scan_tasks(&scan);
+ cgroup_scan_tasks(&scan);
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
cpuset_being_rebound = NULL;
-
- return retval;
}
/*
@@ -1110,6 +1110,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
{
nodemask_t oldmem;
int retval;
+ struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1144,12 +1145,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (retval < 0)
+ goto done;
+
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
- retval = update_tasks_nodemask(cs, &oldmem);
+ update_tasks_nodemask(cs, &oldmem, &heap);
+
+ heap_free(&heap);
done:
return retval;
}
@@ -2002,7 +2009,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, &oldmems);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
-- 1.5.4.rc3
^ permalink raw reply related [flat|nested] 8+ messages in thread
* [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
` (2 preceding siblings ...)
2009-02-26 7:04 ` [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned Li Zefan
@ 2009-02-26 7:04 ` Li Zefan
2009-02-26 7:39 ` [PATCH 0/4] cpuset: cleanups and fixes KAMEZAWA Hiroyuki
4 siblings, 0 replies; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:04 UTC (permalink / raw)
To: Andrew Morton; +Cc: Paul Menage, LKML
Use cgroup_scanner.data, instead of introducing cpuset_hotplug_scanner.
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
kernel/cpuset.c | 23 +++++++++--------------
1 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3202364..a46d693 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}
-struct cpuset_hotplug_scanner {
- struct cgroup_scanner scan;
- struct cgroup *to;
-};
/* bits in struct cpuset flags field */
typedef enum {
@@ -1889,10 +1885,9 @@ int __init cpuset_init(void)
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
- struct cpuset_hotplug_scanner *chsp;
+ struct cgroup *new_cgroup = scan->data;
- chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
- cgroup_attach_task(chsp->to, tsk);
+ cgroup_attach_task(new_cgroup, tsk);
}
/**
@@ -1908,15 +1903,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
- struct cpuset_hotplug_scanner scan;
+ struct cgroup_scanner scan;
- scan.scan.cg = from->css.cgroup;
- scan.scan.test_task = NULL; /* select all tasks in cgroup */
- scan.scan.process_task = cpuset_do_move_task;
- scan.scan.heap = NULL;
- scan.to = to->css.cgroup;
+ scan.cg = from->css.cgroup;
+ scan.test_task = NULL; /* select all tasks in cgroup */
+ scan.process_task = cpuset_do_move_task;
+ scan.heap = NULL;
+ scan.data = to->css.cgroup;
- if (cgroup_scan_tasks(&scan.scan))
+ if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
-- 1.5.4.rc3
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH 0/4] cpuset: cleanups and fixes
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
` (3 preceding siblings ...)
2009-02-26 7:04 ` [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner Li Zefan
@ 2009-02-26 7:39 ` KAMEZAWA Hiroyuki
2009-02-26 7:58 ` Li Zefan
4 siblings, 1 reply; 8+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-02-26 7:39 UTC (permalink / raw)
To: Li Zefan; +Cc: Andrew Morton, Paul Menage, LKML
On Thu, 26 Feb 2009 15:02:15 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> This patchset mainly simplifies update_tasks_nodemask() and makes it
> return void (won't fail), thus fix a case that writing to cpuset.mems
> returns -ENOMEM but its mems_allowed has been changed.
>
> [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner
> [PATCH 2/4] cpuset: rewrite update_tasks_nodemask()
> [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned
> [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner
>
Thank you. I think there is no problematic in your patch.
After reading your patch, I wonder we can improve this migration by
filter for cgroup_scan_tasks().
Assume a task with tons of threads in a cgroup and
- change nodemask
- migrate=1
Under current implemntation, cpuset_migrate_mm=>do_migrate_mm is called per
threads. This implies there will be big overhead, useless scanning of page
tables.
Can we add some checks as "Is this mm already migrated ?", anywhere ?
(we may need to add some another array, again...)
Thanks,
-Kame
> ---
> include/linux/cgroup.h | 1
> kernel/cpuset.c | 151 +++++++++++++++++++------------------------------
> 2 files changed, 62 insertions(+), 90 deletions(-)
> ---
>
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 0/4] cpuset: cleanups and fixes
2009-02-26 7:39 ` [PATCH 0/4] cpuset: cleanups and fixes KAMEZAWA Hiroyuki
@ 2009-02-26 7:58 ` Li Zefan
2009-02-26 8:09 ` KAMEZAWA Hiroyuki
0 siblings, 1 reply; 8+ messages in thread
From: Li Zefan @ 2009-02-26 7:58 UTC (permalink / raw)
To: KAMEZAWA Hiroyuki; +Cc: Andrew Morton, Paul Menage, LKML
KAMEZAWA Hiroyuki wrote:
> On Thu, 26 Feb 2009 15:02:15 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
>
>> This patchset mainly simplifies update_tasks_nodemask() and makes it
>> return void (won't fail), thus fix a case that writing to cpuset.mems
>> returns -ENOMEM but its mems_allowed has been changed.
>>
>> [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner
>> [PATCH 2/4] cpuset: rewrite update_tasks_nodemask()
>> [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned
>> [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner
>>
> Thank you. I think there is no problematic in your patch.
>
> After reading your patch, I wonder we can improve this migration by
> filter for cgroup_scan_tasks().
>
> Assume a task with tons of threads in a cgroup and
> - change nodemask
> - migrate=1
>
> Under current implemntation, cpuset_migrate_mm=>do_migrate_mm is called per
> threads. This implies there will be big overhead, useless scanning of page
> tables.
>
> Can we add some checks as "Is this mm already migrated ?", anywhere ?
> (we may need to add some another array, again...)
>
cpuset's memory_migrate flag is disabled by default, and I guess migrating mm
in cpuset is not frequently used? I tend to not optimise it unless it becomes
a problem in real-life. :)
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH 0/4] cpuset: cleanups and fixes
2009-02-26 7:58 ` Li Zefan
@ 2009-02-26 8:09 ` KAMEZAWA Hiroyuki
0 siblings, 0 replies; 8+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-02-26 8:09 UTC (permalink / raw)
To: Li Zefan; +Cc: Andrew Morton, Paul Menage, LKML
On Thu, 26 Feb 2009 15:58:34 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:
> KAMEZAWA Hiroyuki wrote:
> > On Thu, 26 Feb 2009 15:02:15 +0800
> > Li Zefan <lizf@cn.fujitsu.com> wrote:
> >
> >> This patchset mainly simplifies update_tasks_nodemask() and makes it
> >> return void (won't fail), thus fix a case that writing to cpuset.mems
> >> returns -ENOMEM but its mems_allowed has been changed.
> >>
> >> [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner
> >> [PATCH 2/4] cpuset: rewrite update_tasks_nodemask()
> >> [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned
> >> [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner
> >>
> > Thank you. I think there is no problematic in your patch.
> >
> > After reading your patch, I wonder we can improve this migration by
> > filter for cgroup_scan_tasks().
> >
> > Assume a task with tons of threads in a cgroup and
> > - change nodemask
> > - migrate=1
> >
> > Under current implemntation, cpuset_migrate_mm=>do_migrate_mm is called per
> > threads. This implies there will be big overhead, useless scanning of page
> > tables.
> >
> > Can we add some checks as "Is this mm already migrated ?", anywhere ?
> > (we may need to add some another array, again...)
> >
>
> cpuset's memory_migrate flag is disabled by default, and I guess migrating mm
> in cpuset is not frequently used? I tend to not optimise it unless it becomes
> a problem in real-life. :)
>
Ok, I will do by myself.
-Kame
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2009-02-26 8:11 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-02-26 7:02 [PATCH 0/4] cpuset: cleanups and fixes Li Zefan
2009-02-26 7:02 ` [PATCH 1/4] cgroups: add 'data' field to struct cgroup_scanner Li Zefan
2009-02-26 7:03 ` [PATCH 2/4] cpuset: rewrite update_tasks_nodemask() Li Zefan
2009-02-26 7:04 ` [PATCH 3/4] cpuset: avoid changing cpuset's mems when errno returned Li Zefan
2009-02-26 7:04 ` [PATCH 4/4] cpuset: remove struct cpuset_hotplug_scanner Li Zefan
2009-02-26 7:39 ` [PATCH 0/4] cpuset: cleanups and fixes KAMEZAWA Hiroyuki
2009-02-26 7:58 ` Li Zefan
2009-02-26 8:09 ` KAMEZAWA Hiroyuki
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox