From: Miao Xie <miaox@cn.fujitsu.com>
To: Ingo Molnar <mingo@elte.hu>,
Andrew Morton <akpm@linux-foundation.org>,
Paul Menage <menage@google.com>
Cc: Linux-Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set
Date: Wed, 21 Jan 2009 16:06:20 +0800 [thread overview]
Message-ID: <4976D77C.3020107@cn.fujitsu.com> (raw)
The task still allocated the page caches on old node after modifying its
cpuset's mems when 'memory_spread_page' was set, it is caused by the old
mem_allowed_list of the task, the current kernel doesn't updates it unless some
function invokes cpuset_update_task_memory_state(), it is too late sometimes.
We must update the mem_allowed_list of the tasks in time.
Slab has the same problem.
We fixes the bug by updating tasks' mem_allowed_list and spread flag after
its cpuset's mems or spread flag is changed.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
include/linux/cpuset.h | 4 -
include/linux/sched.h | 1 -
init/main.c | 3 +-
kernel/cpuset.c | 204 +++++++++++++++--------------------------------
kernel/kthread.c | 1 +
mm/mempolicy.c | 12 ---
mm/page_alloc.c | 5 +-
7 files changed, 69 insertions(+), 161 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074..c8155a6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -17,7 +17,6 @@
extern int number_of_cpusets; /* How many cpusets are defined in system? */
-extern int cpuset_init_early(void);
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
@@ -26,7 +25,6 @@ extern void cpuset_cpus_allowed_locked(struct task_struct *p,
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
-void cpuset_update_task_memory_state(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
@@ -83,7 +81,6 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
#else /* !CONFIG_CPUSETS */
-static inline int cpuset_init_early(void) { return 0; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
@@ -105,7 +102,6 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
static inline void cpuset_init_current_mems_allowed(void) {}
-static inline void cpuset_update_task_memory_state(void) {}
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b8..2c9a93c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1332,7 +1332,6 @@ struct task_struct {
#endif
#ifdef CONFIG_CPUSETS
nodemask_t mems_allowed;
- int cpuset_mems_generation;
int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
diff --git a/init/main.c b/init/main.c
index 8442094..3ce3b5d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -635,7 +635,6 @@ asmlinkage void __init start_kernel(void)
#endif
vmalloc_init();
vfs_caches_init_early();
- cpuset_init_early();
page_cgroup_init();
mem_init();
enable_debug_pagealloc();
@@ -845,6 +844,8 @@ static int __init kernel_init(void * unused)
*/
init_pid_ns.child_reaper = current;
+ current->mems_allowed = node_possible_map;
+
cad_pid = task_pid(current);
smp_prepare_cpus(setup_max_cpus);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a856788..36436fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -89,12 +89,6 @@ struct cpuset {
struct cpuset *parent; /* my parent */
- /*
- * Copy of global cpuset_mems_generation as of the most
- * recent time this cpuset changed its mems_allowed.
- */
- int mems_generation;
-
struct fmeter fmeter; /* memory_pressure filter */
/* partition number for rebuild_sched_domains() */
@@ -172,27 +166,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-/*
- * Increment this integer everytime any cpuset changes its
- * mems_allowed value. Users of cpusets can track this generation
- * number, and avoid having to lock and reload mems_allowed unless
- * the cpuset they're using changes generation.
- *
- * A single, global generation is needed because cpuset_attach_task() could
- * reattach a task to a different cpuset, which must not have its
- * generation numbers aliased with those of that tasks previous cpuset.
- *
- * Generations are needed for mems_allowed because one task cannot
- * modify another's memory placement. So we must enable every task,
- * on every visit to __alloc_pages(), to efficiently check whether
- * its current->cpuset->mems_allowed has changed, requiring an update
- * of its current->mems_allowed.
- *
- * Since writes to cpuset_mems_generation are guarded by the cgroup lock
- * there is no need to mark it atomic.
- */
-static int cpuset_mems_generation;
-
static struct cpuset top_cpuset = {
.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
@@ -224,8 +197,8 @@ static struct cpuset top_cpuset = {
* If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
- * The task_struct fields mems_allowed and mems_generation may only
- * be accessed in the context of that task, so require no locks.
+ * The task_struct fields mems_allowed may only be accessed in the context
+ * of that task, so require no locks.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -327,77 +300,6 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
-/**
- * cpuset_update_task_memory_state - update task memory placement
- *
- * If the current tasks cpusets mems_allowed changed behind our
- * backs, update current->mems_allowed, mems_generation and task NUMA
- * mempolicy to the new value.
- *
- * Task mempolicy is updated by rebinding it relative to the
- * current->cpuset if a task has its memory placement changed.
- * Do not call this routine if in_interrupt().
- *
- * Call without callback_mutex or task_lock() held. May be
- * called with or without cgroup_mutex held. Thanks in part to
- * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex during
- * call.
- *
- * Reading current->cpuset->mems_generation doesn't need task_lock
- * to guard the current->cpuset derefence, because it is guarded
- * from concurrent freeing of current->cpuset using RCU.
- *
- * The rcu_dereference() is technically probably not needed,
- * as I don't actually mind if I see a new cpuset pointer but
- * an old value of mems_generation. However this really only
- * matters on alpha systems using cpusets heavily. If I dropped
- * that rcu_dereference(), it would save them a memory barrier.
- * For all other arch's, rcu_dereference is a no-op anyway, and for
- * alpha systems not using cpusets, another planned optimization,
- * avoiding the rcu critical section for tasks in the root cpuset
- * which is statically allocated, so can't vanish, will make this
- * irrelevant. Better to use RCU as intended, than to engage in
- * some cute trick to save a memory barrier that is impossible to
- * test, for alpha systems using cpusets heavily, which might not
- * even exist.
- *
- * This routine is needed to update the per-task mems_allowed data,
- * within the tasks context, when it is trying to allocate memory
- * (in various mm/mempolicy.c routines) and notices that some other
- * task has been modifying its cpuset.
- */
-
-void cpuset_update_task_memory_state(void)
-{
- int my_cpusets_mem_gen;
- struct task_struct *tsk = current;
- struct cpuset *cs;
-
- rcu_read_lock();
- my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
- rcu_read_unlock();
-
- if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
- mutex_lock(&callback_mutex);
- task_lock(tsk);
- cs = task_cs(tsk); /* Maybe changed when task not locked */
- guarantee_online_mems(cs, &tsk->mems_allowed);
- tsk->cpuset_mems_generation = cs->mems_generation;
- if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
- else
- tsk->flags &= ~PF_SPREAD_PAGE;
- if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
- else
- tsk->flags &= ~PF_SPREAD_SLAB;
- task_unlock(tsk);
- mutex_unlock(&callback_mutex);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- }
-}
-
/*
* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
*
@@ -990,14 +892,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
* migrating memory region.
- *
- * We call cpuset_update_task_memory_state() before hacking
- * our tasks mems_allowed, so that we are assured of being in
- * sync with our tasks cpuset, and in particular, callbacks to
- * cpuset_update_task_memory_state() from nested page allocations
- * won't see any mismatch of our cpuset and task mems_generation
- * values, so won't overwrite our hacked tasks mems_allowed
- * nodemask.
*/
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1005,8 +899,6 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct task_struct *tsk = current;
- cpuset_update_task_memory_state();
-
mutex_lock(&callback_mutex);
tsk->mems_allowed = *to;
mutex_unlock(&callback_mutex);
@@ -1076,6 +968,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
"Cpuset mempolicy rebind incomplete.\n");
break;
}
+ mutex_lock(&callback_mutex);
+ guarantee_online_mems(cs, &p->mems_allowed);
+ mutex_unlock(&callback_mutex);
+ mpol_rebind_task(p, &p->mems_allowed);
mm = get_task_mm(p);
if (!mm)
continue;
@@ -1118,10 +1014,9 @@ done:
/*
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
+ * cpusets mems_allowed, and for each task in the cpuset,
+ * rebind any vma mempolicies and if the cpuset is marked
+ * 'memory_migrate', migrate the tasks pages to the new memory.
*
* Call with cgroup_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1169,7 +1064,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
- cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
retval = update_tasks_nodemask(cs, &oldmem);
@@ -1197,6 +1091,33 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
return 0;
}
+static void cpuset_change_flag(struct task_struct *tsk,
+ struct cgroup_scanner *scan)
+{
+ struct cpuset *cs;
+
+ cs = task_cs(tsk);
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
+}
+
+static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
+{
+ struct cgroup_scanner scan;
+
+ scan.cg = cs->css.cgroup;
+ scan.test_task = NULL;
+ scan.process_task = cpuset_change_flag;
+ scan.heap = heap;
+ cgroup_scan_tasks(&scan);
+}
+
/*
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (see cpuset_flagbits_t)
@@ -1212,6 +1133,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
struct cpuset *trialcs;
int err;
int balance_flag_changed;
+ int spread_changed;
+ struct ptr_heap heap;
trialcs = alloc_trial_cpuset(cs);
if (!trialcs)
@@ -1226,9 +1149,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (err < 0)
goto out;
+ err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+ if (err)
+ goto out;
+
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
+ spread_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
+ || (is_spread_page(cs) != is_spread_page(trialcs)));
+
mutex_lock(&callback_mutex);
cs->flags = trialcs->flags;
mutex_unlock(&callback_mutex);
@@ -1236,6 +1166,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
async_rebuild_sched_domains();
+ if (spread_changed)
+ update_tasks_flags(cs, &heap);
+
+ heap_free(&heap);
out:
free_trial_cpuset(trialcs);
return err;
@@ -1374,15 +1308,29 @@ static void cpuset_attach(struct cgroup_subsys *ss,
if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
+ tsk->mems_allowed = node_possible_map;
} else {
mutex_lock(&callback_mutex);
guarantee_online_cpus(cs, cpus_attach);
+ guarantee_online_mems(cs,&tsk->mems_allowed);
mutex_unlock(&callback_mutex);
}
err = set_cpus_allowed_ptr(tsk, cpus_attach);
if (err)
return;
+ mutex_lock(&callback_mutex);
+ if (is_spread_page(cs))
+ tsk->flags |= PF_SPREAD_PAGE;
+ else
+ tsk->flags &= ~PF_SPREAD_PAGE;
+ if (is_spread_slab(cs))
+ tsk->flags |= PF_SPREAD_SLAB;
+ else
+ tsk->flags &= ~PF_SPREAD_SLAB;
+ mutex_unlock(&callback_mutex);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
+
from = oldcs->mems_allowed;
to = cs->mems_allowed;
mm = get_task_mm(tsk);
@@ -1444,11 +1392,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
- cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
@@ -1787,8 +1733,6 @@ static struct cgroup_subsys_state *cpuset_create(
struct cpuset *parent;
if (!cont->parent) {
- /* This is early initialization for the top cgroup */
- top_cpuset.mems_generation = cpuset_mems_generation++;
return &top_cpuset.css;
}
parent = cgroup_cs(cont->parent);
@@ -1800,7 +1744,6 @@ static struct cgroup_subsys_state *cpuset_create(
return ERR_PTR(-ENOMEM);
}
- cpuset_update_task_memory_state();
cs->flags = 0;
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1809,7 +1752,6 @@ static struct cgroup_subsys_state *cpuset_create(
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cs->mems_generation = cpuset_mems_generation++;
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
@@ -1828,8 +1770,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct cpuset *cs = cgroup_cs(cont);
- cpuset_update_task_memory_state();
-
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1850,21 +1790,6 @@ struct cgroup_subsys cpuset_subsys = {
.early_init = 1,
};
-/*
- * cpuset_init_early - just enough so that the calls to
- * cpuset_update_task_memory_state() in early init code
- * are harmless.
- */
-
-int __init cpuset_init_early(void)
-{
- alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
-
- top_cpuset.mems_generation = cpuset_mems_generation++;
- return 0;
-}
-
-
/**
* cpuset_init - initialize cpusets at system boot
*
@@ -1875,11 +1800,12 @@ int __init cpuset_init(void)
{
int err = 0;
+ if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
fmeter_init(&top_cpuset.fmeter);
- top_cpuset.mems_generation = cpuset_mems_generation++;
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456..90469e6 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -242,6 +242,7 @@ int kthreadd(void *unused)
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+ current->mems_allowed = node_possible_map;
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
for (;;) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3eb4a6f..5912b03 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -222,10 +222,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
policy->flags = flags;
if (nodes) {
- /*
- * cpuset related setup doesn't apply to local allocation
- */
- cpuset_update_task_memory_state();
if (flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&cpuset_context_nmask, nodes,
&cpuset_current_mems_allowed);
@@ -674,7 +670,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_task_memory_state();
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
@@ -1545,8 +1540,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
- cpuset_update_task_memory_state();
-
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1585,16 +1578,11 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
*
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
- if ((gfp & __GFP_WAIT) && !in_interrupt())
- cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5675b30..503219c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1573,10 +1573,7 @@ nofail_alloc:
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- /*
- * The task's cpuset might have expanded its set of allowable nodes
- */
- cpuset_update_task_memory_state();
+
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
--
1.6.0.3
next reply other threads:[~2009-01-21 8:08 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-01-21 8:06 Miao Xie [this message]
2009-01-21 8:30 ` [PATCH] cpuset: fix allocating page cache/slab object on the unallowed node when memory spread is set Nick Piggin
2009-01-21 10:41 ` Paul Menage
2009-02-03 3:05 ` Miao Xie
2009-01-27 22:42 ` Andrew Morton
2009-01-28 16:38 ` Christoph Lameter
2009-02-03 3:25 ` Miao Xie
2009-02-03 22:16 ` Andrew Morton
2009-02-03 22:49 ` Paul Menage
2009-02-04 9:31 ` Miao Xie
2009-02-06 19:19 ` Paul Menage
2009-02-09 4:02 ` Nick Piggin
2009-02-10 11:37 ` Paul Menage
2009-02-12 0:54 ` Nick Piggin
2009-02-12 1:19 ` Paul Menage
2009-02-12 1:55 ` Nick Piggin
2009-02-12 1:58 ` Paul Menage
2009-02-12 8:23 ` Miao Xie
2009-02-12 21:53 ` Paul Menage
2009-02-12 8:27 ` Miao Xie
2009-02-12 10:40 ` Nick Piggin
2009-02-12 5:57 ` Miao Xie
2009-02-12 11:06 ` Paul Jackson
2009-02-04 9:03 ` Miao Xie
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4976D77C.3020107@cn.fujitsu.com \
--to=miaox@cn.fujitsu.com \
--cc=akpm@linux-foundation.org \
--cc=linux-kernel@vger.kernel.org \
--cc=menage@google.com \
--cc=mingo@elte.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.