* [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Miao Xie @ 2010-03-08 10:10 UTC
To: David Rientjes, Lee Schermerhorn, Nick Piggin, Paul Menage
Cc: Linux-Kernel, Linux-MM
Changes from V1 to V2:
- Update task->mems_allowed lazily, instead of using a lock to protect it
Before applying this patch, cpuset updates task->mems_allowed by setting all
new bits in the nodemask first, and clearing all old disallowed bits later.
But in this way, the allocator is likely to see an empty nodemask.
The problem is the following:
The size of nodemask_t is greater than the size of a long integer, so loading
and storing a nodemask_t are not atomic operations. Suppose task->mems_allowed
does not intersect with new_mask; for example, the first word of the mask is
empty and only the first word of new_mask is non-empty. If the allocator
loads one word of the mask before
current->mems_allowed |= new_mask;
and then loads another word of the mask after
current->mems_allowed = new_mask;
the allocator gets an empty nodemask.
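To make the interleaving concrete, here is a minimal, standalone C sketch
(illustration only, not part of the patch) that models a two-word nodemask,
the old two-step update, and an allocator whose two word loads straddle it:
/* Illustration only -- not kernel code. Models the old two-step cpuset
 * update of a two-word nodemask and an allocator whose word-by-word
 * loads are interleaved with it, so it ends up seeing an empty mask. */
#include <stdio.h>
int main(void)
{
	unsigned long mems_allowed[2] = { 0x0UL, 0x1UL };	/* old mask: a node only in word 1 */
	const unsigned long new_mask[2] = { 0x1UL, 0x0UL };	/* new mask: a node only in word 0 */
	unsigned long seen[2];

	seen[0] = mems_allowed[0];		/* allocator loads word 0: still the old value (0) */

	mems_allowed[0] |= new_mask[0];		/* cpuset: current->mems_allowed |= new_mask */
	mems_allowed[1] |= new_mask[1];
	mems_allowed[0] = new_mask[0];		/* cpuset: current->mems_allowed = new_mask */
	mems_allowed[1] = new_mask[1];

	seen[1] = mems_allowed[1];		/* allocator loads word 1: already the new value (0) */

	printf("allocator saw %#lx %#lx (%s)\n", seen[0], seen[1],
	       (seen[0] | seen[1]) ? "non-empty" : "empty nodemask");
	return 0;
}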
Since changes to task->mems_allowed are not frequent, this patch uses two
generation counters as a tag to indicate whether task->mems_allowed needs to
be updated or not. Before setting the tag, cpuset caches the new mask of every
task in its task_struct.
When the allocator wants to access task->mems_allowed, it must check the tag
first. If the tag is set, the allocator enters the slow path and updates
task->mems_allowed.
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
include/linux/cpuset.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/init_task.h | 20 +++++++++++++++++++-
include/linux/sched.h | 41 +++++++++++++++++++++++++++++++++++++----
kernel/cpuset.c | 44 ++++++++++++++++++++------------------------
kernel/fork.c | 17 +++++++++++++++++
mm/filemap.c | 6 +++++-
mm/hugetlb.c | 2 ++
mm/mempolicy.c | 37 +++++++++++++++++--------------------
mm/slab.c | 5 +++++
mm/slub.c | 2 ++
10 files changed, 169 insertions(+), 50 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc..2eb0fa7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -93,6 +93,44 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
static inline void set_mems_allowed(nodemask_t nodemask)
{
current->mems_allowed = nodemask;
+ current->mems_allowed_for_update = nodemask;
+}
+
+#define task_mems_lock_irqsave(p, flags) \
+ do { \
+ spin_lock_irqsave(&p->mems_lock, flags); \
+ } while (0)
+
+#define task_mems_unlock_irqrestore(p, flags) \
+ do { \
+ spin_unlock_irqrestore(&p->mems_lock, flags); \
+ } while (0)
+
+#include <linux/mempolicy.h>
+/**
+ * cpuset_update_task_mems_allowed - update task memory placement
+ *
+ * If the current task's mems_allowed_for_update and mempolicy_for_update are
+ * changed by cpuset behind our backs, update current->mems_allowed,
+ * mems_generation and task NUMA mempolicy to the new value.
+ *
+ * Call WITHOUT mems_lock held.
+ *
+ * This routine is needed to update the per-task mems_allowed and mempolicy
+ * within the task's context, when it is trying to allocate memory.
+ */
+static __always_inline void cpuset_update_task_mems_allowed(void)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+
+ if (unlikely(tsk->mems_generation != tsk->mems_generation_for_update)) {
+ task_mems_lock_irqsave(tsk, flags);
+ tsk->mems_allowed = tsk->mems_allowed_for_update;
+ tsk->mems_generation = tsk->mems_generation_for_update;
+ task_mems_unlock_irqrestore(tsk, flags);
+ mpol_rebind_task(tsk, &tsk->mems_allowed);
+ }
}
#else /* !CONFIG_CPUSETS */
@@ -193,6 +231,13 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}
+static inline void cpuset_update_task_mems_allowed(void)
+{
+}
+
+#define task_mems_lock_irqsave(p, flags) do { (void)(flags); } while (0)
+
+#define task_mems_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index abec69b..be016f0 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -103,7 +103,7 @@ extern struct group_info init_groups;
extern struct cred init_cred;
#ifdef CONFIG_PERF_EVENTS
-# define INIT_PERF_EVENTS(tsk) \
+# define INIT_PERF_EVENTS(tsk) \
.perf_event_mutex = \
__MUTEX_INITIALIZER(tsk.perf_event_mutex), \
.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
@@ -111,6 +111,22 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#ifdef CONFIG_CPUSETS
+# define INIT_MEMS_ALLOWED(tsk) \
+ .mems_lock = __SPIN_LOCK_UNLOCKED(tsk.mems_lock), \
+ .mems_generation = 0, \
+ .mems_generation_for_update = 0,
+#else
+# define INIT_MEMS_ALLOWED(tsk)
+#endif
+
+#ifdef CONFIG_NUMA
+# define INIT_MEMPOLICY \
+ .mempolicy = NULL,
+#else
+# define INIT_MEMPOLICY
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +196,8 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_MEMS_ALLOWED(tsk) \
+ INIT_MEMPOLICY \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 46c6f8d..9e7f14f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,8 +1351,9 @@ struct task_struct {
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
+/*
+ * Protection of (de-)allocation: mm, files, fs, tty, keyrings
+ */
spinlock_t alloc_lock;
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -1420,8 +1421,36 @@ struct task_struct {
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
- nodemask_t mems_allowed; /* Protected by alloc_lock */
+ /*
+ * It is unnecessary to protect mems_allowed, because it can only be
+ * loaded and stored by the current task itself
+ */
+ nodemask_t mems_allowed;
int cpuset_mem_spread_rotor;
+
+ /* Protection of ->mems_allowed_for_update */
+ spinlock_t mems_lock;
+ /*
+ * This variable (mems_allowed_for_update) is just used for caching
+ * memory placement information.
+ *
+ * ->mems_allowed is used by the kernel allocator.
+ */
+ nodemask_t mems_allowed_for_update; /* Protected by mems_lock */
+
+ /*
+ * Increment this integer every time ->mems_allowed_for_update is
+ * changed by cpuset. The task can compare this number with mems_generation,
+ * and if they are not the same, mems_allowed_for_update has changed and
+ * ->mems_allowed must be updated. In this way, tasks can avoid having
+ * to lock and reload mems_allowed_for_update unless it is changed.
+ */
+ int mems_generation_for_update;
+ /*
+ * After updating mems_allowed, set mems_generation to
+ * mems_generation_for_update.
+ */
+ int mems_generation;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock */
@@ -1443,7 +1472,11 @@ struct task_struct {
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
- struct mempolicy *mempolicy; /* Protected by alloc_lock */
+ /*
+ * It is unnecessary to protect mempolicy, because it can only be
+ * loaded/stored by the current task itself.
+ */
+ struct mempolicy *mempolicy;
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f36e577..ff6d76b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -34,7 +34,6 @@
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
-#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/module.h>
@@ -201,9 +200,9 @@ static struct cpuset top_cpuset = {
* If a task is only holding callback_mutex, then it has read-only
* access to cpusets.
*
- * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
+ * Now, the task_struct field mems_allowed_for_update is used to cache
+ * the new mems information; we use mems_lock in the task_struct to
+ * protect it.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -939,29 +938,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
/*
- * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
+ * cpuset_change_task_nodemask - change task's mems_allowed
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
- *
- * Called with task's alloc_lock held
+ * Called with the task's mems_lock held and interrupts disabled.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- mpol_rebind_task(tsk, newmems);
- tsk->mems_allowed = *newmems;
+ tsk->mems_allowed_for_update = *newmems;
+
+ tsk->mems_generation_for_update++;
}
/*
- * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
- * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * Update the task's mems_allowed and its vmas' mempolicies to the
+ * cpuset's new mems_allowed, and migrate pages to new nodes if the
+ * memory_migrate flag is set. Called with cgroup_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
@@ -970,6 +964,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
+ unsigned long flags;
NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
if (!newmems)
@@ -978,9 +973,9 @@ static void cpuset_change_nodemask(struct task_struct *p,
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems);
- task_lock(p);
+ task_mems_lock_irqsave(p, flags);
cpuset_change_task_nodemask(p, newmems);
- task_unlock(p);
+ task_mems_unlock_irqrestore(p, flags);
NODEMASK_FREE(newmems);
@@ -1041,9 +1036,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* Handle user request to change the 'mems' memory placement
* of a cpuset. Needs to validate the request, update the
* cpusets mems_allowed, and for each task in the cpuset,
- * update mems_allowed and rebind task's mempolicy and any vma
- * mempolicies and if the cpuset is marked 'memory_migrate',
- * migrate the tasks pages to the new memory.
+ * update mems_allowed and any vma mempolicies and if the
+ * cpuset is marked 'memory_migrate', migrate the task's pages
+ * to the new memory.
*
* Call with cgroup_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1375,6 +1370,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
struct cpuset *cs)
{
+ unsigned long flags;
int err;
/*
* can_attach beforehand should guarantee that this doesn't fail.
@@ -1383,9 +1379,9 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- task_lock(tsk);
+ task_mems_lock_irqsave(tsk, flags);
cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
+ task_mems_unlock_irqrestore(tsk, flags);
cpuset_update_task_spread_flag(cs, tsk);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b0ec34a..a5c581d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
@@ -1095,6 +1096,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+#ifdef CONFIG_CPUSETS
+ spin_lock_init(&p->mems_lock);
+ p->mems_generation_for_update = 0;
+ p->mems_generation = 0;
+#endif
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1307,6 +1313,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
proc_fork_connector(p);
cgroup_post_fork(p);
perf_event_fork(p);
+#ifdef CONFIG_CPUSETS
+ /*
+ * Check whether p's cpuset changed its mems between dup_task_struct()
+ * and cgroup_post_fork().
+ */
+ if (unlikely(current->mems_generation_for_update
+ != current->mems_generation)) {
+ p->mems_allowed = cpuset_mems_allowed(current);
+ mpol_rebind_task(p, &p->mems_allowed);
+ }
+#endif
return p;
bad_fork_free_pid:
diff --git a/mm/filemap.c b/mm/filemap.c
index 045b31c..595f5cc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -461,8 +461,12 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
struct page *__page_cache_alloc(gfp_t gfp)
{
+ int n;
+
if (cpuset_do_page_mem_spread()) {
- int n = cpuset_mem_spread_node();
+ cpuset_update_task_mems_allowed();
+
+ n = cpuset_mem_spread_node();
return alloc_pages_exact_node(n, gfp, 0);
}
return alloc_pages(gfp, 0);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..a19865c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1837,6 +1837,8 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
int node;
unsigned int nr = 0;
+ cpuset_update_task_mems_allowed();
+
for_each_node_mask(node, cpuset_current_mems_allowed)
nr += array[node];
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bda230e..6d4cdb7 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -190,8 +190,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* parameter with respect to the policy mode and flags. But, we need to
* handle an empty nodemask with MPOL_PREFERRED here.
*
- * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * May be called holding the mmap_semaphore for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -201,6 +200,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
+
+ cpuset_update_task_mems_allowed();
+
/* Check N_HIGH_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
@@ -665,10 +667,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
*/
if (mm)
down_write(&mm->mmap_sem);
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
@@ -680,7 +680,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
- task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
@@ -693,8 +692,6 @@ out:
/*
* Return nodemask for policy for get_mempolicy() query
- *
- * Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
@@ -740,6 +737,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
+ cpuset_update_task_mems_allowed();
+
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
@@ -748,9 +747,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
- task_lock(current);
*nmask = cpuset_current_mems_allowed;
- task_unlock(current);
return 0;
}
@@ -805,11 +802,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
}
err = 0;
- if (nmask) {
- task_lock(current);
+ if (nmask)
get_policy_nodemask(pol, nmask);
- task_unlock(current);
- }
out:
mpol_cond_put(pol);
@@ -1054,9 +1048,7 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
down_write(&mm->mmap_sem);
- task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
up_write(&mm->mmap_sem);
} else
@@ -1576,6 +1568,8 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
{
struct zonelist *zl;
+ cpuset_update_task_mems_allowed();
+
*mpol = get_vma_policy(current, vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
@@ -1614,6 +1608,8 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
if (!(mask && current->mempolicy))
return false;
+ cpuset_update_task_mems_allowed();
+
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
@@ -1678,9 +1674,12 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy *pol;
struct zonelist *zl;
+ cpuset_update_task_mems_allowed();
+
+ pol = get_vma_policy(current, vma, addr);
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1727,6 +1726,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ cpuset_update_task_mems_allowed();
+
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
@@ -2007,9 +2008,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
return; /* no valid nodemask intersection */
}
- task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
- task_unlock(current);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret) {
NODEMASK_SCRATCH_FREE(scratch);
@@ -2241,9 +2240,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
- task_lock(current);
ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
} else
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
diff --git a/mm/slab.c b/mm/slab.c
index a9f325b..bb13050 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3147,6 +3147,9 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
+
+ cpuset_update_task_mems_allowed();
+
nid_alloc = nid_here = numa_node_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node();
@@ -3178,6 +3181,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
if (flags & __GFP_THISNODE)
return NULL;
+ cpuset_update_task_mems_allowed();
+
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
diff --git a/mm/slub.c b/mm/slub.c
index 0bfd386..a2d02b6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1368,6 +1368,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
+ cpuset_update_task_mems_allowed();
+
zonelist = node_zonelist(slab_node(current->mempolicy), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
--
1.6.5.2
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: David Rientjes @ 2010-03-08 21:46 UTC
To: Miao Xie; +Cc: Lee Schermerhorn, Nick Piggin, Paul Menage, Linux-Kernel,
Linux-MM
On Mon, 8 Mar 2010, Miao Xie wrote:
> Changes from V1 to V2:
> - Update task->mems_allowed lazily, instead of using a lock to protect it
>
> Before applying this patch, cpuset updates task->mems_allowed by setting all
> new bits in the nodemask first, and clearing all old unallowed bits later.
> But in the way, the allocator is likely to see an empty nodemask.
>
Likely? It's probably rarer than one in a million.
> The problem is following:
> The size of nodemask_t is greater than the size of long integer, so loading
> and storing of nodemask_t are not atomic operations. If task->mems_allowed
> don't intersect with new_mask, such as the first word of the mask is empty
> and only the first word of new_mask is not empty. When the allocator
> loads a word of the mask before
>
> current->mems_allowed |= new_mask;
>
> and then loads another word of the mask after
>
> current->mems_allowed = new_mask;
>
> the allocator gets an empty nodemask.
>
> Considering the change of task->mems_allowed is not frequent, so in this patch,
> I use two variables as a tag to indicate whether task->mems_allowed need be
> update or not. And before setting the tag, cpuset caches the new mask of every
> task at its task_struct.
>
So what exactly is the benefit of 58568d2 from last June that caused this
issue to begin with? It seems like this entire patchset is a revert of
that commit. So why shouldn't we just revert that one commit and then add
the locking and updating necessary for configs where
MAX_NUMNODES > BITS_PER_LONG on top?
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a5740fc..2eb0fa7 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -93,6 +93,44 @@ extern void cpuset_print_task_mems_allowed(struct task_struct *p);
> static inline void set_mems_allowed(nodemask_t nodemask)
> {
> current->mems_allowed = nodemask;
> + current->mems_allowed_for_update = nodemask;
> +}
> +
> +#define task_mems_lock_irqsave(p, flags) \
> + do { \
> + spin_lock_irqsave(&p->mems_lock, flags); \
> + } while (0)
> +
> +#define task_mems_unlock_irqrestore(p, flags) \
> + do { \
> + spin_unlock_irqrestore(&p->mems_lock, flags); \
> + } while (0)
We don't need mems_lock at all for 99% of the machines running Linux where
the update can be done atomically, so these need to be redefined to be a
no-op for those users.
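For illustration, a variant keyed on MAX_NUMNODES (a sketch only, not from the
posted patch) could compile the lock away on single-word configs:
#if MAX_NUMNODES > BITS_PER_LONG
/* multi-word nodemask: stores can be torn, so take the spinlock */
#define task_mems_lock_irqsave(p, flags) \
	spin_lock_irqsave(&(p)->mems_lock, flags)
#define task_mems_unlock_irqrestore(p, flags) \
	spin_unlock_irqrestore(&(p)->mems_lock, flags)
#else
/* single-word nodemask: loads/stores are atomic, no lock needed */
#define task_mems_lock_irqsave(p, flags)	do { (void)(flags); } while (0)
#define task_mems_unlock_irqrestore(p, flags)	do { (void)(flags); } while (0)
#endif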
> +
> +#include <linux/mempolicy.h>
#includes should be at the top of include/linux/cpuset.h.
> +/**
> + * cpuset_update_task_mems_allowed - update task memory placement
> + *
> + * If the current task's mems_allowed_for_update and mempolicy_for_update are
> + * changed by cpuset behind our backs, update current->mems_allowed,
> + * mems_generation and task NUMA mempolicy to the new value.
> + *
> + * Call WITHOUT mems_lock held.
> + *
> + * This routine is needed to update the pre-task mems_allowed and mempolicy
> + * within the tasks context, when it is trying to allocate memory.
> + */
> +static __always_inline void cpuset_update_task_mems_allowed(void)
> +{
> + struct task_struct *tsk = current;
> + unsigned long flags;
> +
> + if (unlikely(tsk->mems_generation != tsk->mems_generation_for_update)) {
> + task_mems_lock_irqsave(tsk, flags);
> + tsk->mems_allowed = tsk->mems_allowed_for_update;
> + tsk->mems_generation = tsk->mems_generation_for_update;
> + task_mems_unlock_irqrestore(tsk, flags);
By this synchronization, you're guaranteeing that no other kernel code
ever reads tsk->mems_allowed when tsk != current? Otherwise, you're
simply protecting the store to tsk->mems_allowed here and not serializing
on the loads that can return empty nodemasks.
> + mpol_rebind_task(tsk, &tsk->mems_allowed);
> + }
> }
>
> #else /* !CONFIG_CPUSETS */
> @@ -193,6 +231,13 @@ static inline void set_mems_allowed(nodemask_t nodemask)
> {
> }
>
> +static inline void cpuset_update_task_mems_allowed(void)
> +{
> +}
> +
> +#define task_mems_lock_irqsave(p, flags) do { (void)(flags); } while (0)
> +
> +#define task_mems_unlock_irqrestore(p, flags) do { (void)(flags); } while (0)
> #endif /* !CONFIG_CPUSETS */
>
> #endif /* _LINUX_CPUSET_H */
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index abec69b..be016f0 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -103,7 +103,7 @@ extern struct group_info init_groups;
> extern struct cred init_cred;
>
> #ifdef CONFIG_PERF_EVENTS
> -# define INIT_PERF_EVENTS(tsk) \
> +# define INIT_PERF_EVENTS(tsk) \
Probably not intended.
> .perf_event_mutex = \
> __MUTEX_INITIALIZER(tsk.perf_event_mutex), \
> .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
> @@ -111,6 +111,22 @@ extern struct cred init_cred;
> # define INIT_PERF_EVENTS(tsk)
> #endif
>
> +#ifdef CONFIG_CPUSETS
> +# define INIT_MEMS_ALLOWED(tsk) \
> + .mems_lock = __SPIN_LOCK_UNLOCKED(tsk.mems_lock), \
> + .mems_generation = 0, \
> + .mems_generation_for_update = 0,
> +#else
> +# define INIT_MEMS_ALLOWED(tsk)
> +#endif
> +
> +#ifdef CONFIG_NUMA
> +# define INIT_MEMPOLICY \
> + .mempolicy = NULL,
This has never been needed before.
> +#else
> +# define INIT_MEMPOLICY
> +#endif
> +
> /*
> * INIT_TASK is used to set up the first task table, touch at
> * your own risk!. Base=0, limit=0x1fffff (=2MB)
> @@ -180,6 +196,8 @@ extern struct cred init_cred;
> INIT_FTRACE_GRAPH \
> INIT_TRACE_RECURSION \
> INIT_TASK_RCU_PREEMPT(tsk) \
> + INIT_MEMS_ALLOWED(tsk) \
> + INIT_MEMPOLICY \
> }
>
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 46c6f8d..9e7f14f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1351,8 +1351,9 @@ struct task_struct {
> /* Thread group tracking */
> u32 parent_exec_id;
> u32 self_exec_id;
> -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
> - * mempolicy */
> +/*
> + * Protection of (de-)allocation: mm, files, fs, tty, keyrings
> + */
> spinlock_t alloc_lock;
>
> #ifdef CONFIG_GENERIC_HARDIRQS
> @@ -1420,8 +1421,36 @@ struct task_struct {
> cputime_t acct_timexpd; /* stime + utime since last update */
> #endif
> #ifdef CONFIG_CPUSETS
> - nodemask_t mems_allowed; /* Protected by alloc_lock */
> + /*
> + * It is unnecessary to protect mems_allowed, because it only can be
> + * loaded and stored by current task's self
> + */
> + nodemask_t mems_allowed;
> int cpuset_mem_spread_rotor;
> +
> + /* Protection of ->mems_allowed_for_update */
> + spinlock_t mems_lock;
> + /*
> + * This variable(mems_allowed_for_update) are just used for caching
> + * memory placement information.
> + *
> + * ->mems_allowed are used by the kernel allocator.
> + */
> + nodemask_t mems_allowed_for_update; /* Protected by mems_lock */
Another nodemask_t in struct task_struct for this? And for all configs,
including those that can do atomic updates to mems_allowed?
> +
> + /*
> + * Increment this integer everytime ->mems_allowed_for_update is
> + * changed by cpuset. Task can compare this number with mems_generation,
> + * and if they are not the same, mems_allowed_for_update is changed and
> + * ->mems_allowed must be updated. In this way, tasks can avoid having
> + * to lock and reload mems_allowed_for_update unless it is changed.
> + */
> + int mems_generation_for_update;
> + /*
> + * After updating mems_allowed, set mems_generation to
> + * mems_generation_for_update.
> + */
> + int mems_generation;
I don't see why you need two mems_generation numbers, one should belong in
the task's cpuset. Then you can compare tsk->mems_generation to
task_cs(tsk)->mems_generation at cpuset_update_task_memory_state() if you
set tsk->mems_generation = task_cs(tsk)->mems_generation on
cpuset_attach() or update_nodemask().
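A rough sketch of that alternative (hypothetical: it assumes a mems_generation
field is added back to struct cpuset, and it glosses over the locking that
guarantee_online_mems() needs):
static void cpuset_update_task_memory_state(void)
{
	struct task_struct *tsk = current;
	struct cpuset *cs;

	rcu_read_lock();
	cs = task_cs(tsk);		/* the task's cpuset, under RCU */
	if (unlikely(tsk->mems_generation != cs->mems_generation)) {
		/* the cpuset's mems changed since we last looked: resync */
		tsk->mems_generation = cs->mems_generation;
		guarantee_online_mems(cs, &tsk->mems_allowed);
		mpol_rebind_task(tsk, &tsk->mems_allowed);
	}
	rcu_read_unlock();
}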
> #endif
> #ifdef CONFIG_CGROUPS
> /* Control Group info protected by css_set_lock */
> @@ -1443,7 +1472,11 @@ struct task_struct {
> struct list_head perf_event_list;
> #endif
> #ifdef CONFIG_NUMA
> - struct mempolicy *mempolicy; /* Protected by alloc_lock */
> + /*
> + * It is unnecessary to protect mempolicy, because it only can be
> + * loaded/stored by current task's self.
> + */
> + struct mempolicy *mempolicy;
That's going to change soon since my oom killer rewrite protects
tsk->mempolicy under task_lock().
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Miao Xie @ 2010-03-09 7:25 UTC
To: David Rientjes
Cc: Lee Schermerhorn, Nick Piggin, Paul Menage, Linux-Kernel,
Linux-MM
on 2010-3-9 5:46, David Rientjes wrote:
[snip]
>> Considering the change of task->mems_allowed is not frequent, so in this patch,
>> I use two variables as a tag to indicate whether task->mems_allowed need be
>> update or not. And before setting the tag, cpuset caches the new mask of every
>> task at its task_struct.
>>
>
> So what exactly is the benefit of 58568d2 from last June that caused this
> issue to begin with? It seems like this entire patchset is a revert of
> that commit. So why shouldn't we just revert that one commit and then add
> the locking and updating necessary for configs where
> MAX_NUMNODES > BITS_PER_LONG on top?
I was worried about the consistency of task->mempolicy with task->mems_allowed for
configs where MAX_NUMNODES <= BITS_PER_LONG.
The problem that I was worried about is the following:
When the kernel allocator allocates pages for a task, it accesses task->mempolicy
first to get the allowed node, then checks whether that node is allowed by
task->mems_allowed.
But without this patch, ->mempolicy and ->mems_allowed are not updated at the same
time. The kernel allocator may see inconsistent ->mempolicy and ->mems_allowed,
such as when the allocator gets the allowed node from the old mempolicy but checks
whether that node is allowed by the new mems_allowed, which doesn't intersect the
old mempolicy.
So I made this patchset.
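A standalone C sketch of the interleaving described above (illustration only;
plain bitmasks stand in for the mempolicy's node set and for mems_allowed):
#include <stdio.h>
int main(void)
{
	unsigned long policy_nodes = 0x3;	/* stale mempolicy: interleave over nodes 0-1 */
	unsigned long mems_allowed = 0xc;	/* already-updated mems_allowed: nodes 2-3 */

	/* allocator picks a node from the (old) mempolicy ... */
	int nid = __builtin_ctzl(policy_nodes);

	/* ... then filters it against the (new) mems_allowed */
	if (!(mems_allowed & (1UL << nid)))
		printf("node %d rejected: stale mempolicy vs. new mems_allowed\n", nid);
	return 0;
}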
>> +/**
>> + * cpuset_update_task_mems_allowed - update task memory placement
>> + *
>> + * If the current task's mems_allowed_for_update and mempolicy_for_update are
>> + * changed by cpuset behind our backs, update current->mems_allowed,
>> + * mems_generation and task NUMA mempolicy to the new value.
>> + *
>> + * Call WITHOUT mems_lock held.
>> + *
>> + * This routine is needed to update the pre-task mems_allowed and mempolicy
>> + * within the tasks context, when it is trying to allocate memory.
>> + */
>> +static __always_inline void cpuset_update_task_mems_allowed(void)
>> +{
>> + struct task_struct *tsk = current;
>> + unsigned long flags;
>> +
>> + if (unlikely(tsk->mems_generation != tsk->mems_generation_for_update)) {
>> + task_mems_lock_irqsave(tsk, flags);
>> + tsk->mems_allowed = tsk->mems_allowed_for_update;
>> + tsk->mems_generation = tsk->mems_generation_for_update;
>> + task_mems_unlock_irqrestore(tsk, flags);
>
> By this synchronization, you're guaranteeing that no other kernel code
> ever reads tsk->mems_allowed when tsk != current? Otherwise, you're
> simply protecting the store to tsk->mems_allowed here and not serializing
> on the loads that can return empty nodemasks.
I guarantee that no other kernel code changes tsk->mems_allowed when tsk != current,
so every task is safe to read its own tsk->mems_allowed without a lock.
I will use mems_lock to protect it when another task reads it.
>> + /* Protection of ->mems_allowed_for_update */
>> + spinlock_t mems_lock;
>> + /*
>> + * This variable(mems_allowed_for_update) are just used for caching
>> + * memory placement information.
>> + *
>> + * ->mems_allowed are used by the kernel allocator.
>> + */
>> + nodemask_t mems_allowed_for_update; /* Protected by mems_lock */
>
> Another nodemask_t in struct task_struct for this? And for all configs,
> including those that can do atomic updates to mems_allowed?
Yes, for all configs.
>
>> +
>> + /*
>> + * Increment this integer everytime ->mems_allowed_for_update is
>> + * changed by cpuset. Task can compare this number with mems_generation,
>> + * and if they are not the same, mems_allowed_for_update is changed and
>> + * ->mems_allowed must be updated. In this way, tasks can avoid having
>> + * to lock and reload mems_allowed_for_update unless it is changed.
>> + */
>> + int mems_generation_for_update;
>> + /*
>> + * After updating mems_allowed, set mems_generation to
>> + * mems_generation_for_update.
>> + */
>> + int mems_generation;
>
> I don't see why you need two mems_generation numbers, one should belong in
> the task's cpuset. Then you can compare tsk->mems_generation to
> task_cs(tsk)->mems_generation at cpuset_update_task_memory_state() if you
> set tsk->mems_generation = task_cs(tsk)->mems_generation on
> cpuset_attach() or update_nodemask().
In this way, we must use rcu_read_lock() to protect the task's cpuset, and
performance will slow down a little, even though the overhead of an RCU read
lock is very small.
Thanks!
Miao
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Nick Piggin @ 2010-03-11 8:15 UTC
To: Miao Xie
Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM
On Tue, Mar 09, 2010 at 03:25:54PM +0800, Miao Xie wrote:
> on 2010-3-9 5:46, David Rientjes wrote:
> [snip]
> >> Considering the change of task->mems_allowed is not frequent, so in this patch,
> >> I use two variables as a tag to indicate whether task->mems_allowed need be
> >> update or not. And before setting the tag, cpuset caches the new mask of every
> >> task at its task_struct.
> >>
> >
> > So what exactly is the benefit of 58568d2 from last June that caused this
> > issue to begin with? It seems like this entire patchset is a revert of
> > that commit. So why shouldn't we just revert that one commit and then add
> > the locking and updating necessary for configs where
> > MAX_NUMNODES > BITS_PER_LONG on top?
>
> I worried about the consistency of task->mempolicy with task->mems_allowed for
> configs where MAX_NUMNODES <= BITS_PER_LONG.
>
> The problem that I worried is fowllowing:
> When the kernel allocator allocates pages for tasks, it will access task->mempolicy
> first and get the allowed node, then check whether that node is allowed by
> task->mems_allowed.
>
> But, Without this patch, ->mempolicy and ->mems_allowed is not updated at the same
> time. the kernel allocator may access the inconsistent information of ->mempolicy
> and ->mems_allowed, sush as the allocator gets the allowed node from old mempolicy,
> but checks whether that node is allowed by new mems_allowed which does't intersect
> old mempolicy.
>
> So I made this patchset.
I like your focus on keeping the hotpath light, but it is getting a bit
crazy. I wonder if it wouldn't be better just to teach those places that
matter to retry on finding an inconsistent nodemask? The only failure
case to worry about is getting an empty nodemask, isn't it?
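One possible shape for that retry (a sketch, not existing kernel code): since a
task's real mems_allowed is never empty, an empty snapshot can only come from a
torn read, so the caller simply copies again. Returning a nodemask by value is
for illustration only.
static nodemask_t cpuset_mems_snapshot(void)
{
	nodemask_t nodes;

	do {
		nodes = current->mems_allowed;	/* may be torn on multi-word masks */
	} while (unlikely(nodes_empty(nodes)));	/* empty => inconsistent, try again */

	return nodes;
}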
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Miao Xie @ 2010-03-11 10:33 UTC
To: Nick Piggin
Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM
on 2010-3-11 16:15, Nick Piggin wrote:
> On Tue, Mar 09, 2010 at 03:25:54PM +0800, Miao Xie wrote:
>> on 2010-3-9 5:46, David Rientjes wrote:
>> [snip]
>>>> Considering the change of task->mems_allowed is not frequent, so in this patch,
>>>> I use two variables as a tag to indicate whether task->mems_allowed need be
>>>> update or not. And before setting the tag, cpuset caches the new mask of every
>>>> task at its task_struct.
>>>>
>>>
>>> So what exactly is the benefit of 58568d2 from last June that caused this
>>> issue to begin with? It seems like this entire patchset is a revert of
>>> that commit. So why shouldn't we just revert that one commit and then add
>>> the locking and updating necessary for configs where
>>> MAX_NUMNODES > BITS_PER_LONG on top?
>>
>> I worried about the consistency of task->mempolicy with task->mems_allowed for
>> configs where MAX_NUMNODES <= BITS_PER_LONG.
>>
>> The problem that I worried is fowllowing:
>> When the kernel allocator allocates pages for tasks, it will access task->mempolicy
>> first and get the allowed node, then check whether that node is allowed by
>> task->mems_allowed.
>>
>> But, Without this patch, ->mempolicy and ->mems_allowed is not updated at the same
>> time. the kernel allocator may access the inconsistent information of ->mempolicy
>> and ->mems_allowed, sush as the allocator gets the allowed node from old mempolicy,
>> but checks whether that node is allowed by new mems_allowed which does't intersect
>> old mempolicy.
>>
>> So I made this patchset.
>
> I like your focus on keeping the hotpath light, but it is getting a bit
> crazy. I wonder if it wouldn't be better just to teach those places that
> matter to retry on finding an inconsistent nodemask? The only failure
> case to worry about is getting an empty nodemask, isn't it?
>
OK, I will try to make a new patch using a seqlock.
Miao
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Nick Piggin @ 2010-03-11 11:03 UTC
To: Miao Xie
Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM
On Thu, Mar 11, 2010 at 06:33:02PM +0800, Miao Xie wrote:
> on 2010-3-11 16:15, Nick Piggin wrote:
> > On Tue, Mar 09, 2010 at 03:25:54PM +0800, Miao Xie wrote:
> >> on 2010-3-9 5:46, David Rientjes wrote:
> >> [snip]
> >>>> Considering the change of task->mems_allowed is not frequent, so in this patch,
> >>>> I use two variables as a tag to indicate whether task->mems_allowed need be
> >>>> update or not. And before setting the tag, cpuset caches the new mask of every
> >>>> task at its task_struct.
> >>>>
> >>>
> >>> So what exactly is the benefit of 58568d2 from last June that caused this
> >>> issue to begin with? It seems like this entire patchset is a revert of
> >>> that commit. So why shouldn't we just revert that one commit and then add
> >>> the locking and updating necessary for configs where
> >>> MAX_NUMNODES > BITS_PER_LONG on top?
> >>
> >> I worried about the consistency of task->mempolicy with task->mems_allowed for
> >> configs where MAX_NUMNODES <= BITS_PER_LONG.
> >>
> >> The problem that I worried is fowllowing:
> >> When the kernel allocator allocates pages for tasks, it will access task->mempolicy
> >> first and get the allowed node, then check whether that node is allowed by
> >> task->mems_allowed.
> >>
> >> But, Without this patch, ->mempolicy and ->mems_allowed is not updated at the same
> >> time. the kernel allocator may access the inconsistent information of ->mempolicy
> >> and ->mems_allowed, sush as the allocator gets the allowed node from old mempolicy,
> >> but checks whether that node is allowed by new mems_allowed which does't intersect
> >> old mempolicy.
> >>
> >> So I made this patchset.
> >
> > I like your focus on keeping the hotpath light, but it is getting a bit
> > crazy. I wonder if it wouldn't be better just to teach those places that
> > matter to retry on finding an inconsistent nodemask? The only failure
> > case to worry about is getting an empty nodemask, isn't it?
> >
>
> Ok, I try to make a new patch by using seqlock.
Well... I do think seqlocks would be a bit simpler because they don't
require this checking and synchronizing of this patch.
But you are right: on non-x86 architectures seqlocks would probably be
more costly than your patch in the fastpaths. Unless you can avoid
using the seqlock in fastpaths and just have callers handle the rare
case of an empty nodemask.
cpuset_node_allowed_*wall doesn't need anything because it is just
interested in one bit in the mask.
cpuset_mem_spread_node doesn't matter because it will loop around and
try again if it doesn't find any nodes online.
cpuset_mems_allowed seems totally broken anyway
etc.
This approach might take a little more work, but I think it might be the
best way.
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
From: Miao Xie @ 2010-03-25 10:23 UTC
To: Nick Piggin, David Rientjes, Lee Schermerhorn, Paul Menage,
Andrew Morton
Cc: Linux-Kernel, Linux-MM
on 2010-3-11 19:03, Nick Piggin wrote:
> Well... I do think seqlocks would be a bit simpler because they don't
> require this checking and synchronizing of this patch.
Hi, Nick Piggin
I have made a new patch which uses a seqlock to protect mems_allowed and mempolicy.
Please review it.
title: [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed
Before applying this patch, cpuset updates task->mems_allowed by setting all
new bits in the nodemask first, and clearing all old disallowed bits later.
But in this way, the allocator can see an empty nodemask, though it is infrequent.
The problem is the following:
The size of nodemask_t is greater than the size of a long integer, so loading
and storing a nodemask_t are not atomic operations. Suppose task->mems_allowed
does not intersect with new_mask; for example, the first word of the mask is
empty and only the first word of new_mask is non-empty. If the allocator
loads one word of the mask before
current->mems_allowed |= new_mask;
and then loads another word of the mask after
current->mems_allowed = new_mask;
the allocator gets an empty nodemask.
Besides that, if the size of nodemask_t is less than the size of a long integer,
there is another problem. When the kernel allocator invokes the following function,
struct zoneref *next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes,
					struct zone **zone)
{
	/*
	 * Find the next suitable zone to use for the allocation.
	 * Only filter based on nodemask if it's set
	 */
	if (likely(nodes == NULL))
		......
	else
		while (zonelist_zone_idx(z) > highest_zoneidx ||
				(z->zone && !zref_in_nodemask(z, nodes)))
			z++;
	*zone = zonelist_zone(z);
	return z;
}
If we change the nodemask between two calls of zref_in_nodemask(), for example:
	Task1						Task2
	zref_in_nodemask(z = node0's z, nodes = 1-2)
	zref_in_nodemask returns 0
							nodes = 0
	zref_in_nodemask(z = node1's z, nodes = 0)
	zref_in_nodemask returns 0
	...
then z will overflow.
When the kernel allocator accesses task->mempolicy, there is the same problem.
The following method is used to fix these two problems:
A seqlock is used to protect the task's mempolicy and mems_allowed for configs where
MAX_NUMNODES > BITS_PER_LONG. When the kernel allocator accesses the nodemask, it
takes the seqlock and gets a copy of the nodemask, then passes that copy to the
memory allocating function.
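The reader side of this is the usual seqlock copy-and-retry loop; roughly what
the mems_fastpath_* macros below expand to (helper name hypothetical):
static inline nodemask_t task_mems_snapshot(struct task_struct *tsk)
{
	nodemask_t nodes;
	unsigned long flags, seq;

	do {
		seq = read_seqbegin_irqsave(&tsk->mems_seqlock, flags);
		nodes = tsk->mems_allowed;	/* copy the whole mask */
	} while (read_seqretry_irqrestore(&tsk->mems_seqlock, seq, flags));

	return nodes;	/* a consistent copy, passed on to the allocator */
}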
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
include/linux/cpuset.h | 79 +++++++++++++++++++++++++--
include/linux/init_task.h | 8 +++
include/linux/sched.h | 17 +++++--
kernel/cpuset.c | 94 +++++++++++++++++++++++++-------
kernel/exit.c | 4 ++
kernel/fork.c | 4 ++
mm/hugetlb.c | 22 +++++++-
mm/mempolicy.c | 133 ++++++++++++++++++++++++++++++++++-----------
mm/slab.c | 24 +++++++-
mm/slub.c | 9 +++-
10 files changed, 326 insertions(+), 68 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc..e307f89 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -53,8 +53,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
}
-extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2);
+extern int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2);
#define cpuset_memory_pressure_bump() \
do { \
@@ -90,9 +90,68 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+# if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Used to protect task->mempolicy and mems_allowed when reading them for
+ * page allocation.
+ *
+ * We don't care if the kernel page allocator allocates a page on a node in
+ * the old mems_allowed; that isn't a big deal, especially since it was
+ * previously allowed.
+ *
+ * We only worry about whether the kernel page allocator gets an empty
+ * mems_allowed or not.
+ * If MAX_NUMNODES <= BITS_PER_LONG, loading/storing task->mems_allowed are
+ * atomic operations, so we needn't do anything to protect the loading of
+ * task->mems_allowed in fastpaths.
+ *
+ * If MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed are
+ * not atomic operations, so we use a seqlock to protect the loading of
+ * task->mems_allowed in fastpaths.
+ */
+#define mems_fastpath_lock_irqsave(p, flags) \
+ ({ \
+ read_seqbegin_irqsave(&p->mems_seqlock, flags); \
+ })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
+ ({ \
+ read_seqretry_irqrestore(&p->mems_seqlock, seq, flags); \
+ })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { \
+ write_seqlock_irqsave(&p->mems_seqlock, flags); \
+ } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { \
+ write_sequnlock_irqrestore(&p->mems_seqlock, flags); \
+ } while (0)
+# else
+#define mems_fastpath_lock_irqsave(p, flags) ({ (void)(flags); 0; })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) ({ (void)(flags); 0; })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { \
+ task_lock(p); \
+ (void)(flags); \
+ } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { \
+ task_unlock(p); \
+ (void)(flags); \
+ } while (0)
+# endif
+
static inline void set_mems_allowed(nodemask_t nodemask)
{
+ unsigned long flags;
+ mems_slowpath_lock_irqsave(current, flags);
current->mems_allowed = nodemask;
+ mems_slowpath_unlock_irqrestore(current, flags);
}
#else /* !CONFIG_CPUSETS */
@@ -144,8 +203,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
return 1;
}
-static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2)
+static inline int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2)
{
return 1;
}
@@ -193,6 +252,18 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}
+#define mems_fastpath_lock_irqsave(p, flags) \
+ ({ (void)(flags); 0; })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
+ ({ (void)(flags); 0; })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { (void)(flags); } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { (void)(flags); } while (0)
+
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1ed6797..0394e20 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -102,6 +102,13 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+# define INIT_MEM_SEQLOCK(tsk) \
+ .mems_seqlock = __SEQLOCK_UNLOCKED(tsk.mems_seqlock),
+#else
+# define INIT_MEM_SEQLOCK(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +178,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_MEM_SEQLOCK(tsk) \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84b8c22..1cf5fd3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1356,8 +1356,9 @@ struct task_struct {
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings.
+ * If MAX_NUMNODES <= BITS_PER_LONG, it also protects mems_allowed and mempolicy;
+ * otherwise the seqlock mems_seqlock protects them. */
spinlock_t alloc_lock;
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -1425,7 +1426,13 @@ struct task_struct {
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
- nodemask_t mems_allowed; /* Protected by alloc_lock */
+# if MAX_NUMNODES > BITS_PER_LONG
+ /* Protection of mems_allowed, and mempolicy */
+ seqlock_t mems_seqlock;
+# endif
+ /* if MAX_NUMNODES <= BITS_PER_LONG, Protected by alloc_lock;
+ * else Protected by mems_seqlock */
+ nodemask_t mems_allowed;
int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
@@ -1448,7 +1455,9 @@ struct task_struct {
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
- struct mempolicy *mempolicy; /* Protected by alloc_lock */
+ /* if MAX_NUMNODES <= BITS_PER_LONG, Protected by alloc_lock;
+ * else Protected by mems_seqlock */
+ struct mempolicy *mempolicy;
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d109467..8a658c5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -198,12 +198,13 @@ static struct cpuset top_cpuset = {
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
- * access to cpusets.
+ * If a task is only holding callback_mutex or cgroup_mutex, then it has
+ * read-only access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
+ * by another task; we use alloc_lock (if MAX_NUMNODES <= BITS_PER_LONG) or
+ * mems_seqlock (if MAX_NUMNODES > BITS_PER_LONG) in the task_struct fields
+ * to protect them.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -920,6 +921,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* call to guarantee_online_mems(), as we know no one is changing
* our task's cpuset.
*
+ * As the above comment says, no one can change the current task's mems_allowed
+ * except the task itself, so we needn't hold a lock to protect its mems_allowed
+ * during this call.
+ *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -947,15 +952,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones.
*
- * Called with task's alloc_lock held
+ * Called with mems_slowpath_lock held
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
+ mpol_rebind_task(tsk, newmems);
}
/*
@@ -970,6 +973,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
+ unsigned long flags;
NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
if (!newmems)
@@ -978,9 +982,9 @@ static void cpuset_change_nodemask(struct task_struct *p,
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems);
- task_lock(p);
+ mems_slowpath_lock_irqsave(p, flags);
cpuset_change_task_nodemask(p, newmems);
- task_unlock(p);
+ mems_slowpath_unlock_irqrestore(p, flags);
NODEMASK_FREE(newmems);
@@ -1375,6 +1379,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
struct cpuset *cs)
{
+ unsigned long flags;
int err;
/*
* can_attach beforehand should guarantee that this doesn't fail.
@@ -1383,9 +1388,10 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- task_lock(tsk);
+ mems_slowpath_lock_irqsave(tsk, flags);
cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
+ mems_slowpath_unlock_irqrestore(tsk, flags);
+
cpuset_update_task_spread_flag(cs, tsk);
}
@@ -2233,7 +2239,15 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
*/
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
- return nodes_intersects(*nodemask, current->mems_allowed);
+ unsigned long flags, seq;
+ int retval;
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ retval = nodes_intersects(*nodemask, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ return retval;
}
/*
@@ -2314,11 +2328,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags, seq;
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
- if (node_isset(node, current->mems_allowed))
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ allowed = node_isset(node, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ if (allowed)
return 1;
/*
* Allow tasks that have access to memory reserves because they have
@@ -2369,9 +2390,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
*/
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
+ int allowed;
+ unsigned long flags, seq;
+
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- if (node_isset(node, current->mems_allowed))
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ allowed = node_isset(node, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ if (allowed)
return 1;
/*
* Allow tasks that have access to memory reserves because they have
@@ -2438,10 +2468,18 @@ void cpuset_unlock(void)
int cpuset_mem_spread_node(void)
{
int node;
+ unsigned long flags, seq;
+ /* Used for allocating memory, so can't use NODEMASK_ALLOC */
+ nodemask_t nodes;
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ nodes = current->mems_allowed;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ node = next_node(current->cpuset_mem_spread_rotor, nodes);
if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
+ node = first_node(nodes);
current->cpuset_mem_spread_rotor = node;
return node;
}
@@ -2458,10 +2496,26 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
* to the other.
**/
-int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2)
+int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2)
{
- return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+ unsigned long flags1, flags2;
+ int retval;
+ struct task_struct *tsk;
+
+ if (tsk1 > tsk2) {
+ tsk = tsk1;
+ tsk1 = tsk2;
+ tsk2 = tsk;
+ }
+
+ mems_slowpath_lock_irqsave(tsk1, flags1);
+ mems_slowpath_lock_irqsave(tsk2, flags2);
+ retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+ mems_slowpath_unlock_irqrestore(tsk2, flags2);
+ mems_slowpath_unlock_irqrestore(tsk1, flags1);
+
+ return retval;
}
/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 7b012a0..cbf045d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -16,6 +16,7 @@
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
@@ -649,6 +650,7 @@ static void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
+ unsigned long flags;
mm_release(tsk, mm);
if (!mm)
@@ -694,8 +696,10 @@ static void exit_mm(struct task_struct * tsk)
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
#ifdef CONFIG_NUMA
+ mems_slowpath_lock_irqsave(tsk, flags);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
+ mems_slowpath_unlock_irqrestore(tsk, flags);
#endif
task_unlock(tsk);
mm_update_next_owner(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index fe73f8d..591346a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
@@ -1075,6 +1076,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->io_context = NULL;
p->audit_context = NULL;
cgroup_fork(p);
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+ seqlock_init(&p->mems_seqlock);
+#endif
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..523f0f9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,6 +465,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
+ nodemask_t tmp_mask;
+ unsigned long seq, irqflag;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone;
@@ -483,6 +485,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
return NULL;
+ if (mpol == current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflag);
+ tmp_mask = *nodemask;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflag));
+ nodemask = &tmp_mask;
+ }
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
@@ -1835,10 +1846,15 @@ __setup("default_hugepagesz=", hugetlb_default_setup);
static unsigned int cpuset_mems_nr(unsigned int *array)
{
int node;
- unsigned int nr = 0;
+ unsigned int nr;
+ unsigned long flags, seq;
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ do {
+ nr = 0;
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
return nr;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dd3f5c5..2a42e8c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -187,8 +187,10 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* parameter with respect to the policy mode and flags. But, we need to
* handle an empty nodemask with MPOL_PREFERRED here.
*
- * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * Must be called using
+ * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
+ * to protect task's mems_allowed and mempolicy. May also be called holding
+ * the mmap_semaphore for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -344,9 +346,13 @@ static void mpol_rebind_policy(struct mempolicy *pol,
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
- * Called with task's alloc_lock held.
+ * If task->pol == NULL, it returns -1 to tell us that it is unnecessary to
+ * rebind the task's mempolicy.
+ *
+ * Must be called under
+ * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
+ * to protect it.
*/
-
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mpol_rebind_policy(tsk->mempolicy, new);
@@ -644,6 +650,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
NODEMASK_SCRATCH(scratch);
+ unsigned long irqflags;
int ret;
if (!scratch)
@@ -662,10 +669,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
*/
if (mm)
down_write(&mm->mmap_sem);
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
@@ -677,7 +684,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (mm)
up_write(&mm->mmap_sem);
@@ -691,7 +698,9 @@ out:
/*
* Return nodemask for policy for get_mempolicy() query
*
- * Called with task's alloc_lock held
+ * Must be called under mems_slowpath_lock_irqsave()/
+ * mems_slowpath_unlock_irqrestore()
+ * to protect it.
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
@@ -736,6 +745,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
+ unsigned long irqflags;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -745,9 +755,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
- task_lock(current);
+
+ mems_slowpath_lock_irqsave(current, irqflags);
*nmask = cpuset_current_mems_allowed;
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
return 0;
}
@@ -803,13 +814,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
err = 0;
if (nmask) {
+ mems_slowpath_lock_irqsave(current, irqflags);
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
- task_lock(current);
get_policy_nodemask(pol, nmask);
- task_unlock(current);
}
+ mems_slowpath_unlock_irqrestore(current, irqflags);
}
out:
@@ -1008,6 +1019,7 @@ static long do_mbind(unsigned long start, unsigned long len,
struct mempolicy *new;
unsigned long end;
int err;
+ unsigned long irqflags;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
@@ -1055,9 +1067,9 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
down_write(&mm->mmap_sem);
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (err)
up_write(&mm->mmap_sem);
} else
@@ -1408,8 +1420,10 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
} else if (vma->vm_policy)
pol = vma->vm_policy;
}
+
if (!pol)
pol = &default_policy;
+
return pol;
}
@@ -1574,16 +1588,29 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
nodemask_t **nodemask)
{
struct zonelist *zl;
+ struct mempolicy policy;
+ struct mempolicy *pol;
+ unsigned long seq, irqflag;
*mpol = get_vma_policy(current, vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
- zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+ pol = *mpol;
+ if (pol == current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflag);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflag));
+ pol = &policy;
+ }
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ zl = node_zonelist(interleave_nid(pol, vma, addr,
huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
- zl = policy_zonelist(gfp_flags, *mpol);
- if ((*mpol)->mode == MPOL_BIND)
+ zl = policy_zonelist(gfp_flags, pol);
+ if (pol->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
return zl;
@@ -1609,11 +1636,14 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
int nid;
+ unsigned long irqflags;
if (!(mask && current->mempolicy))
return false;
+ mems_slowpath_lock_irqsave(current, irqflags);
mempolicy = current->mempolicy;
+
switch (mempolicy->mode) {
case MPOL_PREFERRED:
if (mempolicy->flags & MPOL_F_LOCAL)
@@ -1633,6 +1663,8 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
BUG();
}
+ mems_slowpath_unlock_irqrestore(current, irqflags);
+
return true;
}
#endif
@@ -1722,7 +1754,18 @@ struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy policy;
struct zonelist *zl;
+ struct page *page;
+ unsigned long seq, iflags;
+
+ if (pol == current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, iflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
+ pol = &policy;
+ }
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1736,15 +1779,16 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
- zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl,
+ policy_nodemask(gfp, pol));
__mpol_put(pol);
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ return page;
}
/**
@@ -1761,26 +1805,37 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
- *
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct mempolicy policy;
+ struct page *page;
+ unsigned long seq, irqflags;
+
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
-
+ else {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflags));
+ pol = &policy;
+ }
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol),
+ policy_nodemask(gfp, pol));
+
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -2026,6 +2081,7 @@ restart:
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
+ unsigned long irqflags;
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
@@ -2043,9 +2099,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
if (IS_ERR(new))
goto put_free; /* no valid nodemask intersection */
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret)
goto put_free;
@@ -2200,6 +2256,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
+ unsigned long irqflags;
int err = 1;
if (nodelist) {
@@ -2291,9 +2348,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
} else
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -2487,8 +2544,10 @@ int show_numa_map(struct seq_file *m, void *v)
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct mempolicy *pol;
+ struct mempolicy policy;
int n;
char buffer[50];
+ unsigned long iflags, seq;
if (!mm)
return 0;
@@ -2498,6 +2557,14 @@ int show_numa_map(struct seq_file *m, void *v)
return 0;
pol = get_vma_policy(priv->task, vma, vma->vm_start);
+ if (pol == current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, iflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
+ pol = &policy;
+ }
+
mpol_to_str(buffer, sizeof(buffer), pol, 0);
mpol_cond_put(pol);
diff --git a/mm/slab.c b/mm/slab.c
index 09f1572..18e84a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3282,14 +3282,24 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
{
int nid_alloc, nid_here;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
+
nid_alloc = nid_here = numa_node_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node();
- else if (current->mempolicy)
- nid_alloc = slab_node(current->mempolicy);
+ else if (current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+
+ nid_alloc = slab_node(&mpol);
+ }
+
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3312,11 +3322,19 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
if (flags & __GFP_THISNODE)
return NULL;
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+
+ zonelist = node_zonelist(slab_node(&mpol), flags);
+
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry:
diff --git a/mm/slub.c b/mm/slub.c
index b364844..cd29f48 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1345,6 +1345,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
struct page *page;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1368,7 +1370,12 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+
+ zonelist = node_zonelist(slab_node(&mpol), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
--
1.6.5.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
2010-03-25 10:23 ` Miao Xie
@ 2010-03-25 12:56 ` Miao Xie
0 siblings, 0 replies; 14+ messages in thread
From: Miao Xie @ 2010-03-25 12:56 UTC (permalink / raw)
To: Nick Piggin, David Rientjes, Lee Schermerhorn, Paul Menage,
Andrew Morton
Cc: Linux-Kernel, Linux-MM
on 2010-3-25 18:23, Miao Xie wrote:
> on 2010-3-11 19:03, Nick Piggin wrote:
>> Well... I do think seqlocks would be a bit simpler because they don't
>> require this checking and synchronizing of this patch.
>
> Hi, Nick Piggin
>
> I have made a new patch which uses seqlock to protect mems_allowed and mempolicy.
> please review it.
>
> title: [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed
>
Sorry! Please ignore this patch, because I sent an old version. I'll send the new one later.
Regards!
Miao
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily)
2010-03-11 11:03 ` Nick Piggin
2010-03-25 10:23 ` Miao Xie
@ 2010-03-25 13:33 ` Miao Xie
2010-03-28 5:30 ` Bob Liu
2010-03-31 19:42 ` Andrew Morton
2010-03-31 9:54 ` [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily Miao Xie
2 siblings, 2 replies; 14+ messages in thread
From: Miao Xie @ 2010-03-25 13:33 UTC (permalink / raw)
To: Nick Piggin
Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM, Andrew Morton
on 2010-3-11 19:03, Nick Piggin wrote:
>> Ok, I try to make a new patch by using seqlock.
>
> Well... I do think seqlocks would be a bit simpler because they don't
> require this checking and synchronizing of this patch.
Hi, Nick Piggin
I have made a new patch which uses seqlock to protect mems_allowed and mempolicy.
Please review it.
Subject: [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2)
Before applying this patch, cpuset updates task->mems_allowed by setting all
new bits in the nodemask first, and clearing all old unallowed bits later.
But during the update, the allocator can see an empty nodemask, though this is infrequent.
The problem is the following:
When the size of nodemask_t is greater than the size of a long integer, loading
and storing a nodemask_t are not atomic operations. Suppose task->mems_allowed
does not intersect with new_mask; for example, the first word of the old mask is
empty and only the first word of new_mask is non-empty. When the allocator
loads one word of the mask before
current->mems_allowed |= new_mask;
and then loads another word of the mask after
current->mems_allowed = new_mask;
the allocator gets an empty nodemask.
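To make the interleaving concrete, here is a minimal user-space sketch (not kernel code: the two-word mask, the node layout and the variable names are hypothetical, and 64-bit longs are assumed) that reproduces the torn read described above:

	#include <stdio.h>

	/* Old mask: only a node in the second word. New mask: only node 0. */
	static unsigned long mems_allowed[2] = { 0x0UL, 0x1UL };
	static const unsigned long new_mask[2] = { 0x1UL, 0x0UL };

	int main(void)
	{
		/* Allocator reads word 0 before the update starts: sees 0. */
		unsigned long seen_w0 = mems_allowed[0];

		/* Cpuset update: mems_allowed |= new_mask; then mems_allowed = new_mask. */
		mems_allowed[0] |= new_mask[0];
		mems_allowed[1] |= new_mask[1];
		mems_allowed[0] = new_mask[0];
		mems_allowed[1] = new_mask[1];

		/* Allocator reads word 1 after the update finishes: sees 0. */
		unsigned long seen_w1 = mems_allowed[1];

		/* The allocator has observed { 0, 0 }: an empty nodemask. */
		printf("allocator saw %#lx %#lx\n", seen_w0, seen_w1);
		return 0;
	}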
Besides that, if the size of nodemask_t is less than the size of a long integer,
there is another problem: when the kernel allocator invokes the following function,
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
if (likely(nodes == NULL))
......
else
while (zonelist_zone_idx(z) > highest_zoneidx ||
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
*zone = zonelist_zone(z);
return z;
}
if the nodemask changes between two calls of zref_in_nodemask(), for example:
	Task1                                           Task2
	zref_in_nodemask(z = node0's z, nodes = 1-2)
	  returns 0
	                                                nodes = 0
	zref_in_nodemask(z = node1's z, nodes = 0)
	  returns 0
z will overflow.
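The same hazard can be shown with a user-space sketch (purely hypothetical: a toy zoneref array and a one-word mask stand in for the kernel's zonelist; in the real loop quoted above the patch description says z can run past the end). Every real entry gets rejected even though some node was allowed at every moment, so the walk falls straight through to the end marker:

	#include <stdio.h>

	struct zoneref { int nid; int valid; };	/* valid == 0 marks the end entry */

	static struct zoneref zonelist[] = { { 0, 1 }, { 1, 1 }, { 2, 1 }, { 0, 0 } };
	static unsigned long nodes = 0x6UL;	/* Task1 starts with nodes 1-2 allowed */

	static struct zoneref *next_zone(struct zoneref *z)
	{
		while (z->valid && !((nodes >> z->nid) & 1UL)) {
			if (z->nid == 0)
				nodes = 0x1UL;	/* Task2 shrinks the mask to node 0 only */
			z++;
		}
		return z;
	}

	int main(void)
	{
		struct zoneref *z = next_zone(zonelist);

		/* Nodes 0, 1 and 2 were rejected in turn; the walk only stopped
		 * at the end marker, although some node was allowed at every
		 * point in time. */
		printf("stopped at nid=%d valid=%d\n", z->nid, z->valid);
		return 0;
	}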
When the kernel allocator accesses task->mempolicy, there is the same problem.
The following method is used to fix these two problems.
A seqlock is used to protect the task's mempolicy and mems_allowed for configs
where MAX_NUMNODES > BITS_PER_LONG. When the kernel allocator accesses the
nodemask, it enters the seqlock read side, takes a copy of the nodemask, and
passes that copy to the memory-allocating function.
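The read side that the patch open-codes at every call site (through the mems_fastpath_* macros below) is the standard seqlock retry pattern. As a condensed sketch (a hypothetical helper, not something the patch itself adds; it assumes the mems_seqlock field introduced here and kernel context with <linux/seqlock.h>, <linux/nodemask.h> and <linux/sched.h>):

	static nodemask_t mems_allowed_snapshot(struct task_struct *tsk)
	{
		nodemask_t copy;
		unsigned long flags, seq;

		do {
			seq = read_seqbegin_irqsave(&tsk->mems_seqlock, flags);
			copy = tsk->mems_allowed;	/* multi-word copy may tear, hence the retry */
		} while (read_seqretry_irqrestore(&tsk->mems_seqlock, seq, flags));

		return copy;	/* consistent snapshot the allocator can scan safely */
	}

Writers wrap the nodemask/mempolicy update in write_seqlock_irqsave()/write_sequnlock_irqrestore(), which is what the mems_slowpath_* macros expand to when MAX_NUMNODES > BITS_PER_LONG.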
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
include/linux/cpuset.h | 79 +++++++++++++++++++++++-
include/linux/init_task.h | 8 +++
include/linux/sched.h | 17 ++++-
kernel/cpuset.c | 97 +++++++++++++++++++++++-------
kernel/exit.c | 4 +
kernel/fork.c | 4 +
mm/hugetlb.c | 22 ++++++-
mm/mempolicy.c | 144 ++++++++++++++++++++++++++++++++++-----------
mm/slab.c | 26 +++++++-
mm/slub.c | 12 ++++-
10 files changed, 341 insertions(+), 72 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc..e307f89 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -53,8 +53,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
}
-extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2);
+extern int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2);
#define cpuset_memory_pressure_bump() \
do { \
@@ -90,9 +90,68 @@ extern void rebuild_sched_domains(void);
extern void cpuset_print_task_mems_allowed(struct task_struct *p);
+# if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Used to protect task->mempolicy and mems_allowed when reading them for
+ * page allocation.
+ *
+ * We don't care if the kernel page allocator allocates a page on a node in
+ * the old mems_allowed; that isn't a big deal, especially since the node
+ * was previously allowed.
+ *
+ * We only worry about the kernel page allocator getting an empty
+ * mems_allowed.
+ *
+ * If MAX_NUMNODES <= BITS_PER_LONG, loading/storing task->mems_allowed are
+ * atomic operations, so we needn't do anything to protect the loading of
+ * task->mems_allowed in the fastpaths.
+ *
+ * If MAX_NUMNODES > BITS_PER_LONG, they are not atomic operations, so we use
+ * a seqlock to protect the loading of task->mems_allowed in the fastpaths.
+ */
+#define mems_fastpath_lock_irqsave(p, flags) \
+ ({ \
+ read_seqbegin_irqsave(&p->mems_seqlock, flags); \
+ })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
+ ({ \
+ read_seqretry_irqrestore(&p->mems_seqlock, seq, flags); \
+ })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { \
+ write_seqlock_irqsave(&p->mems_seqlock, flags); \
+ } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { \
+ write_sequnlock_irqrestore(&p->mems_seqlock, flags); \
+ } while (0)
+# else
+#define mems_fastpath_lock_irqsave(p, flags) ({ (void)(flags); 0; })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
+ ({ (void)(seq); (void)(flags); 0; })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { \
+ task_lock(p); \
+ (void)(flags); \
+ } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { \
+ task_unlock(p); \
+ (void)(flags); \
+ } while (0)
+# endif
+
static inline void set_mems_allowed(nodemask_t nodemask)
{
+ unsigned long flags;
+ mems_slowpath_lock_irqsave(current, flags);
current->mems_allowed = nodemask;
+ mems_slowpath_unlock_irqrestore(current, flags);
}
#else /* !CONFIG_CPUSETS */
@@ -144,8 +203,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
return 1;
}
-static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2)
+static inline int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2)
{
return 1;
}
@@ -193,6 +252,18 @@ static inline void set_mems_allowed(nodemask_t nodemask)
{
}
+#define mems_fastpath_lock_irqsave(p, flags) \
+ ({ (void)(flags); 0; })
+
+#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
+ ({ (void)(flags); 0; })
+
+#define mems_slowpath_lock_irqsave(p, flags) \
+ do { (void)(flags); } while (0)
+
+#define mems_slowpath_unlock_irqrestore(p, flags) \
+ do { (void)(flags); } while (0)
+
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1ed6797..0394e20 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -102,6 +102,13 @@ extern struct cred init_cred;
# define INIT_PERF_EVENTS(tsk)
#endif
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+# define INIT_MEM_SEQLOCK(tsk) \
+ .mems_seqlock = __SEQLOCK_UNLOCKED(tsk.mems_seqlock),
+#else
+# define INIT_MEM_SEQLOCK(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +178,7 @@ extern struct cred init_cred;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_MEM_SEQLOCK(tsk) \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84b8c22..1cf5fd3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1356,8 +1356,9 @@ struct task_struct {
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings.
+ * If MAX_NUMNODES <= BITS_PER_LONG, it also protects mems_allowed and
+ * mempolicy; otherwise the seqlock mems_seqlock protects them. */
spinlock_t alloc_lock;
#ifdef CONFIG_GENERIC_HARDIRQS
@@ -1425,7 +1426,13 @@ struct task_struct {
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
- nodemask_t mems_allowed; /* Protected by alloc_lock */
+# if MAX_NUMNODES > BITS_PER_LONG
+ /* Protection of mems_allowed, and mempolicy */
+ seqlock_t mems_seqlock;
+# endif
+ /* If MAX_NUMNODES <= BITS_PER_LONG, protected by alloc_lock;
+ * else protected by mems_seqlock */
+ nodemask_t mems_allowed;
int cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
@@ -1448,7 +1455,9 @@ struct task_struct {
struct list_head perf_event_list;
#endif
#ifdef CONFIG_NUMA
- struct mempolicy *mempolicy; /* Protected by alloc_lock */
+ /* If MAX_NUMNODES <= BITS_PER_LONG, protected by alloc_lock;
+ * else protected by mems_seqlock */
+ struct mempolicy *mempolicy;
short il_next;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d109467..52e6f51 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -198,12 +198,13 @@ static struct cpuset top_cpuset = {
* from one of the callbacks into the cpuset code from within
* __alloc_pages().
*
- * If a task is only holding callback_mutex, then it has read-only
- * access to cpusets.
+ * If a task is only holding callback_mutex or cgroup_mutex, then it has
+ * read-only access to cpusets.
*
* Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
+ * by another task; we use alloc_lock (if MAX_NUMNODES <= BITS_PER_LONG) or
+ * mems_seqlock (if MAX_NUMNODES > BITS_PER_LONG) in the task_struct
+ * to protect them.
*
* The cpuset_common_file_read() handlers only hold callback_mutex across
* small pieces of code, such as when reading out possibly multi-word
@@ -920,6 +921,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* call to guarantee_online_mems(), as we know no one is changing
* our task's cpuset.
*
+ * As the above comment says, no one can change the current task's mems_allowed
+ * except the task itself, so we needn't hold a lock to protect mems_allowed
+ * during this call.
+ *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -947,15 +952,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
* we structure updates as setting all new allowed nodes, then clearing newly
* disallowed ones.
*
- * Called with task's alloc_lock held
+ * Called with mems_slowpath_lock held
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
- mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
+ mpol_rebind_task(tsk, newmems);
}
/*
@@ -970,6 +973,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
+ unsigned long flags;
NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
if (!newmems)
@@ -978,9 +982,9 @@ static void cpuset_change_nodemask(struct task_struct *p,
cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, newmems);
- task_lock(p);
+ mems_slowpath_lock_irqsave(p, flags);
cpuset_change_task_nodemask(p, newmems);
- task_unlock(p);
+ mems_slowpath_unlock_irqrestore(p, flags);
NODEMASK_FREE(newmems);
@@ -1375,6 +1379,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
struct cpuset *cs)
{
+ unsigned long flags;
int err;
/*
* can_attach beforehand should guarantee that this doesn't fail.
@@ -1383,9 +1388,10 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- task_lock(tsk);
+ mems_slowpath_lock_irqsave(tsk, flags);
cpuset_change_task_nodemask(tsk, to);
- task_unlock(tsk);
+ mems_slowpath_unlock_irqrestore(tsk, flags);
+
cpuset_update_task_spread_flag(cs, tsk);
}
@@ -2233,7 +2239,15 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
*/
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
- return nodes_intersects(*nodemask, current->mems_allowed);
+ unsigned long flags, seq;
+ int retval;
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ retval = nodes_intersects(*nodemask, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ return retval;
}
/*
@@ -2314,11 +2328,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
+ unsigned long flags, seq;
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
- if (node_isset(node, current->mems_allowed))
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ allowed = node_isset(node, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ if (allowed)
return 1;
/*
* Allow tasks that have access to memory reserves because they have
@@ -2369,9 +2390,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
*/
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
+ int allowed;
+ unsigned long flags, seq;
+
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- if (node_isset(node, current->mems_allowed))
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ allowed = node_isset(node, current->mems_allowed);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
+
+ if (allowed)
return 1;
/*
* Allow tasks that have access to memory reserves because they have
@@ -2438,11 +2468,16 @@ void cpuset_unlock(void)
int cpuset_mem_spread_node(void)
{
int node;
-
- node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
- if (node == MAX_NUMNODES)
- node = first_node(current->mems_allowed);
- current->cpuset_mem_spread_rotor = node;
+ unsigned long flags, seq;
+
+ do {
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ node = next_node(current->cpuset_mem_spread_rotor,
+ current->mems_allowed);
+ if (node == MAX_NUMNODES)
+ node = first_node(current->mems_allowed);
+ current->cpuset_mem_spread_rotor = node;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
return node;
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
@@ -2458,10 +2493,26 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
* to the other.
**/
-int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
- const struct task_struct *tsk2)
+int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+ struct task_struct *tsk2)
{
- return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+ unsigned long flags1, flags2;
+ int retval;
+ struct task_struct *tsk;
+
+ if (tsk1 > tsk2) {
+ tsk = tsk1;
+ tsk1 = tsk2;
+ tsk2 = tsk;
+ }
+
+ mems_slowpath_lock_irqsave(tsk1, flags1);
+ mems_slowpath_lock_irqsave(tsk2, flags2);
+ retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+ mems_slowpath_unlock_irqrestore(tsk2, flags2);
+ mems_slowpath_unlock_irqrestore(tsk1, flags1);
+
+ return retval;
}
/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 7b012a0..cbf045d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -16,6 +16,7 @@
#include <linux/key.h>
#include <linux/security.h>
#include <linux/cpu.h>
+#include <linux/cpuset.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
@@ -649,6 +650,7 @@ static void exit_mm(struct task_struct * tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
+ unsigned long flags;
mm_release(tsk, mm);
if (!mm)
@@ -694,8 +696,10 @@ static void exit_mm(struct task_struct * tsk)
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
#ifdef CONFIG_NUMA
+ mems_slowpath_lock_irqsave(tsk, flags);
mpol_put(tsk->mempolicy);
tsk->mempolicy = NULL;
+ mems_slowpath_unlock_irqrestore(tsk, flags);
#endif
task_unlock(tsk);
mm_update_next_owner(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index fe73f8d..591346a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
+#include <linux/cpuset.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
@@ -1075,6 +1076,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->io_context = NULL;
p->audit_context = NULL;
cgroup_fork(p);
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+ seqlock_init(&p->mems_seqlock);
+#endif
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..b40cc52 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -465,6 +465,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
+ nodemask_t tmp_mask;
+ unsigned long seq, irqflag;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone;
@@ -483,6 +485,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
return NULL;
+ if (mpol == current->mempolicy && nodemask) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflag);
+ tmp_mask = *nodemask;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflag));
+ nodemask = &tmp_mask;
+ }
+
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
@@ -1835,10 +1846,15 @@ __setup("default_hugepagesz=", hugetlb_default_setup);
static unsigned int cpuset_mems_nr(unsigned int *array)
{
int node;
- unsigned int nr = 0;
+ unsigned int nr;
+ unsigned long flags, seq;
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ do {
+ nr = 0;
+ seq = mems_fastpath_lock_irqsave(current, flags);
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+ } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
return nr;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dd3f5c5..49abf11 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -187,8 +187,10 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* parameter with respect to the policy mode and flags. But, we need to
* handle an empty nodemask with MPOL_PREFERRED here.
*
- * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * Must be called using
+ * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
+ * to protect task's mems_allowed and mempolicy. May also be called holding
+ * the mmap_semaphore for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -344,9 +346,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
*
- * Called with task's alloc_lock held.
+ * Must be called under
+ * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
+ * to protect it.
*/
-
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
mpol_rebind_policy(tsk->mempolicy, new);
@@ -644,6 +647,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
NODEMASK_SCRATCH(scratch);
+ unsigned long irqflags;
int ret;
if (!scratch)
@@ -662,10 +666,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
*/
if (mm)
down_write(&mm->mmap_sem);
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (mm)
up_write(&mm->mmap_sem);
mpol_put(new);
@@ -677,7 +681,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (mm)
up_write(&mm->mmap_sem);
@@ -691,7 +695,9 @@ out:
/*
* Return nodemask for policy for get_mempolicy() query
*
- * Called with task's alloc_lock held
+ * Must be called under mems_slowpath_lock_irqsave()/
+ * mems_slowpath_unlock_irqrestore()
+ * to protect it.
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
@@ -736,6 +742,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
+ unsigned long irqflags;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -745,9 +752,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
- task_lock(current);
+
+ mems_slowpath_lock_irqsave(current, irqflags);
*nmask = cpuset_current_mems_allowed;
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
return 0;
}
@@ -803,13 +811,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
err = 0;
if (nmask) {
+ mems_slowpath_lock_irqsave(current, irqflags);
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
- task_lock(current);
get_policy_nodemask(pol, nmask);
- task_unlock(current);
}
+ mems_slowpath_unlock_irqrestore(current, irqflags);
}
out:
@@ -1008,6 +1016,7 @@ static long do_mbind(unsigned long start, unsigned long len,
struct mempolicy *new;
unsigned long end;
int err;
+ unsigned long irqflags;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)(MPOL_MF_STRICT |
@@ -1055,9 +1064,9 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
down_write(&mm->mmap_sem);
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
if (err)
up_write(&mm->mmap_sem);
} else
@@ -1408,8 +1417,10 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
} else if (vma->vm_policy)
pol = vma->vm_policy;
}
+
if (!pol)
pol = &default_policy;
+
return pol;
}
@@ -1475,7 +1486,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
* next slab entry.
* @policy must be protected by freeing by the caller. If @policy is
* the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
+ * task can free its policy. The system default policy requires no
* such protection.
*/
unsigned slab_node(struct mempolicy *policy)
@@ -1574,16 +1585,33 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
nodemask_t **nodemask)
{
struct zonelist *zl;
+ struct mempolicy policy;
+ struct mempolicy *pol;
+ unsigned long seq, irqflag;
*mpol = get_vma_policy(current, vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
- if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
- zl = node_zonelist(interleave_nid(*mpol, vma, addr,
+ pol = *mpol;
+ if (pol == current->mempolicy) {
+ /*
+ * get_vma_policy() doesn't return NULL, so we needn't worry
+ * whether pol is NULL or not.
+ */
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflag);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflag));
+ pol = &policy;
+ }
+
+ if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ zl = node_zonelist(interleave_nid(pol, vma, addr,
huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
- zl = policy_zonelist(gfp_flags, *mpol);
- if ((*mpol)->mode == MPOL_BIND)
+ zl = policy_zonelist(gfp_flags, pol);
+ if (pol->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
return zl;
@@ -1609,11 +1637,14 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
int nid;
+ unsigned long irqflags;
if (!(mask && current->mempolicy))
return false;
+ mems_slowpath_lock_irqsave(current, irqflags);
mempolicy = current->mempolicy;
+
switch (mempolicy->mode) {
case MPOL_PREFERRED:
if (mempolicy->flags & MPOL_F_LOCAL)
@@ -1633,6 +1664,8 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
BUG();
}
+ mems_slowpath_unlock_irqrestore(current, irqflags);
+
return true;
}
#endif
@@ -1722,7 +1755,22 @@ struct page *
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct mempolicy policy;
struct zonelist *zl;
+ struct page *page;
+ unsigned long seq, iflags;
+
+ if (pol == current->mempolicy) {
+ /*
+ * get_vma_policy() doesn't return NULL, so we needn't worry
+ * whether pol is NULL or not.
+ */
+ do {
+ seq = mems_fastpath_lock_irqsave(current, iflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
+ pol = &policy;
+ }
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -1736,15 +1784,16 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
- zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl,
+ policy_nodemask(gfp, pol));
__mpol_put(pol);
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ return page;
}
/**
@@ -1761,26 +1810,37 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
- *
- * Don't call cpuset_update_task_memory_state() unless
- * 1) it's ok to take cpuset_sem (can WAIT), and
- * 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct mempolicy policy;
+ struct page *page;
+ unsigned long seq, irqflags;
+
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
-
+ else {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, irqflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current,
+ seq, irqflags));
+ pol = &policy;
+ }
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol),
+ policy_nodemask(gfp, pol));
+
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
@@ -2026,6 +2086,7 @@ restart:
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
+ unsigned long irqflags;
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
@@ -2043,9 +2104,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
if (IS_ERR(new))
goto put_free; /* no valid nodemask intersection */
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
mpol_put(mpol); /* drop our ref on sb mpol */
if (ret)
goto put_free;
@@ -2200,6 +2261,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
+ unsigned long irqflags;
int err = 1;
if (nodelist) {
@@ -2291,9 +2353,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
int ret;
NODEMASK_SCRATCH(scratch);
if (scratch) {
- task_lock(current);
+ mems_slowpath_lock_irqsave(current, irqflags);
ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
+ mems_slowpath_unlock_irqrestore(current, irqflags);
} else
ret = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -2487,8 +2549,10 @@ int show_numa_map(struct seq_file *m, void *v)
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct mempolicy *pol;
+ struct mempolicy policy;
int n;
char buffer[50];
+ unsigned long iflags, seq;
if (!mm)
return 0;
@@ -2498,6 +2562,18 @@ int show_numa_map(struct seq_file *m, void *v)
return 0;
pol = get_vma_policy(priv->task, vma, vma->vm_start);
+ if (pol == current->mempolicy) {
+ /*
+ * get_vma_policy() doesn't return NULL, so we needn't worry
+ * whether pol is NULL or not.
+ */
+ do {
+ seq = mems_fastpath_lock_irqsave(current, iflags);
+ policy = *pol;
+ } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
+ pol = &policy;
+ }
+
mpol_to_str(buffer, sizeof(buffer), pol, 0);
mpol_cond_put(pol);
diff --git a/mm/slab.c b/mm/slab.c
index 09f1572..b8f5acb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3282,14 +3282,24 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
{
int nid_alloc, nid_here;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
if (in_interrupt() || (flags & __GFP_THISNODE))
return NULL;
+
nid_alloc = nid_here = numa_node_id();
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_mem_spread_node();
- else if (current->mempolicy)
- nid_alloc = slab_node(current->mempolicy);
+ else if (current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+
+ nid_alloc = slab_node(&mpol);
+ }
+
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3312,11 +3322,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
if (flags & __GFP_THISNODE)
return NULL;
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ if (current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+ zonelist = node_zonelist(slab_node(&mpol), flags);
+ } else
+ zonelist = node_zonelist(slab_node(NULL), flags);
+
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry:
diff --git a/mm/slub.c b/mm/slub.c
index b364844..436c521 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1345,6 +1345,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
struct page *page;
+ unsigned long lflags, seq;
+ struct mempolicy mpol;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1368,7 +1370,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ if (current->mempolicy) {
+ do {
+ seq = mems_fastpath_lock_irqsave(current, lflags);
+ mpol = *(current->mempolicy);
+ } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
+ zonelist = node_zonelist(slab_node(&mpol), flags);
+ } else
+ zonelist = node_zonelist(slab_node(NULL), flags);
+
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
--
1.6.5.2
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily)
2010-03-25 13:33 ` [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily) Miao Xie
@ 2010-03-28 5:30 ` Bob Liu
2010-03-31 19:42 ` Andrew Morton
1 sibling, 0 replies; 14+ messages in thread
From: Bob Liu @ 2010-03-28 5:30 UTC (permalink / raw)
To: miaox
Cc: Nick Piggin, David Rientjes, Lee Schermerhorn, Paul Menage,
Linux-Kernel, Linux-MM, Andrew Morton
On Thu, Mar 25, 2010 at 9:33 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> on 2010-3-11 19:03, Nick Piggin wrote:
>>> Ok, I try to make a new patch by using seqlock.
>>
>> Well... I do think seqlocks would be a bit simpler because they don't
>> require this checking and synchronizing of this patch.
> Hi, Nick Piggin
>
> I have made a new patch which uses seqlock to protect mems_allowed and mempolicy.
> please review it.
>
> Subject: [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2)
>
> Before applying this patch, cpuset updates task->mems_allowed by setting all
> new bits in the nodemask first, and clearing all old unallowed bits later.
> But in the way, the allocator can see an empty nodemask, though it is infrequent.
>
> The problem is following:
> The size of nodemask_t is greater than the size of long integer, so loading
> and storing of nodemask_t are not atomic operations. If task->mems_allowed
> don't intersect with new_mask, such as the first word of the mask is empty
> and only the first word of new_mask is not empty. When the allocator
> loads a word of the mask before
>
> current->mems_allowed |= new_mask;
>
> and then loads another word of the mask after
>
> current->mems_allowed = new_mask;
>
> the allocator gets an empty nodemask.
>
> Besides that, if the size of nodemask_t is less than the size of long integer,
> there is another problem. when the kernel allocater invokes the following function,
>
> struct zoneref *next_zones_zonelist(struct zoneref *z,
> enum zone_type highest_zoneidx,
> nodemask_t *nodes,
> struct zone **zone)
> {
> /*
> * Find the next suitable zone to use for the allocation.
> * Only filter based on nodemask if it's set
> */
> if (likely(nodes == NULL))
> ......
> else
> while (zonelist_zone_idx(z) > highest_zoneidx ||
> (z->zone && !zref_in_nodemask(z, nodes)))
> z++;
>
> *zone = zonelist_zone(z);
> return z;
> }
>
> if we change nodemask between two calls of zref_in_nodemask(), such as
> Task1 Task2
> zref_in_nodemask(z = node0's z, nodes = 1-2)
> zref_in_nodemask return 0
> nodes = 0
> zref_in_nodemask(z = node1's z, nodes = 0)
> zref_in_nodemask return 0
> z will overflow.
>
> when the kernel allocater accesses task->mempolicy, there is the same problem.
>
> The following method is used to fix these two problem.
> A seqlock is used to protect task's mempolicy and mems_allowed for configs where
> MAX_NUMNODES > BITS_PER_LONG, and when the kernel allocater accesses nodemask,
> it locks the seqlock and gets the copy of nodemask, then it passes the copy of
> nodemask to the memory allocating function.
>
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
> include/linux/cpuset.h | 79 +++++++++++++++++++++++-
> include/linux/init_task.h | 8 +++
> include/linux/sched.h | 17 ++++-
> kernel/cpuset.c | 97 +++++++++++++++++++++++-------
> kernel/exit.c | 4 +
> kernel/fork.c | 4 +
> mm/hugetlb.c | 22 ++++++-
> mm/mempolicy.c | 144 ++++++++++++++++++++++++++++++++++-----------
> mm/slab.c | 26 +++++++-
> mm/slub.c | 12 ++++-
> 10 files changed, 341 insertions(+), 72 deletions(-)
>
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a5740fc..e307f89 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -53,8 +53,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
> return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
> }
>
> -extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
> - const struct task_struct *tsk2);
> +extern int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
> + struct task_struct *tsk2);
>
> #define cpuset_memory_pressure_bump() \
> do { \
> @@ -90,9 +90,68 @@ extern void rebuild_sched_domains(void);
>
> extern void cpuset_print_task_mems_allowed(struct task_struct *p);
>
> +# if MAX_NUMNODES > BITS_PER_LONG
> +/*
> + * Be used to protect task->mempolicy and mems_allowed when reading them for
> + * page allocation.
> + *
> + * we don't care that the kernel page allocator allocate a page on a node in
> + * the old mems_allowed, which isn't a big deal, especially since it was
> + * previously allowed.
> + *
> + * We just worry whether the kernel page allocator gets an empty mems_allowed
> + * or not. But
> + * if MAX_NUMNODES <= BITS_PER_LONG, loading/storing task->mems_allowed are
> + * atomic operations. So we needn't do anything to protect the loading of
> + * task->mems_allowed in fastpaths.
> + *
> + * if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed are
> + * not atomic operations. So we use a seqlock to protect the loading of
> + * task->mems_allowed in fastpaths.
> + */
> +#define mems_fastpath_lock_irqsave(p, flags) \
> + ({ \
> + read_seqbegin_irqsave(&p->mems_seqlock, flags); \
> + })
> +
> +#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
> + ({ \
> + read_seqretry_irqrestore(&p->mems_seqlock, seq, flags); \
> + })
> +
> +#define mems_slowpath_lock_irqsave(p, flags) \
> + do { \
> + write_seqlock_irqsave(&p->mems_seqlock, flags); \
> + } while (0)
> +
> +#define mems_slowpath_unlock_irqrestore(p, flags) \
> + do { \
> + write_sequnlock_irqrestore(&p->mems_seqlock, flags); \
> + } while (0)
> +# else
> +#define mems_fastpath_lock_irqsave(p, flags) ({ (void)(flags); 0; })
> +
> +#define mems_fastpath_unlock_irqrestore(p, flags) ({ (void)(flags); 0; })
> +
> +#define mems_slowpath_lock_irqsave(p, flags) \
> + do { \
> + task_lock(p); \
> + (void)(flags); \
> + } while (0)
> +
> +#define mems_slowpath_unlock_irqrestore(p, flags) \
> + do { \
> + task_unlock(p); \
> + (void)(flags); \
> + } while (0)
> +# endif
> +
> static inline void set_mems_allowed(nodemask_t nodemask)
> {
> + unsigned long flags;
> + mems_slowpath_lock_irqsave(current, flags);
> current->mems_allowed = nodemask;
> + mems_slowpath_unlock_irqrestore(current, flags);
> }
>
> #else /* !CONFIG_CPUSETS */
> @@ -144,8 +203,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
> return 1;
> }
>
> -static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
> - const struct task_struct *tsk2)
> +static inline int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
> + struct task_struct *tsk2)
> {
> return 1;
> }
> @@ -193,6 +252,18 @@ static inline void set_mems_allowed(nodemask_t nodemask)
> {
> }
>
> +#define mems_fastpath_lock_irqsave(p, flags) \
> + ({ (void)(flags); 0; })
> +
> +#define mems_fastpath_unlock_irqrestore(p, seq, flags) \
> + ({ (void)(flags); 0; })
> +
> +#define mems_slowpath_lock_irqsave(p, flags) \
> + do { (void)(flags); } while (0)
> +
> +#define mems_slowpath_unlock_irqrestore(p, flags) \
> + do { (void)(flags); } while (0)
> +
> #endif /* !CONFIG_CPUSETS */
>
> #endif /* _LINUX_CPUSET_H */
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 1ed6797..0394e20 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -102,6 +102,13 @@ extern struct cred init_cred;
> # define INIT_PERF_EVENTS(tsk)
> #endif
>
> +#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
> +# define INIT_MEM_SEQLOCK(tsk) \
> + .mems_seqlock = __SEQLOCK_UNLOCKED(tsk.mems_seqlock),
> +#else
> +# define INIT_MEM_SEQLOCK(tsk)
> +#endif
> +
> /*
> * INIT_TASK is used to set up the first task table, touch at
> * your own risk!. Base=0, limit=0x1fffff (=2MB)
> @@ -171,6 +178,7 @@ extern struct cred init_cred;
> INIT_FTRACE_GRAPH \
> INIT_TRACE_RECURSION \
> INIT_TASK_RCU_PREEMPT(tsk) \
> + INIT_MEM_SEQLOCK(tsk) \
> }
>
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 84b8c22..1cf5fd3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1356,8 +1356,9 @@ struct task_struct {
> /* Thread group tracking */
> u32 parent_exec_id;
> u32 self_exec_id;
> -/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
> - * mempolicy */
> +/* Protection of (de-)allocation: mm, files, fs, tty, keyrings.
> + * If MAX_NUMNODES <= BITS_PER_LONG, it also protects mems_allowed and mempolicy;
> + * otherwise they are protected by a separate seqlock, mems_seqlock. */
> spinlock_t alloc_lock;
>
> #ifdef CONFIG_GENERIC_HARDIRQS
> @@ -1425,7 +1426,13 @@ struct task_struct {
> cputime_t acct_timexpd; /* stime + utime since last update */
> #endif
> #ifdef CONFIG_CPUSETS
> - nodemask_t mems_allowed; /* Protected by alloc_lock */
> +# if MAX_NUMNODES > BITS_PER_LONG
> + /* Protection of mems_allowed, and mempolicy */
> + seqlock_t mems_seqlock;
> +# endif
> + /* Protected by alloc_lock if MAX_NUMNODES <= BITS_PER_LONG,
> + * otherwise by mems_seqlock */
> + nodemask_t mems_allowed;
> int cpuset_mem_spread_rotor;
> #endif
> #ifdef CONFIG_CGROUPS
> @@ -1448,7 +1455,9 @@ struct task_struct {
> struct list_head perf_event_list;
> #endif
> #ifdef CONFIG_NUMA
> - struct mempolicy *mempolicy; /* Protected by alloc_lock */
> + /* Protected by alloc_lock if MAX_NUMNODES <= BITS_PER_LONG,
> + * otherwise by mems_seqlock */
> + struct mempolicy *mempolicy;
> short il_next;
> #endif
> atomic_t fs_excl; /* holding fs exclusive resources */
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index d109467..52e6f51 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -198,12 +198,13 @@ static struct cpuset top_cpuset = {
> * from one of the callbacks into the cpuset code from within
> * __alloc_pages().
> *
> - * If a task is only holding callback_mutex, then it has read-only
> - * access to cpusets.
> + * If a task is only holding callback_mutex or cgroup_mutex, then it has
> + * read-only access to cpusets.
> *
> * Now, the task_struct fields mems_allowed and mempolicy may be changed
> - * by other task, we use alloc_lock in the task_struct fields to protect
> - * them.
> + * by another task, so we use alloc_lock (if MAX_NUMNODES <= BITS_PER_LONG) or
> + * mems_seqlock (if MAX_NUMNODES > BITS_PER_LONG) in the task_struct
> + * to protect them.
> *
> * The cpuset_common_file_read() handlers only hold callback_mutex across
> * small pieces of code, such as when reading out possibly multi-word
> @@ -920,6 +921,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
> * call to guarantee_online_mems(), as we know no one is changing
> * our task's cpuset.
> *
> + * As the comment above says, no one can change the current task's mems_allowed
> + * except the task itself, so we needn't hold a lock to protect mems_allowed
> + * during this call.
> + *
> * While the mm_struct we are migrating is typically from some
> * other task, the task_struct mems_allowed that we are hacking
> * is for our current task, which must allocate new pages for that
> @@ -947,15 +952,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
> * we structure updates as setting all new allowed nodes, then clearing newly
> * disallowed ones.
> *
> - * Called with task's alloc_lock held
> + * Called with mems_slowpath_lock held
> */
> static void cpuset_change_task_nodemask(struct task_struct *tsk,
> nodemask_t *newmems)
> {
> - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
> - mpol_rebind_task(tsk, &tsk->mems_allowed);
> - mpol_rebind_task(tsk, newmems);
> tsk->mems_allowed = *newmems;
> + mpol_rebind_task(tsk, newmems);
> }
>
> /*
> @@ -970,6 +973,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
> struct cpuset *cs;
> int migrate;
> const nodemask_t *oldmem = scan->data;
> + unsigned long flags;
> NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
>
> if (!newmems)
> @@ -978,9 +982,9 @@ static void cpuset_change_nodemask(struct task_struct *p,
> cs = cgroup_cs(scan->cg);
> guarantee_online_mems(cs, newmems);
>
> - task_lock(p);
> + mems_slowpath_lock_irqsave(p, flags);
> cpuset_change_task_nodemask(p, newmems);
> - task_unlock(p);
> + mems_slowpath_unlock_irqrestore(p, flags);
>
> NODEMASK_FREE(newmems);
>
> @@ -1375,6 +1379,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
> struct cpuset *cs)
> {
> + unsigned long flags;
> int err;
> /*
> * can_attach beforehand should guarantee that this doesn't fail.
> @@ -1383,9 +1388,10 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
> err = set_cpus_allowed_ptr(tsk, cpus_attach);
> WARN_ON_ONCE(err);
>
> - task_lock(tsk);
> + mems_slowpath_lock_irqsave(tsk, flags);
> cpuset_change_task_nodemask(tsk, to);
> - task_unlock(tsk);
> + mems_slowpath_unlock_irqrestore(tsk, flags);
> +
> cpuset_update_task_spread_flag(cs, tsk);
>
> }
> @@ -2233,7 +2239,15 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
> */
> int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
> {
> - return nodes_intersects(*nodemask, current->mems_allowed);
> + unsigned long flags, seq;
> + int retval;
> +
> + do {
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + retval = nodes_intersects(*nodemask, current->mems_allowed);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
> +
> + return retval;
> }
>
> /*
> @@ -2314,11 +2328,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
> {
> const struct cpuset *cs; /* current cpuset ancestors */
> int allowed; /* is allocation in zone z allowed? */
> + unsigned long flags, seq;
>
> if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
> return 1;
> might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
> - if (node_isset(node, current->mems_allowed))
> +
> + do {
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + allowed = node_isset(node, current->mems_allowed);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
> +
> + if (allowed)
> return 1;
> /*
> * Allow tasks that have access to memory reserves because they have
> @@ -2369,9 +2390,18 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
> */
> int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
> {
> + int allowed;
> + unsigned long flags, seq;
> +
> if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
> return 1;
> - if (node_isset(node, current->mems_allowed))
> +
> + do {
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + allowed = node_isset(node, current->mems_allowed);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
> +
> + if (allowed)
> return 1;
> /*
> * Allow tasks that have access to memory reserves because they have
> @@ -2438,11 +2468,16 @@ void cpuset_unlock(void)
> int cpuset_mem_spread_node(void)
> {
> int node;
> -
> - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
> - if (node == MAX_NUMNODES)
> - node = first_node(current->mems_allowed);
> - current->cpuset_mem_spread_rotor = node;
> + unsigned long flags, seq;
> +
> + do {
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + node = next_node(current->cpuset_mem_spread_rotor,
> + current->mems_allowed);
> + if (node == MAX_NUMNODES)
> + node = first_node(current->mems_allowed);
> + current->cpuset_mem_spread_rotor = node;
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
> return node;
> }
> EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
> @@ -2458,10 +2493,26 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
> * to the other.
> **/
>
> -int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
> - const struct task_struct *tsk2)
> +int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
> + struct task_struct *tsk2)
> {
> - return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
> + unsigned long flags1, flags2;
> + int retval;
> + struct task_struct *tsk;
> +
> + if (tsk1 > tsk2) {
> + tsk = tsk1;
> + tsk1 = tsk2;
> + tsk2 = tsk;
> + }
> +
> + mems_slowpath_lock_irqsave(tsk1, flags1);
> + mems_slowpath_lock_irqsave(tsk2, flags2);
> + retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
> + mems_slowpath_unlock_irqrestore(tsk2, flags2);
> + mems_slowpath_unlock_irqrestore(tsk1, flags1);
> +
> + return retval;
> }
>
> /**
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 7b012a0..cbf045d 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -16,6 +16,7 @@
> #include <linux/key.h>
> #include <linux/security.h>
> #include <linux/cpu.h>
> +#include <linux/cpuset.h>
> #include <linux/acct.h>
> #include <linux/tsacct_kern.h>
> #include <linux/file.h>
> @@ -649,6 +650,7 @@ static void exit_mm(struct task_struct * tsk)
> {
> struct mm_struct *mm = tsk->mm;
> struct core_state *core_state;
> + unsigned long flags;
>
> mm_release(tsk, mm);
> if (!mm)
> @@ -694,8 +696,10 @@ static void exit_mm(struct task_struct * tsk)
> /* We don't want this task to be frozen prematurely */
> clear_freeze_flag(tsk);
> #ifdef CONFIG_NUMA
> + mems_slowpath_lock_irqsave(tsk, flags);
> mpol_put(tsk->mempolicy);
> tsk->mempolicy = NULL;
> + mems_slowpath_unlock_irqrestore(tsk, flags);
> #endif
> task_unlock(tsk);
> mm_update_next_owner(mm);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index fe73f8d..591346a 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -32,6 +32,7 @@
> #include <linux/capability.h>
> #include <linux/cpu.h>
> #include <linux/cgroup.h>
> +#include <linux/cpuset.h>
> #include <linux/security.h>
> #include <linux/hugetlb.h>
> #include <linux/swap.h>
> @@ -1075,6 +1076,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> p->io_context = NULL;
> p->audit_context = NULL;
> cgroup_fork(p);
> +#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
> + seqlock_init(&p->mems_seqlock);
> +#endif
> #ifdef CONFIG_NUMA
> p->mempolicy = mpol_dup(p->mempolicy);
> if (IS_ERR(p->mempolicy)) {
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3a5aeb3..b40cc52 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -465,6 +465,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> struct page *page = NULL;
> struct mempolicy *mpol;
> nodemask_t *nodemask;
> + nodemask_t tmp_mask;
> + unsigned long seq, irqflag;
> struct zonelist *zonelist = huge_zonelist(vma, address,
> htlb_alloc_mask, &mpol, &nodemask);
> struct zone *zone;
> @@ -483,6 +485,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
> if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
> return NULL;
>
> + if (mpol == current->mempolicy && nodemask) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, irqflag);
> + tmp_mask = *nodemask;
> + } while (mems_fastpath_unlock_irqrestore(current,
> + seq, irqflag));
> + nodemask = &tmp_mask;
> + }
> +
Maybe you can factor these into a macro or inline function; I saw
several similar places :-)
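Something like this, perhaps; just a sketch, and the helper name is invented
(it is not part of the posted patch):

  /* Snapshot *src of task p under the fastpath seqlock. */
  static inline void mems_fastpath_get_nodemask(struct task_struct *p,
                                                nodemask_t *dst,
                                                const nodemask_t *src)
  {
          unsigned long seq, flags;

          do {
                  seq = mems_fastpath_lock_irqsave(p, flags);
                  *dst = *src;
          } while (mems_fastpath_unlock_irqrestore(p, seq, flags));
  }

The nodemask copy in hugetlb.c could then use it directly; the struct
mempolicy copies in mempolicy.c, slab.c and slub.c would want a similar
helper of their own.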
> for_each_zone_zonelist_nodemask(zone, z, zonelist,
> MAX_NR_ZONES - 1, nodemask) {
> nid = zone_to_nid(zone);
> @@ -1835,10 +1846,15 @@ __setup("default_hugepagesz=", hugetlb_default_setup);
> static unsigned int cpuset_mems_nr(unsigned int *array)
> {
> int node;
> - unsigned int nr = 0;
> + unsigned int nr;
> + unsigned long flags, seq;
>
> - for_each_node_mask(node, cpuset_current_mems_allowed)
> - nr += array[node];
> + do {
> + nr = 0;
> + seq = mems_fastpath_lock_irqsave(current, flags);
> + for_each_node_mask(node, cpuset_current_mems_allowed)
> + nr += array[node];
> + } while (mems_fastpath_unlock_irqrestore(current, seq, flags));
>
> return nr;
> }
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index dd3f5c5..49abf11 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -187,8 +187,10 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
> * parameter with respect to the policy mode and flags. But, we need to
> * handle an empty nodemask with MPOL_PREFERRED here.
> *
> - * Must be called holding task's alloc_lock to protect task's mems_allowed
> - * and mempolicy. May also be called holding the mmap_semaphore for write.
> + * Must be called using
> + * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
> + * to protect task's mems_allowed and mempolicy. May also be called holding
> + * the mmap_semaphore for write.
> */
> static int mpol_set_nodemask(struct mempolicy *pol,
> const nodemask_t *nodes, struct nodemask_scratch *nsc)
> @@ -344,9 +346,10 @@ static void mpol_rebind_policy(struct mempolicy *pol,
> * Wrapper for mpol_rebind_policy() that just requires task
> * pointer, and updates task mempolicy.
> *
> - * Called with task's alloc_lock held.
> + * Using
> + * mems_slowpath_lock_irqsave()/mems_slowpath_unlock_irqrestore()
> + * to protect it.
> */
> -
> void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
> {
> mpol_rebind_policy(tsk->mempolicy, new);
> @@ -644,6 +647,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> struct mempolicy *new, *old;
> struct mm_struct *mm = current->mm;
> NODEMASK_SCRATCH(scratch);
> + unsigned long irqflags;
> int ret;
>
> if (!scratch)
> @@ -662,10 +666,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> */
> if (mm)
> down_write(&mm->mmap_sem);
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, nodes, scratch);
> if (ret) {
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (mm)
> up_write(&mm->mmap_sem);
> mpol_put(new);
> @@ -677,7 +681,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
> if (new && new->mode == MPOL_INTERLEAVE &&
> nodes_weight(new->v.nodes))
> current->il_next = first_node(new->v.nodes);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (mm)
> up_write(&mm->mmap_sem);
>
> @@ -691,7 +695,9 @@ out:
> /*
> * Return nodemask for policy for get_mempolicy() query
> *
> - * Called with task's alloc_lock held
> + * Must be called using mems_slowpath_lock_irqsave()/
> + * mems_slowpath_unlock_irqrestore() to
> + * protect it.
> */
> static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
> {
> @@ -736,6 +742,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> struct mempolicy *pol = current->mempolicy;
> + unsigned long irqflags;
>
> if (flags &
> ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
> @@ -745,9 +752,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
> if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
> return -EINVAL;
> *policy = 0; /* just so it's initialized */
> - task_lock(current);
> +
> + mems_slowpath_lock_irqsave(current, irqflags);
> *nmask = cpuset_current_mems_allowed;
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> return 0;
> }
>
> @@ -803,13 +811,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>
> err = 0;
> if (nmask) {
> + mems_slowpath_lock_irqsave(current, irqflags);
> if (mpol_store_user_nodemask(pol)) {
> *nmask = pol->w.user_nodemask;
> } else {
> - task_lock(current);
> get_policy_nodemask(pol, nmask);
> - task_unlock(current);
> }
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> }
>
> out:
> @@ -1008,6 +1016,7 @@ static long do_mbind(unsigned long start, unsigned long len,
> struct mempolicy *new;
> unsigned long end;
> int err;
> + unsigned long irqflags;
> LIST_HEAD(pagelist);
>
> if (flags & ~(unsigned long)(MPOL_MF_STRICT |
> @@ -1055,9 +1064,9 @@ static long do_mbind(unsigned long start, unsigned long len,
> NODEMASK_SCRATCH(scratch);
> if (scratch) {
> down_write(&mm->mmap_sem);
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> err = mpol_set_nodemask(new, nmask, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> if (err)
> up_write(&mm->mmap_sem);
> } else
> @@ -1408,8 +1417,10 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
> } else if (vma->vm_policy)
> pol = vma->vm_policy;
> }
> +
> if (!pol)
> pol = &default_policy;
> +
> return pol;
> }
>
> @@ -1475,7 +1486,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
> * next slab entry.
> * @policy must be protected by freeing by the caller. If @policy is
> * the current task's mempolicy, this protection is implicit, as only the
> - * task can change it's policy. The system default policy requires no
> + * task can free its policy. The system default policy requires no
> * such protection.
> */
> unsigned slab_node(struct mempolicy *policy)
> @@ -1574,16 +1585,33 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
> nodemask_t **nodemask)
> {
> struct zonelist *zl;
> + struct mempolicy policy;
> + struct mempolicy *pol;
> + unsigned long seq, irqflag;
>
> *mpol = get_vma_policy(current, vma, addr);
> *nodemask = NULL; /* assume !MPOL_BIND */
>
> - if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
> - zl = node_zonelist(interleave_nid(*mpol, vma, addr,
> + pol = *mpol;
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, irqflag);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current,
> + seq, irqflag));
> + pol = &policy;
> + }
> +
> + if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
> + zl = node_zonelist(interleave_nid(pol, vma, addr,
> huge_page_shift(hstate_vma(vma))), gfp_flags);
> } else {
> - zl = policy_zonelist(gfp_flags, *mpol);
> - if ((*mpol)->mode == MPOL_BIND)
> + zl = policy_zonelist(gfp_flags, pol);
> + if (pol->mode == MPOL_BIND)
> *nodemask = &(*mpol)->v.nodes;
> }
> return zl;
> @@ -1609,11 +1637,14 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
> {
> struct mempolicy *mempolicy;
> int nid;
> + unsigned long irqflags;
>
> if (!(mask && current->mempolicy))
> return false;
>
> + mems_slowpath_lock_irqsave(current, irqflags);
> mempolicy = current->mempolicy;
> +
> switch (mempolicy->mode) {
> case MPOL_PREFERRED:
> if (mempolicy->flags & MPOL_F_LOCAL)
> @@ -1633,6 +1664,8 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
> BUG();
> }
>
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> +
> return true;
> }
> #endif
> @@ -1722,7 +1755,22 @@ struct page *
> alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> {
> struct mempolicy *pol = get_vma_policy(current, vma, addr);
> + struct mempolicy policy;
> struct zonelist *zl;
> + struct page *page;
> + unsigned long seq, iflags;
> +
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, iflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
> + pol = &policy;
> + }
>
> if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
> unsigned nid;
> @@ -1736,15 +1784,16 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> /*
> * slow path: ref counted shared policy
> */
> - struct page *page = __alloc_pages_nodemask(gfp, 0,
> - zl, policy_nodemask(gfp, pol));
> + page = __alloc_pages_nodemask(gfp, 0, zl,
> + policy_nodemask(gfp, pol));
> __mpol_put(pol);
> return page;
> }
> /*
> * fast path: default or task policy
> */
> - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> + return page;
> }
>
> /**
> @@ -1761,26 +1810,37 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
> * Allocate a page from the kernel page pool. When not in
> * interrupt context and apply the current process NUMA policy.
> * Returns NULL when no page can be allocated.
> - *
> - * Don't call cpuset_update_task_memory_state() unless
> - * 1) it's ok to take cpuset_sem (can WAIT), and
> - * 2) allocating for current task (not interrupt).
> */
> struct page *alloc_pages_current(gfp_t gfp, unsigned order)
> {
> struct mempolicy *pol = current->mempolicy;
> + struct mempolicy policy;
> + struct page *page;
> + unsigned long seq, irqflags;
> +
>
> if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> pol = &default_policy;
> -
> + else {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, irqflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current,
> + seq, irqflags));
> + pol = &policy;
> + }
> /*
> * No reference counting needed for current->mempolicy
> * nor system default_policy
> */
> if (pol->mode == MPOL_INTERLEAVE)
> - return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> - return __alloc_pages_nodemask(gfp, order,
> - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
> + page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
> + else
> + page = __alloc_pages_nodemask(gfp, order,
> + policy_zonelist(gfp, pol),
> + policy_nodemask(gfp, pol));
> +
> + return page;
> }
> EXPORT_SYMBOL(alloc_pages_current);
>
> @@ -2026,6 +2086,7 @@ restart:
> */
> void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
> {
> + unsigned long irqflags;
> int ret;
>
> sp->root = RB_ROOT; /* empty tree == default mempolicy */
> @@ -2043,9 +2104,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
> if (IS_ERR(new))
> goto put_free; /* no valid nodemask intersection */
>
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> mpol_put(mpol); /* drop our ref on sb mpol */
> if (ret)
> goto put_free;
> @@ -2200,6 +2261,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
> nodemask_t nodes;
> char *nodelist = strchr(str, ':');
> char *flags = strchr(str, '=');
> + unsigned long irqflags;
> int err = 1;
>
> if (nodelist) {
> @@ -2291,9 +2353,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
> int ret;
> NODEMASK_SCRATCH(scratch);
> if (scratch) {
> - task_lock(current);
> + mems_slowpath_lock_irqsave(current, irqflags);
> ret = mpol_set_nodemask(new, &nodes, scratch);
> - task_unlock(current);
> + mems_slowpath_unlock_irqrestore(current, irqflags);
> } else
> ret = -ENOMEM;
> NODEMASK_SCRATCH_FREE(scratch);
> @@ -2487,8 +2549,10 @@ int show_numa_map(struct seq_file *m, void *v)
> struct file *file = vma->vm_file;
> struct mm_struct *mm = vma->vm_mm;
> struct mempolicy *pol;
> + struct mempolicy policy;
> int n;
> char buffer[50];
> + unsigned long iflags, seq;
>
> if (!mm)
> return 0;
> @@ -2498,6 +2562,18 @@ int show_numa_map(struct seq_file *m, void *v)
> return 0;
>
> pol = get_vma_policy(priv->task, vma, vma->vm_start);
> + if (pol == current->mempolicy) {
> + /*
> + * get_vma_policy() doesn't return NULL, so we needn't worry
> + * whether pol is NULL or not.
> + */
> + do {
> + seq = mems_fastpath_lock_irqsave(current, iflags);
> + policy = *pol;
> + } while (mems_fastpath_unlock_irqrestore(current, seq, iflags));
> + pol = &policy;
> + }
> +
> mpol_to_str(buffer, sizeof(buffer), pol, 0);
> mpol_cond_put(pol);
>
> diff --git a/mm/slab.c b/mm/slab.c
> index 09f1572..b8f5acb 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3282,14 +3282,24 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
> static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
> {
> int nid_alloc, nid_here;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> if (in_interrupt() || (flags & __GFP_THISNODE))
> return NULL;
> +
> nid_alloc = nid_here = numa_node_id();
> if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
> nid_alloc = cpuset_mem_spread_node();
> - else if (current->mempolicy)
> - nid_alloc = slab_node(current->mempolicy);
> + else if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> +
> + nid_alloc = slab_node(&mpol);
> + }
> +
> if (nid_alloc != nid_here)
> return ____cache_alloc_node(cachep, flags, nid_alloc);
> return NULL;
> @@ -3312,11 +3322,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
> enum zone_type high_zoneidx = gfp_zone(flags);
> void *obj = NULL;
> int nid;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> if (flags & __GFP_THISNODE)
> return NULL;
>
> - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> + if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> + zonelist = node_zonelist(slab_node(&mpol), flags);
> + } else
> + zonelist = node_zonelist(slab_node(NULL), flags);
> +
> local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
>
> retry:
> diff --git a/mm/slub.c b/mm/slub.c
> index b364844..436c521 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1345,6 +1345,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
> struct zone *zone;
> enum zone_type high_zoneidx = gfp_zone(flags);
> struct page *page;
> + unsigned long lflags, seq;
> + struct mempolicy mpol;
>
> /*
> * The defrag ratio allows a configuration of the tradeoffs between
> @@ -1368,7 +1370,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
> get_cycles() % 1024 > s->remote_node_defrag_ratio)
> return NULL;
>
> - zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> + if (current->mempolicy) {
> + do {
> + seq = mems_fastpath_lock_irqsave(current, lflags);
> + mpol = *(current->mempolicy);
> + } while (mems_fastpath_unlock_irqrestore(current, seq, lflags));
> + zonelist = node_zonelist(slab_node(&mpol), flags);
> + } else
> + zonelist = node_zonelist(slab_node(NULL), flags);
> +
> for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> struct kmem_cache_node *n;
>
> --
> 1.6.5.2
>
>
>
--
Regards,
-Bob Liu
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
2010-03-11 11:03 ` Nick Piggin
2010-03-25 10:23 ` Miao Xie
2010-03-25 13:33 ` [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily) Miao Xie
@ 2010-03-31 9:54 ` Miao Xie
2010-03-31 10:34 ` David Rientjes
2 siblings, 1 reply; 14+ messages in thread
From: Miao Xie @ 2010-03-31 9:54 UTC (permalink / raw)
To: Nick Piggin
Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM, Li Zefan, Andrew Morton
on 2010-3-11 19:03, Nick Piggin wrote:
>> Ok, I try to make a new patch by using seqlock.
>
> Well... I do think seqlocks would be a bit simpler because they don't
> require this checking and synchronizing of this patch.
>
> But you are right: on non-x86 architectures seqlocks would probably be
> more costly than your patch in the fastpaths. Unless you can avoid
> using the seqlock in fastpaths and just have callers handle the rare
> case of an empty nodemask.
>
> cpuset_node_allowed_*wall doesn't need anything because it is just
> interested in one bit in the mask.
>
> cpuset_mem_spread_node doesn't matter because it will loop around and
> try again if it doesn't find any nodes online.
>
> cpuset_mems_allowed seems totally broken anyway
>
> etc.
>
> This approach might take a little more work, but I think it might be the
> best way.
Hi, Nick Piggin
The following is a new patch based on your idea. Could you review it?
Thanks!
Miao
---
kernel/cpuset.c | 11 +++++++++--
mm/mempolicy.c | 20 +++++++++++++++++---
mm/mmzone.c | 15 +++++++++++----
3 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d109467..fbb1f1c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -952,8 +952,6 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
}
@@ -2442,6 +2440,15 @@ int cpuset_mem_spread_node(void)
node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
+
+ /*
+ * If node is still MAX_NUMNODES, the kernel allocator saw an empty
+ * nodemask. In that case, fall back to allocating memory on the
+ * current node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
current->cpuset_mem_spread_rotor = node;
return node;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8034abd..75e819e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1449,8 +1449,16 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
* the first node in the mask instead.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
- unlikely(!node_isset(nd, policy->v.nodes)))
+ unlikely(!node_isset(nd, policy->v.nodes))) {
nd = first_node(policy->v.nodes);
+ /*
+ * When rebinding task->mempolicy, the kernel allocator
+ * may see an empty nodemask, and first_node() returns
+ * MAX_NUMNODES. In that case, we use the current node.
+ */
+ if (unlikely(nd == MAX_NUMNODES))
+ nd = numa_node_id();
+ }
break;
case MPOL_INTERLEAVE: /* should not happen */
break;
@@ -1522,17 +1530,21 @@ unsigned slab_node(struct mempolicy *policy)
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
{
- unsigned nnodes = nodes_weight(pol->v.nodes);
+ nodemask_t tmp_nodes;
+ unsigned nnodes;
unsigned target;
int c;
int nid = -1;
+ tmp_nodes = pol->v.nodes;
+ nnodes = nodes_weight(tmp_nodes);
if (!nnodes)
return numa_node_id();
+
target = (unsigned int)off % nnodes;
c = 0;
do {
- nid = next_node(nid, pol->v.nodes);
+ nid = next_node(nid, tmp_nodes);
c++;
} while (c <= target);
return nid;
@@ -1631,7 +1643,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
+ task_lock(current);
*mask = mempolicy->v.nodes;
+ task_unlock(current);
break;
default:
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d17..43ac21b 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -58,6 +58,7 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
nodemask_t *nodes,
struct zone **zone)
{
+ nodemask_t tmp_nodes;
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
@@ -65,10 +66,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
if (likely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
- else
- while (zonelist_zone_idx(z) > highest_zoneidx ||
- (z->zone && !zref_in_nodemask(z, nodes)))
- z++;
+ else {
+ tmp_nodes = *nodes;
+ if (nodes_empty(tmp_nodes))
+ while (zonelist_zone_idx(z) > highest_zoneidx)
+ z++;
+ else
+ while (zonelist_zone_idx(z) > highest_zoneidx ||
+ (z->zone && !zref_in_nodemask(z, &tmp_nodes)))
+ z++;
+ }
*zone = zonelist_zone(z);
return z;
--
1.6.5.2
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
2010-03-31 9:54 ` [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily Miao Xie
@ 2010-03-31 10:34 ` David Rientjes
2010-04-01 2:16 ` Miao Xie
0 siblings, 1 reply; 14+ messages in thread
From: David Rientjes @ 2010-03-31 10:34 UTC (permalink / raw)
To: Miao Xie
Cc: Nick Piggin, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM, Li Zefan, Andrew Morton
On Wed, 31 Mar 2010, Miao Xie wrote:
> diff --git a/mm/mmzone.c b/mm/mmzone.c
> index f5b7d17..43ac21b 100644
> --- a/mm/mmzone.c
> +++ b/mm/mmzone.c
> @@ -58,6 +58,7 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
> nodemask_t *nodes,
> struct zone **zone)
> {
> + nodemask_t tmp_nodes;
> /*
> * Find the next suitable zone to use for the allocation.
> * Only filter based on nodemask if it's set
> @@ -65,10 +66,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
> if (likely(nodes == NULL))
> while (zonelist_zone_idx(z) > highest_zoneidx)
> z++;
> - else
> - while (zonelist_zone_idx(z) > highest_zoneidx ||
> - (z->zone && !zref_in_nodemask(z, nodes)))
> - z++;
> + else {
> + tmp_nodes = *nodes;
> + if (nodes_empty(tmp_nodes))
> + while (zonelist_zone_idx(z) > highest_zoneidx)
> + z++;
> + else
> + while (zonelist_zone_idx(z) > highest_zoneidx ||
> + (z->zone && !zref_in_nodemask(z, &tmp_nodes)))
> + z++;
> + }
>
> *zone = zonelist_zone(z);
> return z;
Unfortunately, you can't allocate a nodemask_t on the stack here because
this is used in the iteration for get_page_from_freelist() which can occur
very deep in the stack already and there's a probability of overflow.
Dynamically allocating a nodemask_t simply wouldn't scale here, either,
since it would allocate on every iteration of a zonelist.
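To put rough numbers on that concern (illustrative only, assuming a
1024-node configuration and the 8 KB x86_64 kernel stack of this era):

  MAX_NUMNODES       = 1024 nodes
  sizeof(nodemask_t) = 1024 bits / 8 = 128 bytes per on-stack copy

and next_zones_zonelist() runs under get_page_from_freelist(), which can
already be several kilobytes deep once direct reclaim and filesystem or
swap I/O are on the stack, so an extra 128-byte local in a frame at the
bottom of that chain is unwelcome, and a kmalloc() on every zonelist step
would be even worse on this hot path.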
* Re: [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily)
2010-03-25 13:33 ` [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily) Miao Xie
2010-03-28 5:30 ` Bob Liu
@ 2010-03-31 19:42 ` Andrew Morton
1 sibling, 0 replies; 14+ messages in thread
From: Andrew Morton @ 2010-03-31 19:42 UTC (permalink / raw)
To: miaox
Cc: Nick Piggin, David Rientjes, Lee Schermerhorn, Paul Menage,
Linux-Kernel, Linux-MM
On Thu, 25 Mar 2010 21:33:58 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:
> on 2010-3-11 19:03, Nick Piggin wrote:
> >> Ok, I try to make a new patch by using seqlock.
> >
> > Well... I do think seqlocks would be a bit simpler because they don't
> > require this checking and synchronizing of this patch.
> Hi, Nick Piggin
>
> I have made a new patch which uses seqlock to protect mems_allowed and mempolicy.
> Please review it.
That's an awfully big patch for a pretty small bug?
> Subject: [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2)
>
> Before applying this patch, cpuset updates task->mems_allowed by setting all
> new bits in the nodemask first, and clearing all old unallowed bits later.
> But along the way, the allocator can see an empty nodemask, though this is infrequent.
>
> The problem is the following:
> The size of nodemask_t is greater than the size of a long integer, so loading
> and storing a nodemask_t are not atomic operations. Suppose task->mems_allowed
> doesn't intersect with new_mask; for example, the first word of the mask is
> empty and only the first word of new_mask is not empty. When the allocator
> loads a word of the mask before
>
> current->mems_allowed |= new_mask;
>
> and then loads another word of the mask after
>
> current->mems_allowed = new_mask;
>
> the allocator gets an empty nodemask.
Probably we could fix this via careful ordering of the updates,
barriers and perhaps some avoidance action at the reader side.
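For illustration, one possible shape of that idea (the helper names are
invented and this is only an untested sketch, not a patch): the writer
keeps the union of the old and new nodes visible while it updates, and the
reader retries if it observes a torn, empty snapshot; a task's
mems_allowed is never legitimately empty, so the retry terminates.

  static void sketch_update_mems_allowed(struct task_struct *tsk,
                                         const nodemask_t *newmems)
  {
          /* Grow first, so old|new stays visible during the update. */
          nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
          smp_wmb();                      /* order the grow before the shrink */
          tsk->mems_allowed = *newmems;   /* then shrink to exactly *newmems */
  }

  static void sketch_read_mems_allowed(nodemask_t *copy)
  {
          do {
                  *copy = current->mems_allowed;  /* may tear word by word */
          } while (unlikely(nodes_empty(*copy))); /* raced with an update: retry */
  }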
> Besides that, even if the size of nodemask_t is no greater than the size of a long
> integer, there is another problem. When the kernel allocator invokes the following function,
>
> struct zoneref *next_zones_zonelist(struct zoneref *z,
> enum zone_type highest_zoneidx,
> nodemask_t *nodes,
> struct zone **zone)
> {
> /*
> * Find the next suitable zone to use for the allocation.
> * Only filter based on nodemask if it's set
> */
> if (likely(nodes == NULL))
> ......
> else
> while (zonelist_zone_idx(z) > highest_zoneidx ||
> (z->zone && !zref_in_nodemask(z, nodes)))
> z++;
>
> *zone = zonelist_zone(z);
> return z;
> }
>
> if we change nodemask between two calls of zref_in_nodemask(), such as
>
>   Task1                                           Task2
>   zref_in_nodemask(z = node0's z, nodes = 1-2)
>     -> returns 0
>                                                   nodes = 0
>   zref_in_nodemask(z = node1's z, nodes = 0)
>     -> returns 0
>   (and so on for every remaining zoneref)
>
> z will overflow.
And maybe we can fix this by taking a copy into a local.
> When the kernel allocator accesses task->mempolicy, there is the same problem.
And maybe we can fix those in a similar way.
But it's all too much, and we'll just break it again in the future. So
yup, I guess locking is needed.
* Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily
2010-03-31 10:34 ` David Rientjes
@ 2010-04-01 2:16 ` Miao Xie
0 siblings, 0 replies; 14+ messages in thread
From: Miao Xie @ 2010-04-01 2:16 UTC (permalink / raw)
To: David Rientjes
Cc: Nick Piggin, Lee Schermerhorn, Paul Menage, Linux-Kernel,
Linux-MM, Li Zefan, Andrew Morton
on 2010-3-31 18:34, David Rientjes wrote:
> On Wed, 31 Mar 2010, Miao Xie wrote:
>
>> diff --git a/mm/mmzone.c b/mm/mmzone.c
>> index f5b7d17..43ac21b 100644
>> --- a/mm/mmzone.c
>> +++ b/mm/mmzone.c
>> @@ -58,6 +58,7 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
>> nodemask_t *nodes,
>> struct zone **zone)
>> {
>> + nodemask_t tmp_nodes;
>> /*
>> * Find the next suitable zone to use for the allocation.
>> * Only filter based on nodemask if it's set
>> @@ -65,10 +66,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
>> if (likely(nodes == NULL))
>> while (zonelist_zone_idx(z) > highest_zoneidx)
>> z++;
>> - else
>> - while (zonelist_zone_idx(z) > highest_zoneidx ||
>> - (z->zone && !zref_in_nodemask(z, nodes)))
>> - z++;
>> + else {
>> + tmp_nodes = *nodes;
>> + if (nodes_empty(tmp_nodes))
>> + while (zonelist_zone_idx(z) > highest_zoneidx)
>> + z++;
>> + else
>> + while (zonelist_zone_idx(z) > highest_zoneidx ||
>> + (z->zone && !zref_in_nodemask(z, &tmp_nodes)))
>> + z++;
>> + }
>>
>> *zone = zonelist_zone(z);
>> return z;
>
> Unfortunately, you can't allocate a nodemask_t on the stack here because
> this is used in the iteration for get_page_from_freelist() which can occur
> very deep in the stack already and there's a probability of overflow.
> Dynamically allocating a nodemask_t simply wouldn't scale here, either,
> since it would allocate on every iteration of a zonelist.
>
Maybe it is better to fix this problem by checking the function's return
value, because it returns a NULL zone if it sees an empty nodemask.
The new patch is attached below.
---
kernel/cpuset.c | 11 +++++++++--
mm/mempolicy.c | 28 +++++++++++++++++++++++++++-
2 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d109467..fbb1f1c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -952,8 +952,6 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, &tsk->mems_allowed);
mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
}
@@ -2442,6 +2440,15 @@ int cpuset_mem_spread_node(void)
node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
if (node == MAX_NUMNODES)
node = first_node(current->mems_allowed);
+
+ /*
+ * If node is still MAX_NUMNODES, the kernel allocator saw an empty
+ * nodemask. In that case, fall back to allocating memory on the
+ * current node.
+ */
+ if (unlikely(node == MAX_NUMNODES))
+ node = numa_node_id();
+
current->cpuset_mem_spread_rotor = node;
return node;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8034abd..0905b84 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1449,8 +1449,16 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
* the first node in the mask instead.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
- unlikely(!node_isset(nd, policy->v.nodes)))
+ unlikely(!node_isset(nd, policy->v.nodes))) {
nd = first_node(policy->v.nodes);
+ /*
+ * When rebinding task->mempolicy, the kernel allocator
+ * may see an empty nodemask, and first_node() returns
+ * MAX_NUMNODES. In that case, we use the current node.
+ */
+ if (unlikely(nd == MAX_NUMNODES))
+ nd = numa_node_id();
+ }
break;
case MPOL_INTERLEAVE: /* should not happen */
break;
@@ -1510,6 +1518,14 @@ unsigned slab_node(struct mempolicy *policy)
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
+ /*
+ * When rebinding task->mempolicy, the kernel allocator
+ * may see an empty nodemask, and first_zones_zonelist()
+ * returns a NULL zone. In that case, we use the current
+ * node.
+ */
+ if (unlikely(zone == NULL))
+ return numa_node_id();
return zone->node;
}
@@ -1529,10 +1545,18 @@ static unsigned offset_il_node(struct mempolicy *pol,
if (!nnodes)
return numa_node_id();
+
target = (unsigned int)off % nnodes;
c = 0;
do {
nid = next_node(nid, pol->v.nodes);
+ /*
+ * When rebinding task->mempolicy, the kernel allocator
+ * may see an empty nodemask, and next_node() returns
+ * MAX_NUMNODES. In that case, we use the current node.
+ */
+ if (unlikely(nid == MAX_NUMNODES))
+ return numa_node_id();
c++;
} while (c <= target);
return nid;
@@ -1631,7 +1655,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
+ task_lock(current);
*mask = mempolicy->v.nodes;
+ task_unlock(current);
break;
default:
--
1.6.5.2
Thread overview: 14+ messages
2010-03-08 10:10 [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily Miao Xie
2010-03-08 21:46 ` David Rientjes
2010-03-09 7:25 ` Miao Xie
2010-03-11 8:15 ` Nick Piggin
2010-03-11 10:33 ` Miao Xie
2010-03-11 11:03 ` Nick Piggin
2010-03-25 10:23 ` Miao Xie
2010-03-25 12:56 ` Miao Xie
2010-03-25 13:33 ` [PATCH] [PATCH -mmotm] cpuset,mm: use seqlock to protect task->mempolicy and mems_allowed (v2) (was: Re: [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily) Miao Xie
2010-03-28 5:30 ` Bob Liu
2010-03-31 19:42 ` Andrew Morton
2010-03-31 9:54 ` [PATCH V2 4/4] cpuset,mm: update task's mems_allowed lazily Miao Xie
2010-03-31 10:34 ` David Rientjes
2010-04-01 2:16 ` Miao Xie