[PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
@ 2010-03-03 10:52 Miao Xie
  2010-03-03 23:50 ` Andrew Morton
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Miao Xie @ 2010-03-03 10:52 UTC (permalink / raw)
  To: David Rientjes, Lee Schermerhorn, Nick Piggin, Paul Menage
  Cc: Linux-Kernel, Linux-MM

if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
use a rwlock to protect them to fix this probelm.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 include/linux/cpuset.h    |  104 +++++++++++++++++++++++++++++-
 include/linux/init_task.h |    8 +++
 include/linux/mempolicy.h |   24 ++++++--
 include/linux/sched.h     |   17 ++++-
 kernel/cpuset.c           |  113 +++++++++++++++++++++++++++------
 kernel/exit.c             |    4 +
 kernel/fork.c             |   13 ++++-
 mm/hugetlb.c              |    3 +
 mm/mempolicy.c            |  153 ++++++++++++++++++++++++++++++++++----------
 mm/slab.c                 |   27 +++++++-
 mm/slub.c                 |   10 +++
 11 files changed, 403 insertions(+), 73 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a5740fc..b7a9ab0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -53,8 +53,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 	return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
 }
 
-extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-					  const struct task_struct *tsk2);
+extern int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+					  struct task_struct *tsk2);
 
 #define cpuset_memory_pressure_bump() 				\
 	do {							\
@@ -90,9 +90,92 @@ extern void rebuild_sched_domains(void);
 
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
+# if MAX_NUMNODES > BITS_PER_LONG
+/*
+ * Be used to protect task->mempolicy and mems_allowed when reading them for
+ * page allocation.
+ *
+ * we don't care that the kernel page allocator allocate a page on a node in
+ * the old mems_allowed, which isn't a big deal, especially since it was
+ * previously allowed.
+ *
+ * We just worry whether the kernel page allocator gets an empty mems_allowed
+ * or not. But
+ *   if MAX_NUMNODES <= BITS_PER_LONG, loading/storing task->mems_allowed are
+ *   atomic operations. So we needn't do anything to protect the loading of
+ *   task->mems_allowed.
+ *
+ *   if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed are
+ *   not atomic operations. So we use a rwlock to protect the loading of
+ *   task->mems_allowed.
+ */
+#define read_mem_lock_irqsave(p, flags)				\
+	do {							\
+		read_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define read_mem_unlock_irqrestore(p, flags)			\
+	do {							\
+		read_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+
+/* Used to protect task->mempolicy and mems_allowed when user get mempolciy */
+#define read_mempolicy_lock_irqsave(p, flags)			\
+	do {							\
+		read_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)		\
+	do {							\
+		read_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define write_mem_lock_irqsave(p, flags)			\
+	do {							\
+		write_lock_irqsave(&p->mem_rwlock, flags);	\
+	} while (0)
+
+#define write_mem_unlock_irqrestore(p, flags)			\
+	do {							\
+		write_unlock_irqrestore(&p->mem_rwlock, flags);	\
+	} while (0)
+# else
+#define read_mem_lock_irqsave(p, flags)		do { (void)(flags); } while (0)
+
+#define read_mem_unlock_irqrestore(p, flags)	do { (void)(flags); } while (0)
+
+/* Be used to protect task->mempolicy and mems_allowed when user reads them */
+#define read_mempolicy_lock_irqsave(p, flags)			\
+	do {							\
+		task_lock(p);					\
+		(void)(flags);					\
+	} while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)		\
+	do {							\
+		task_unlock(p);					\
+		(void)(flags);					\
+	} while (0)
+
+#define write_mem_lock_irqsave(p, flags)			\
+	do {							\
+		task_lock(p);					\
+		(void)(flags);					\
+	} while (0)
+
+#define write_mem_unlock_irqrestore(p, flags)			\
+	do {							\
+		task_unlock(p);					\
+		(void)(flags);					\
+	} while (0)
+# endif
+
 static inline void set_mems_allowed(nodemask_t nodemask)
 {
+	unsigned long flags;
+	write_mem_lock_irqsave(current, flags);
 	current->mems_allowed = nodemask;
+	write_mem_unlock_irqrestore(current, flags);
 }
 
 #else /* !CONFIG_CPUSETS */
@@ -144,8 +227,8 @@ static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 	return 1;
 }
 
-static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-						 const struct task_struct *tsk2)
+static inline int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+						 struct task_struct *tsk2)
 {
 	return 1;
 }
@@ -193,6 +276,19 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 {
 }
 
+#define read_mem_lock_irqsave(p, flags)		do { (void)(flags); } while (0)
+
+#define read_mem_unlock_irqrestore(p, flags)	do { (void)(flags); } while (0)
+
+#define read_mempolicy_lock_irqsave(p, flags)	do { (void)(flags); } while (0)
+
+#define read_mempolicy_unlock_irqrestore(p, flags)	\
+	do { (void)(flags); } while (0)
+
+#define write_mem_lock_irqsave(p, flags)	do { (void)(flags); } while (0)
+
+#define write_mem_unlock_irqrestore(p, flags)	do { (void)(flags); } while (0)
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index abec69b..1c1e3bf 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -111,6 +111,13 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)
 #endif
 
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+# define INIT_MEM_RWLOCK(tsk)						\
+	.mem_rwlock	= __RW_LOCK_UNLOCKED(tsk.mem_rwlock),
+#else
+# define INIT_MEM_RWLOCK(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +187,7 @@ extern struct cred init_cred;
 	INIT_FTRACE_GRAPH						\
 	INIT_TRACE_RECURSION						\
 	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_MEM_RWLOCK(tsk)						\
 }
 
 
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 1cc966c..aae93bc 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -51,6 +51,7 @@ enum {
  */
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
+#define MPOL_F_TASK    (1 << 2)	/* identify tasks' policies */
 
 #ifdef __KERNEL__
 
@@ -107,6 +108,12 @@ struct mempolicy {
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
  */
 
+extern struct mempolicy *__mpol_alloc(void);
+static inline struct mempolicy *mpol_alloc(void)
+{
+	return __mpol_alloc();
+}
+
 extern void __mpol_put(struct mempolicy *pol);
 static inline void mpol_put(struct mempolicy *pol)
 {
@@ -125,7 +132,7 @@ static inline int mpol_needs_cond_ref(struct mempolicy *pol)
 
 static inline void mpol_cond_put(struct mempolicy *pol)
 {
-	if (mpol_needs_cond_ref(pol))
+	if (mpol_needs_cond_ref(pol) || (pol && (pol->flags & MPOL_F_TASK)))
 		__mpol_put(pol);
 }
 
@@ -193,8 +200,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
-extern void mpol_rebind_task(struct task_struct *tsk,
-					const nodemask_t *new);
+extern int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+				struct mempolicy *newpol);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
@@ -249,6 +256,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	return 1;
 }
 
+static inline struct mempolicy *mpol_alloc(void)
+{
+	return NULL;
+}
+
 static inline void mpol_put(struct mempolicy *p)
 {
 }
@@ -307,9 +319,11 @@ static inline void numa_default_policy(void)
 {
 }
 
-static inline void mpol_rebind_task(struct task_struct *tsk,
-					const nodemask_t *new)
+static inline int mpol_rebind_task(struct task_struct *tsk,
+					const nodemask_t *new,
+					struct mempolicy *newpol)
 {
+	return 0;
 }
 
 static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4b1753f..8401e7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1403,8 +1403,9 @@ struct task_struct {
 /* Thread group tracking */
    	u32 parent_exec_id;
    	u32 self_exec_id;
-/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
- * mempolicy */
+/* Protection of (de-)allocation: mm, files, fs, tty, keyrings.
+ * if MAX_NUMNODES <= BITS_PER_LONG,it will protect mems_allowed and mempolicy.
+ * Or we use other rwlock - mem_rwlock to protect them. */
 	spinlock_t alloc_lock;
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -1472,7 +1473,13 @@ struct task_struct {
 	cputime_t acct_timexpd;	/* stime + utime since last update */
 #endif
 #ifdef CONFIG_CPUSETS
-	nodemask_t mems_allowed;	/* Protected by alloc_lock */
+# if MAX_NUMNODES > BITS_PER_LONG
+	/* Protection of mems_allowed, and mempolicy */
+	rwlock_t mem_rwlock;
+# endif
+	/* if MAX_NUMNODES <= BITS_PER_LONG, Protected by alloc_lock;
+	 * else Protected by mem_rwlock */
+	nodemask_t mems_allowed;
 	int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CGROUPS
@@ -1495,7 +1502,9 @@ struct task_struct {
 	struct list_head perf_event_list;
 #endif
 #ifdef CONFIG_NUMA
-	struct mempolicy *mempolicy;	/* Protected by alloc_lock */
+	/* if MAX_NUMNODES <= BITS_PER_LONG, Protected by alloc_lock;
+	 * else Protected by mem_rwlock */
+	struct mempolicy *mempolicy;
 	short il_next;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c6edd06..7575e79 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -198,12 +198,13 @@ static struct cpuset top_cpuset = {
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_mutex, then it has read-only
- * access to cpusets.
+ * If a task is only holding callback_mutex or cgroup_mutext, then it has
+ * read-only access to cpusets.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
- * by other task, we use alloc_lock in the task_struct fields to protect
- * them.
+ * by other task, we use alloc_lock(if MAX_NUMNODES <= BITS_PER_LONG) or
+ * mem_rwlock(if MAX_NUMNODES > BITS_PER_LONG) in the task_struct fields
+ * to protect them.
  *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
@@ -920,6 +921,10 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  *    call to guarantee_online_mems(), as we know no one is changing
  *    our task's cpuset.
  *
+ *    As the above comment said, no one can change current task's mems_allowed
+ *    except itself. so we needn't hold lock to protect task's mems_allowed
+ *    during this call.
+ *
  *    While the mm_struct we are migrating is typically from some
  *    other task, the task_struct mems_allowed that we are hacking
  *    is for our current task, which must allocate new pages for that
@@ -961,15 +966,19 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  * we structure updates as setting all new allowed nodes, then clearing newly
  * disallowed ones.
  *
- * Called with task's alloc_lock held
+ * Called with write_mem_lock held
  */
-static void cpuset_change_task_nodemask(struct task_struct *tsk,
-					nodemask_t *newmems)
+static int cpuset_change_task_nodemask(struct task_struct *tsk,
+					nodemask_t *newmems,
+					struct mempolicy *newpol)
 {
+	int retval;
+
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, &tsk->mems_allowed);
-	mpol_rebind_task(tsk, newmems);
+	retval = mpol_rebind_task(tsk, newmems, newpol);
 	tsk->mems_allowed = *newmems;
+
+	return retval;
 }
 
 /*
@@ -984,17 +993,31 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
+	unsigned long flags;
+	struct mempolicy *newpol = NULL;
+	int retval;
 	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
 
 	if (!newmems)
 		return;
 
+#if MAX_NUMNODES > BITS_PER_LONG
+	newpol = mpol_alloc();
+	if (newpol == NULL) {
+		NODEMASK_FREE(newmems);
+		return;
+	}
+#endif
+
 	cs = cgroup_cs(scan->cg);
 	guarantee_online_mems(cs, newmems);
 
-	task_lock(p);
-	cpuset_change_task_nodemask(p, newmems);
-	task_unlock(p);
+	write_mem_lock_irqsave(p, flags);
+	retval = cpuset_change_task_nodemask(p, newmems, newpol);
+	write_mem_unlock_irqrestore(p, flags);
+
+	if (retval)
+		mpol_put(newpol);
 
 	NODEMASK_FREE(newmems);
 
@@ -1389,6 +1412,8 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 			       struct cpuset *cs)
 {
+	struct mempolicy *newpol = NULL;
+	unsigned long flags;
 	int err;
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
@@ -1397,9 +1422,19 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	task_lock(tsk);
-	cpuset_change_task_nodemask(tsk, to);
-	task_unlock(tsk);
+#if MAX_NUMNODES > BITS_PER_LONG
+	newpol = mpol_alloc();
+	if (newpol == NULL)
+		return;
+#endif
+
+	write_mem_lock_irqsave(tsk, flags);
+	err = cpuset_change_task_nodemask(tsk, to, newpol);
+	write_mem_unlock_irqrestore(tsk, flags);
+
+	if (err)
+		mpol_put(newpol);
+
 	cpuset_update_task_spread_flag(cs, tsk);
 
 }
@@ -2242,7 +2277,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  */
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
-	return nodes_intersects(*nodemask, current->mems_allowed);
+	unsigned long flags;
+	int retval;
+
+	read_mem_lock_irqsave(current, flags);
+	retval = nodes_intersects(*nodemask, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	return retval;
 }
 
 /*
@@ -2323,11 +2365,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
+	unsigned long flags;
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
-	if (node_isset(node, current->mems_allowed))
+
+	read_mem_lock_irqsave(current, flags);
+	allowed = node_isset(node, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	if (allowed)
 		return 1;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -2378,9 +2426,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
  */
 int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
+	int allowed;
+	unsigned long flags;
+
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	if (node_isset(node, current->mems_allowed))
+
+	read_mem_lock_irqsave(current, flags);
+	allowed = node_isset(node, current->mems_allowed);
+	read_mem_unlock_irqrestore(current, flags);
+
+	if (allowed)
 		return 1;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
@@ -2447,11 +2503,14 @@ void cpuset_unlock(void)
 int cpuset_mem_spread_node(void)
 {
 	int node;
+	unsigned long flags;
 
+	read_mem_lock_irqsave(current, flags);
 	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
 	if (node == MAX_NUMNODES)
 		node = first_node(current->mems_allowed);
 	current->cpuset_mem_spread_rotor = node;
+	read_mem_unlock_irqrestore(current, flags);
 	return node;
 }
 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
@@ -2467,10 +2526,19 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
  * to the other.
  **/
 
-int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-				   const struct task_struct *tsk2)
+int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
+				   struct task_struct *tsk2)
 {
-	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+	unsigned long flags1, flags2;
+	int retval;
+
+	read_mem_lock_irqsave(tsk1, flags1);
+	read_mem_lock_irqsave(tsk2, flags2);
+	retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
+	read_mem_unlock_irqrestore(tsk2, flags2);
+	read_mem_unlock_irqrestore(tsk1, flags1);
+
+	return retval;
 }
 
 /**
@@ -2483,14 +2551,17 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  */
 void cpuset_print_task_mems_allowed(struct task_struct *tsk)
 {
+	unsigned long flags;
 	struct dentry *dentry;
 
 	dentry = task_cs(tsk)->css.cgroup->dentry;
 	spin_lock(&cpuset_buffer_lock);
 	snprintf(cpuset_name, CPUSET_NAME_LEN,
 		 dentry ? (const char *)dentry->d_name.name : "/");
+	read_mem_lock_irqsave(tsk, flags);
 	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
 			   tsk->mems_allowed);
+	read_mem_unlock_irqrestore(tsk, flags);
 	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
 	       tsk->comm, cpuset_name, cpuset_nodelist);
 	spin_unlock(&cpuset_buffer_lock);
diff --git a/kernel/exit.c b/kernel/exit.c
index 45ed043..28162dd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -16,6 +16,7 @@
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
@@ -901,6 +902,7 @@ NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
+	unsigned long flags;
 
 	profile_task_exit(tsk);
 
@@ -1001,8 +1003,10 @@ NORET_TYPE void do_exit(long code)
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
+	write_mem_lock_irqsave(tsk, flags);
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
+	write_mem_unlock_irqrestore(tsk, flags);
 #endif
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 17bbf09..7ed253d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/capability.h>
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
+#include <linux/cpuset.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swap.h>
@@ -986,6 +987,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
+	struct mempolicy *pol;
+	unsigned long flags;
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1091,8 +1094,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	cgroup_fork(p);
+#if defined(CONFIG_CPUSETS) && MAX_NUMNODES > BITS_PER_LONG
+	rwlock_init(&p->mem_rwlock);
+#endif
 #ifdef CONFIG_NUMA
-	p->mempolicy = mpol_dup(p->mempolicy);
+	read_mem_lock_irqsave(current, flags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, flags);
+	p->mempolicy = mpol_dup(pol);
+	mpol_put(pol);
  	if (IS_ERR(p->mempolicy)) {
  		retval = PTR_ERR(p->mempolicy);
  		p->mempolicy = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..523cf46 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1836,9 +1836,12 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 {
 	int node;
 	unsigned int nr = 0;
+	unsigned long flags;
 
+	read_mem_lock_irqsave(current, flags);
 	for_each_node_mask(node, cpuset_current_mems_allowed)
 		nr += array[node];
+	read_mem_unlock_irqrestore(current, flags);
 
 	return nr;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 290fb5b..324dfc3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -190,8 +190,9 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
  * parameter with respect to the policy mode and flags.  But, we need to
  * handle an empty nodemask with MPOL_PREFERRED here.
  *
- * Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy.  May also be called holding the mmap_semaphore for write.
+ * Must be called using write_mem_lock_irqsave()/write_mem_unlock_irqrestore()
+ * to protect task's mems_allowed and mempolicy.  May also be called holding
+ * the mmap_semaphore for write.
  */
 static int mpol_set_nodemask(struct mempolicy *pol,
 		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -270,6 +271,16 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	return policy;
 }
 
+struct mempolicy *__mpol_alloc(void)
+{
+	struct mempolicy *pol;
+
+	pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+	if (pol)
+		atomic_set(&pol->refcnt, 1);
+	return pol;
+}
+
 /* Slow path of a mpol destructor. */
 void __mpol_put(struct mempolicy *p)
 {
@@ -347,12 +358,30 @@ static void mpol_rebind_policy(struct mempolicy *pol,
  * Wrapper for mpol_rebind_policy() that just requires task
  * pointer, and updates task mempolicy.
  *
- * Called with task's alloc_lock held.
+ * if task->pol==NULL, it will return -1, and tell us it is unnecessary to
+ * rebind task's mempolicy.
+ *
+ * Using write_mem_lock_irqsave()/write_mem_unlock_irqrestore() to protect it.
  */
-
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+						struct mempolicy *newpol)
 {
+#if MAX_NUMNODES > BITS_PER_LONG
+	struct mempolicy *pol = tsk->mempolicy;
+
+	if (!pol)
+		return -1;
+
+	*newpol = *pol;
+	atomic_set(&newpol->refcnt, 1);
+
+	mpol_rebind_policy(newpol, new);
+	tsk->mempolicy = newpol;
+	mpol_put(pol);
+#else
 	mpol_rebind_policy(tsk->mempolicy, new);
+#endif
+	return 0;
 }
 
 /*
@@ -621,12 +650,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	struct mempolicy *new, *old;
 	struct mm_struct *mm = current->mm;
 	NODEMASK_SCRATCH(scratch);
+	unsigned long irqflags;
 	int ret;
 
 	if (!scratch)
 		return -ENOMEM;
 
-	new = mpol_new(mode, flags, nodes);
+	new = mpol_new(mode, flags | MPOL_F_TASK, nodes);
 	if (IS_ERR(new)) {
 		ret = PTR_ERR(new);
 		goto out;
@@ -639,10 +669,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	 */
 	if (mm)
 		down_write(&mm->mmap_sem);
-	task_lock(current);
+	write_mem_lock_irqsave(current, irqflags);
 	ret = mpol_set_nodemask(new, nodes, scratch);
 	if (ret) {
-		task_unlock(current);
+		write_mem_unlock_irqrestore(current, irqflags);
 		if (mm)
 			up_write(&mm->mmap_sem);
 		mpol_put(new);
@@ -654,7 +684,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	if (new && new->mode == MPOL_INTERLEAVE &&
 	    nodes_weight(new->v.nodes))
 		current->il_next = first_node(new->v.nodes);
-	task_unlock(current);
+	write_mem_unlock_irqrestore(current, irqflags);
 	if (mm)
 		up_write(&mm->mmap_sem);
 
@@ -668,7 +698,9 @@ out:
 /*
  * Return nodemask for policy for get_mempolicy() query
  *
- * Called with task's alloc_lock held
+ * Must be called using read_mempolicy_lock_irqsave()/
+ * read_mempolicy_unlock_irqrestore() to
+ * protect it.
  */
 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
@@ -712,7 +744,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol = NULL;
+	unsigned long irqflags;
 
 	if (flags &
 		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -722,9 +755,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 			return -EINVAL;
 		*policy = 0;	/* just so it's initialized */
-		task_lock(current);
+
+		read_mempolicy_lock_irqsave(current, irqflags);
 		*nmask  = cpuset_current_mems_allowed;
-		task_unlock(current);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
 		return 0;
 	}
 
@@ -747,6 +781,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	} else if (addr)
 		return -EINVAL;
 
+	if (!pol) {
+		read_mempolicy_lock_irqsave(current, irqflags);
+		pol = current->mempolicy;
+		mpol_get(pol);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
+	}
+
 	if (!pol)
 		pol = &default_policy;	/* indicates default behavior */
 
@@ -756,9 +797,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			if (err < 0)
 				goto out;
 			*policy = err;
-		} else if (pol == current->mempolicy &&
+		} else if (pol->flags & MPOL_F_TASK &&
 				pol->mode == MPOL_INTERLEAVE) {
+			read_mempolicy_lock_irqsave(current, irqflags);
 			*policy = current->il_next;
+			read_mempolicy_unlock_irqrestore(current, irqflags);
 		} else {
 			err = -EINVAL;
 			goto out;
@@ -780,9 +823,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	err = 0;
 	if (nmask) {
-		task_lock(current);
+		/* Maybe task->mempolicy was updated by cpuset, so we must get
+		 * a new one. */
+		mpol_cond_put(pol);
+		read_mempolicy_lock_irqsave(current, irqflags);
+		pol = current->mempolicy;
+		if (pol)
+			mpol_get(pol);
+		else
+			pol = &default_policy;
 		get_policy_nodemask(pol, nmask);
-		task_unlock(current);
+		read_mempolicy_unlock_irqrestore(current, irqflags);
 	}
 
  out:
@@ -981,6 +1032,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
+	unsigned long irqflags;
 	LIST_HEAD(pagelist);
 
 	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
@@ -1028,9 +1080,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
 			down_write(&mm->mmap_sem);
-			task_lock(current);
+			write_mem_lock_irqsave(current, irqflags);
 			err = mpol_set_nodemask(new, nmask, scratch);
-			task_unlock(current);
+			write_mem_unlock_irqrestore(current, irqflags);
 			if (err)
 				up_write(&mm->mmap_sem);
 		} else
@@ -1370,7 +1422,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 static struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct mempolicy *pol = task->mempolicy;
+	struct mempolicy *pol = NULL;
+	unsigned long irqflags;
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1381,8 +1434,16 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 		} else if (vma->vm_policy)
 			pol = vma->vm_policy;
 	}
+	if (!pol) {
+		read_mem_lock_irqsave(task, irqflags);
+		pol = task->mempolicy;
+		mpol_get(pol);
+		read_mem_unlock_irqrestore(task, irqflags);
+	}
+
 	if (!pol)
 		pol = &default_policy;
+
 	return pol;
 }
 
@@ -1584,11 +1645,15 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 {
 	struct mempolicy *mempolicy;
 	int nid;
+	unsigned long irqflags;
 
 	if (!(mask && current->mempolicy))
 		return false;
 
+	read_mempolicy_lock_irqsave(current, irqflags);
 	mempolicy = current->mempolicy;
+	mpol_get(mempolicy);
+
 	switch (mempolicy->mode) {
 	case MPOL_PREFERRED:
 		if (mempolicy->flags & MPOL_F_LOCAL)
@@ -1608,6 +1673,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 		BUG();
 	}
 
+	read_mempolicy_unlock_irqrestore(current, irqflags);
+	mpol_cond_put(mempolicy);
+
 	return true;
 }
 #endif
@@ -1654,6 +1722,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
+	struct page *page;
 
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
@@ -1667,15 +1736,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted shared policy
 		 */
-		struct page *page =  __alloc_pages_nodemask(gfp, 0,
-						zl, policy_nodemask(gfp, pol));
+		page =  __alloc_pages_nodemask(gfp, 0, zl,
+					policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		return page;
 	}
 	/*
 	 * fast path:  default or task policy
 	 */
-	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+	mpol_cond_put(pol);
+	return page;
 }
 
 /**
@@ -1692,26 +1763,36 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  *	Allocate a page from the kernel page pool.  When not in
  *	interrupt context and apply the current process NUMA policy.
  *	Returns NULL when no page can be allocated.
- *
- *	Don't call cpuset_update_task_memory_state() unless
- *	1) it's ok to take cpuset_sem (can WAIT), and
- *	2) allocating for current task (not interrupt).
  */
 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
-	struct mempolicy *pol = current->mempolicy;
+	struct mempolicy *pol;
+	struct page *page;
+	unsigned long irqflags;
+
+	read_mem_lock_irqsave(current, irqflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, irqflags);
 
-	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
+	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) {
+		mpol_put(pol);
 		pol = &default_policy;
+	}
 
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
 	 */
 	if (pol->mode == MPOL_INTERLEAVE)
-		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+	else
+		page =  __alloc_pages_nodemask(gfp, order,
+					policy_zonelist(gfp, pol),
+					policy_nodemask(gfp, pol));
+
+	mpol_cond_put(pol);
+	return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1961,6 +2042,7 @@ restart:
  */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
+	unsigned long irqflags;
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
@@ -1981,9 +2063,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			return;		/* no valid nodemask intersection */
 		}
 
-		task_lock(current);
+		write_mem_lock_irqsave(current, irqflags);
 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
-		task_unlock(current);
+		write_mem_unlock_irqrestore(current, irqflags);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
 		if (ret) {
 			NODEMASK_SCRATCH_FREE(scratch);
@@ -2134,6 +2216,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
 	int i;
+	unsigned long irqflags;
 	int err = 1;
 
 	if (nodelist) {
@@ -2215,9 +2298,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
-			task_lock(current);
+			write_mem_lock_irqsave(current, irqflags);
 			ret = mpol_set_nodemask(new, &nodes, scratch);
-			task_unlock(current);
+			write_mem_unlock_irqrestore(current, irqflags);
 		} else
 			ret = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
diff --git a/mm/slab.c b/mm/slab.c
index 7451bda..2df5185 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3145,14 +3145,25 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	int nid_alloc, nid_here;
+	struct mempolicy *pol;
+	unsigned long lflags;
 
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
+
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
 	nid_alloc = nid_here = numa_node_id();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_mem_spread_node();
-	else if (current->mempolicy)
-		nid_alloc = slab_node(current->mempolicy);
+	else if (pol)
+		nid_alloc = slab_node(pol);
+
+	mpol_put(pol);
+
 	if (nid_alloc != nid_here)
 		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
@@ -3175,11 +3186,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
 	int nid;
+	struct mempolicy *pol;
+	unsigned long lflags;
 
 	if (flags & __GFP_THISNODE)
 		return NULL;
 
-	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
+	zonelist = node_zonelist(slab_node(pol), flags);
+
+	mpol_put(pol);
+
 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
 retry:
diff --git a/mm/slub.c b/mm/slub.c
index 8d71aaf..cb533d4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1357,6 +1357,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	struct page *page;
+	struct mempolicy *pol;
+	unsigned long lflags
 
 	/*
 	 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1380,7 +1382,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
 			get_cycles() % 1024 > s->remote_node_defrag_ratio)
 		return NULL;
 
+	read_mem_lock_irqsave(current, lflags);
+	pol = current->mempolicy;
+	mpol_get(pol);
+	read_mem_unlock_irqrestore(current, lflags);
+
 	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
+	mpol_put(pol);
+
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 		struct kmem_cache_node *n;
 
-- 
1.6.5.2


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
@ 2010-03-03 23:50 ` Andrew Morton
  2010-03-04  9:03   ` Miao Xie
  2010-03-04  3:30 ` Nick Piggin
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 16+ messages in thread
From: Andrew Morton @ 2010-03-03 23:50 UTC (permalink / raw)
  To: miaox
  Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Paul Menage,
	Linux-Kernel, Linux-MM

On Wed, 03 Mar 2010 18:52:39 +0800
Miao Xie <miaox@cn.fujitsu.com> wrote:

> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> use a rwlock to protect them to fix this probelm.

Boy, that is one big ugly patch.  Is there no other way of doing this?

>
> ...
>
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -51,6 +51,7 @@ enum {
>   */
>  #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
>  #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
> +#define MPOL_F_TASK    (1 << 2)	/* identify tasks' policies */

What's this?  It wasn't mentioned in the changelog - I suspect it
should have been?

>
> ...
>
> +int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
> +				   struct task_struct *tsk2)
>  {
> -	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
> +	unsigned long flags1, flags2;
> +	int retval;
> +
> +	read_mem_lock_irqsave(tsk1, flags1);
> +	read_mem_lock_irqsave(tsk2, flags2);
> +	retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
> +	read_mem_unlock_irqrestore(tsk2, flags2);
> +	read_mem_unlock_irqrestore(tsk1, flags1);

I suspect this is deadlockable in sufficiently arcane circumstances:
one task takes the locks in a,b order, another task takes them in b,a
order and a third task gets in at the right time and does a
write_lock().  Probably that's not possible for some reason, dunno.  The usual
way of solving this is to always take the locks in
sorted-by-ascending-virtual-address order.


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
  2010-03-03 23:50 ` Andrew Morton
@ 2010-03-04  3:30 ` Nick Piggin
  2010-03-04  9:36   ` Miao Xie
  2010-03-04 14:58   ` Peter Zijlstra
  2010-03-04  4:53 ` Nick Piggin
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 16+ messages in thread
From: Nick Piggin @ 2010-03-04  3:30 UTC (permalink / raw)
  To: Miao Xie
  Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
	Linux-MM

On Wed, Mar 03, 2010 at 06:52:39PM +0800, Miao Xie wrote:
> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> use a rwlock to protect them to fix this probelm.

Thanks for working on this. However, rwlocks are pretty nasty to use
when you have short critical sections and hot read-side (they're twice
as heavy as even spinlocks in that case).

It's being used in the page allocator path, so I would say rwlocks are
almost a showstopper. Wouldn't it be possible to use a seqlock for this?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
  2010-03-03 23:50 ` Andrew Morton
  2010-03-04  3:30 ` Nick Piggin
@ 2010-03-04  4:53 ` Nick Piggin
  2010-03-04 14:31 ` Lee Schermerhorn
  2010-03-05 12:03 ` Paul Menage
  4 siblings, 0 replies; 16+ messages in thread
From: Nick Piggin @ 2010-03-04  4:53 UTC (permalink / raw)
  To: Miao Xie
  Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
	Linux-MM

On Wed, Mar 03, 2010 at 06:52:39PM +0800, Miao Xie wrote:
> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> use a rwlock to protect them to fix this probelm.

Oh, and something else I'm also concerned about:

If  MAX_NUMNODES <= BITS_PER_LONG then these locks are a noop.

> +#define read_mem_lock_irqsave(p, flags)		do { (void)(flags); } while (0)
> +
> +#define read_mem_unlock_irqrestore(p, flags)	do { (void)(flags); } while (0)
> +
> +/* Be used to protect task->mempolicy and mems_allowed when user reads them */

However you are appearing to use them for more than just atomically
loading of the nodemasks.

> @@ -2447,11 +2503,14 @@ void cpuset_unlock(void)
>  int cpuset_mem_spread_node(void)
>  {
>  	int node;
> +	unsigned long flags;
>  
> +	read_mem_lock_irqsave(current, flags);
>  	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
>  	if (node == MAX_NUMNODES)
>  		node = first_node(current->mems_allowed);
>  	current->cpuset_mem_spread_rotor = node;
> +	read_mem_unlock_irqrestore(current, flags);
>  	return node;
>  }
>  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);

If you are worried about doing this kind of atomic RMW on the mask, then
you cannot make the lock a noop. So if you're nooping the lock in this
way then you really need to cuddle it neatly around loading of the mask.

Once you do that, it would be trivial to use a seqlock.

...

> @@ -1381,8 +1434,16 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
>  		} else if (vma->vm_policy)
>  			pol = vma->vm_policy;
>  	}
> +	if (!pol) {
> +		read_mem_lock_irqsave(task, irqflags);
> +		pol = task->mempolicy;
> +		mpol_get(pol);
> +		read_mem_unlock_irqrestore(task, irqflags);
> +	}
> +
>  	if (!pol)
>  		pol = &default_policy;
> +
>  	return pol;
>  }

And a couple of others. It looks like you're using it here to guarantee
existence of the mempolicy.... Did you mean read_mempolicy_lock? Or do
you have another problem (there seems to be several cases of this).

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 23:50 ` Andrew Morton
@ 2010-03-04  9:03   ` Miao Xie
  0 siblings, 0 replies; 16+ messages in thread
From: Miao Xie @ 2010-03-04  9:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Paul Menage,
	Linux-Kernel, Linux-MM

on 2010-3-4 7:50, Andrew Morton wrote:
> On Wed, 03 Mar 2010 18:52:39 +0800
> Miao Xie <miaox@cn.fujitsu.com> wrote:
> 
>> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
>> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
>> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
>> use a rwlock to protect them to fix this probelm.
> 
> Boy, that is one big ugly patch.  Is there no other way of doing this?

Let me consider!

> 
>>
>> ...
>>
>> --- a/include/linux/mempolicy.h
>> +++ b/include/linux/mempolicy.h
>> @@ -51,6 +51,7 @@ enum {
>>   */
>>  #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
>>  #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
>> +#define MPOL_F_TASK    (1 << 2)	/* identify tasks' policies */
> 
> What's this?  It wasn't mentioned in the changelog - I suspect it
> should have been?

I hope task->mempolicy has the same get/put operation just like shared mempolicy,
this new feature is used when the kernel memory allocater accesses
task->mempolicy.

I'll rewrite the changelog in the next version of the patch if I still
use this flag.

>> +int cpuset_mems_allowed_intersects(struct task_struct *tsk1,
>> +				   struct task_struct *tsk2)
>>  {
>> -	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
>> +	unsigned long flags1, flags2;
>> +	int retval;
>> +
>> +	read_mem_lock_irqsave(tsk1, flags1);
>> +	read_mem_lock_irqsave(tsk2, flags2);
>> +	retval = nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
>> +	read_mem_unlock_irqrestore(tsk2, flags2);
>> +	read_mem_unlock_irqrestore(tsk1, flags1);
> 
> I suspect this is deadlockable in sufficiently arcane circumstances:
> one task takes the locks in a,b order, another task takes them in b,a
> order and a third task gets in at the right time and does a
> write_lock().  Probably that's not possible for some reason, dunno.  The usual
> way of solving this is to always take the locks in
> sorted-by-ascending-virtual-address order.

Don't worry about this problem, because rwlock is read_preference lock.

But your advice is very good, I'll change it in the next version of the patch.

Thanks!

> 
> 
> 
> 
> 


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-04  3:30 ` Nick Piggin
@ 2010-03-04  9:36   ` Miao Xie
  2010-03-04 14:58   ` Peter Zijlstra
  1 sibling, 0 replies; 16+ messages in thread
From: Miao Xie @ 2010-03-04  9:36 UTC (permalink / raw)
  To: Nick Piggin
  Cc: David Rientjes, Lee Schermerhorn, Paul Menage, Linux-Kernel,
	Linux-MM

on 2010-3-4 11:30, Nick Piggin wrote:
> On Wed, Mar 03, 2010 at 06:52:39PM +0800, Miao Xie wrote:
>> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
>> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
>> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
>> use a rwlock to protect them to fix this probelm.
> 
> Thanks for working on this. However, rwlocks are pretty nasty to use
> when you have short critical sections and hot read-side (they're twice
> as heavy as even spinlocks in that case).
> 
> It's being used in the page allocator path, so I would say rwlocks are
> almost a showstopper. Wouldn't it be possible to use a seqlock for this?
> 

I will do my best to try to do it.

Thanks!
Miao

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
                   ` (2 preceding siblings ...)
  2010-03-04  4:53 ` Nick Piggin
@ 2010-03-04 14:31 ` Lee Schermerhorn
  2010-03-05 13:05   ` Lee Schermerhorn
  2010-03-05 12:03 ` Paul Menage
  4 siblings, 1 reply; 16+ messages in thread
From: Lee Schermerhorn @ 2010-03-04 14:31 UTC (permalink / raw)
  To: miaox
  Cc: David Rientjes, Nick Piggin, Paul Menage, Linux-Kernel, Linux-MM,
	Eric Whitney

On Wed, 2010-03-03 at 18:52 +0800, Miao Xie wrote:
> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> use a rwlock to protect them to fix this probelm.
> 
> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> ---
>  include/linux/cpuset.h    |  104 +++++++++++++++++++++++++++++-
>  include/linux/init_task.h |    8 +++
>  include/linux/mempolicy.h |   24 ++++++--
>  include/linux/sched.h     |   17 ++++-
>  kernel/cpuset.c           |  113 +++++++++++++++++++++++++++------
>  kernel/exit.c             |    4 +
>  kernel/fork.c             |   13 ++++-
>  mm/hugetlb.c              |    3 +
>  mm/mempolicy.c            |  153 ++++++++++++++++++++++++++++++++++----------
>  mm/slab.c                 |   27 +++++++-
>  mm/slub.c                 |   10 +++
>  11 files changed, 403 insertions(+), 73 deletions(-)
> 
<snip>
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 1cc966c..aae93bc 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -51,6 +51,7 @@ enum {
>   */
>  #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
>  #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
> +#define MPOL_F_TASK    (1 << 2)	/* identify tasks' policies */
>  
>  #ifdef __KERNEL__
>  
> @@ -107,6 +108,12 @@ struct mempolicy {
>   * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
>   */
>  
> +extern struct mempolicy *__mpol_alloc(void);
> +static inline struct mempolicy *mpol_alloc(void)
> +{
> +	return __mpol_alloc();
> +}
> +
>  extern void __mpol_put(struct mempolicy *pol);
>  static inline void mpol_put(struct mempolicy *pol)
>  {
> @@ -125,7 +132,7 @@ static inline int mpol_needs_cond_ref(struct mempolicy *pol)
>  
>  static inline void mpol_cond_put(struct mempolicy *pol)
>  {
> -	if (mpol_needs_cond_ref(pol))
> +	if (mpol_needs_cond_ref(pol) || (pol && (pol->flags & MPOL_F_TASK)))
>  		__mpol_put(pol);
>  }
>  
> @@ -193,8 +200,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
>  
>  extern void numa_default_policy(void);
>  extern void numa_policy_init(void);
> -extern void mpol_rebind_task(struct task_struct *tsk,
> -					const nodemask_t *new);
> +extern int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> +				struct mempolicy *newpol);
>  extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
>  extern void mpol_fix_fork_child_flag(struct task_struct *p);
>  
> @@ -249,6 +256,11 @@ static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b)
>  	return 1;
>  }
>  
> +static inline struct mempolicy *mpol_alloc(void)
> +{
> +	return NULL;
> +}
> +
>  static inline void mpol_put(struct mempolicy *p)
>  {
>  }
> @@ -307,9 +319,11 @@ static inline void numa_default_policy(void)
>  {
>  }
>  
> -static inline void mpol_rebind_task(struct task_struct *tsk,
> -					const nodemask_t *new)
> +static inline int mpol_rebind_task(struct task_struct *tsk,
> +					const nodemask_t *new,
> +					struct mempolicy *newpol)
>  {
> +	return 0;
>  }
>  
>  static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
<snip>
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 290fb5b..324dfc3 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -190,8 +190,9 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
>   * parameter with respect to the policy mode and flags.  But, we need to
>   * handle an empty nodemask with MPOL_PREFERRED here.
>   *
> - * Must be called holding task's alloc_lock to protect task's mems_allowed
> - * and mempolicy.  May also be called holding the mmap_semaphore for write.
> + * Must be called using write_mem_lock_irqsave()/write_mem_unlock_irqrestore()
> + * to protect task's mems_allowed and mempolicy.  May also be called holding
> + * the mmap_semaphore for write.
>   */
>  static int mpol_set_nodemask(struct mempolicy *pol,
>  		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
> @@ -270,6 +271,16 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
>  	return policy;
>  }
>  
> +struct mempolicy *__mpol_alloc(void)
> +{
> +	struct mempolicy *pol;
> +
> +	pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
> +	if (pol)
> +		atomic_set(&pol->refcnt, 1);
> +	return pol;
> +}
> +
>  /* Slow path of a mpol destructor. */
>  void __mpol_put(struct mempolicy *p)
>  {
> @@ -347,12 +358,30 @@ static void mpol_rebind_policy(struct mempolicy *pol,
>   * Wrapper for mpol_rebind_policy() that just requires task
>   * pointer, and updates task mempolicy.
>   *
> - * Called with task's alloc_lock held.
> + * if task->pol==NULL, it will return -1, and tell us it is unnecessary to
> + * rebind task's mempolicy.
> + *
> + * Using write_mem_lock_irqsave()/write_mem_unlock_irqrestore() to protect it.
>   */
> -
> -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
> +int mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> +						struct mempolicy *newpol)
>  {
> +#if MAX_NUMNODES > BITS_PER_LONG
> +	struct mempolicy *pol = tsk->mempolicy;
> +
> +	if (!pol)
> +		return -1;
> +
> +	*newpol = *pol;
> +	atomic_set(&newpol->refcnt, 1);
> +
> +	mpol_rebind_policy(newpol, new);
> +	tsk->mempolicy = newpol;
> +	mpol_put(pol);
> +#else
>  	mpol_rebind_policy(tsk->mempolicy, new);
> +#endif
> +	return 0;
>  }
>  
>  /*
> @@ -621,12 +650,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  	struct mempolicy *new, *old;
>  	struct mm_struct *mm = current->mm;
>  	NODEMASK_SCRATCH(scratch);
> +	unsigned long irqflags;
>  	int ret;
>  
>  	if (!scratch)
>  		return -ENOMEM;
>  
> -	new = mpol_new(mode, flags, nodes);
> +	new = mpol_new(mode, flags | MPOL_F_TASK, nodes);
>  	if (IS_ERR(new)) {
>  		ret = PTR_ERR(new);
>  		goto out;
> @@ -639,10 +669,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  	 */
>  	if (mm)
>  		down_write(&mm->mmap_sem);
> -	task_lock(current);
> +	write_mem_lock_irqsave(current, irqflags);
>  	ret = mpol_set_nodemask(new, nodes, scratch);
>  	if (ret) {
> -		task_unlock(current);
> +		write_mem_unlock_irqrestore(current, irqflags);
>  		if (mm)
>  			up_write(&mm->mmap_sem);
>  		mpol_put(new);
> @@ -654,7 +684,7 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
>  	if (new && new->mode == MPOL_INTERLEAVE &&
>  	    nodes_weight(new->v.nodes))
>  		current->il_next = first_node(new->v.nodes);
> -	task_unlock(current);
> +	write_mem_unlock_irqrestore(current, irqflags);
>  	if (mm)
>  		up_write(&mm->mmap_sem);
>  
> @@ -668,7 +698,9 @@ out:
>  /*
>   * Return nodemask for policy for get_mempolicy() query
>   *
> - * Called with task's alloc_lock held
> + * Must be called using read_mempolicy_lock_irqsave()/
> + * read_mempolicy_unlock_irqrestore() to
> + * protect it.
>   */
>  static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
>  {
> @@ -712,7 +744,8 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  	int err;
>  	struct mm_struct *mm = current->mm;
>  	struct vm_area_struct *vma = NULL;
> -	struct mempolicy *pol = current->mempolicy;
> +	struct mempolicy *pol = NULL;
> +	unsigned long irqflags;
>  
>  	if (flags &
>  		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
> @@ -722,9 +755,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
>  			return -EINVAL;
>  		*policy = 0;	/* just so it's initialized */
> -		task_lock(current);
> +
> +		read_mempolicy_lock_irqsave(current, irqflags);
>  		*nmask  = cpuset_current_mems_allowed;
> -		task_unlock(current);
> +		read_mempolicy_unlock_irqrestore(current, irqflags);
>  		return 0;
>  	}
>  
> @@ -747,6 +781,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  	} else if (addr)
>  		return -EINVAL;
>  
> +	if (!pol) {
> +		read_mempolicy_lock_irqsave(current, irqflags);
> +		pol = current->mempolicy;
> +		mpol_get(pol);
> +		read_mempolicy_unlock_irqrestore(current, irqflags);
> +	}
> +
>  	if (!pol)
>  		pol = &default_policy;	/* indicates default behavior */
>  
> @@ -756,9 +797,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  			if (err < 0)
>  				goto out;
>  			*policy = err;
> -		} else if (pol == current->mempolicy &&
> +		} else if (pol->flags & MPOL_F_TASK &&
>  				pol->mode == MPOL_INTERLEAVE) {
> +			read_mempolicy_lock_irqsave(current, irqflags);
>  			*policy = current->il_next;
> +			read_mempolicy_unlock_irqrestore(current, irqflags);
>  		} else {
>  			err = -EINVAL;
>  			goto out;
> @@ -780,9 +823,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
>  
>  	err = 0;
>  	if (nmask) {
> -		task_lock(current);
> +		/* Maybe task->mempolicy was updated by cpuset, so we must get
> +		 * a new one. */
> +		mpol_cond_put(pol);
> +		read_mempolicy_lock_irqsave(current, irqflags);
> +		pol = current->mempolicy;
> +		if (pol)
> +			mpol_get(pol);
> +		else
> +			pol = &default_policy;
>  		get_policy_nodemask(pol, nmask);
> -		task_unlock(current);
> +		read_mempolicy_unlock_irqrestore(current, irqflags);
>  	}
>  
>   out:
> @@ -981,6 +1032,7 @@ static long do_mbind(unsigned long start, unsigned long len,
>  	struct mempolicy *new;
>  	unsigned long end;
>  	int err;
> +	unsigned long irqflags;
>  	LIST_HEAD(pagelist);
>  
>  	if (flags & ~(unsigned long)(MPOL_MF_STRICT |
> @@ -1028,9 +1080,9 @@ static long do_mbind(unsigned long start, unsigned long len,
>  		NODEMASK_SCRATCH(scratch);
>  		if (scratch) {
>  			down_write(&mm->mmap_sem);
> -			task_lock(current);
> +			write_mem_lock_irqsave(current, irqflags);
>  			err = mpol_set_nodemask(new, nmask, scratch);
> -			task_unlock(current);
> +			write_mem_unlock_irqrestore(current, irqflags);
>  			if (err)
>  				up_write(&mm->mmap_sem);
>  		} else
> @@ -1370,7 +1422,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
>  static struct mempolicy *get_vma_policy(struct task_struct *task,
>  		struct vm_area_struct *vma, unsigned long addr)
>  {
> -	struct mempolicy *pol = task->mempolicy;
> +	struct mempolicy *pol = NULL;
> +	unsigned long irqflags;
>  
>  	if (vma) {
>  		if (vma->vm_ops && vma->vm_ops->get_policy) {
> @@ -1381,8 +1434,16 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
>  		} else if (vma->vm_policy)
>  			pol = vma->vm_policy;
>  	}
> +	if (!pol) {
> +		read_mem_lock_irqsave(task, irqflags);
> +		pol = task->mempolicy;
> +		mpol_get(pol);
> +		read_mem_unlock_irqrestore(task, irqflags);
> +	}
> +

Please note that this change is in the fast path of task page
allocations.  We tried real hard when reworking the mempolicy reference
counts not to reference count the task's mempolicy because only the task
could change its' own task mempolicy.  cpuset rebinding breaks this
assumption, of course.

I'll run some page fault overhead tests on this series to see whether
the effect of the additional lock round trip and reference count is
measurable and unacceptable.  If so, you might consider forcing the task
to update it's own task memory policy on return to user space using the
TIF_NOTIFY_RESUME handler.  That is, allocated and construct the new
mempolicy and queue it to the task to be updated by do_notify_resume().
set_notify_resume() will kick the process so that it enters and exits
the kernel, servicing the pending thread flag.

Yeah, there might be a window where allocations will use the old policy
before the kick_process() takes effect, but any such allocations could
have completed before the policy update anyway, so it shouldn't be an
issue as long as the allocation uses one policy or the other and not a
mixture of the two, right?

Regards,
Lee
>  	if (!pol)
>  		pol = &default_policy;
> +
>  	return pol;
>  }
>  
> @@ -1584,11 +1645,15 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  {
>  	struct mempolicy *mempolicy;
>  	int nid;
> +	unsigned long irqflags;
>  
>  	if (!(mask && current->mempolicy))
>  		return false;
>  
> +	read_mempolicy_lock_irqsave(current, irqflags);
>  	mempolicy = current->mempolicy;
> +	mpol_get(mempolicy);
> +
>  	switch (mempolicy->mode) {
>  	case MPOL_PREFERRED:
>  		if (mempolicy->flags & MPOL_F_LOCAL)
> @@ -1608,6 +1673,9 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
>  		BUG();
>  	}
>  
> +	read_mempolicy_unlock_irqrestore(current, irqflags);
> +	mpol_cond_put(mempolicy);
> +
>  	return true;
>  }
>  #endif
> @@ -1654,6 +1722,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>  {
>  	struct mempolicy *pol = get_vma_policy(current, vma, addr);
>  	struct zonelist *zl;
> +	struct page *page;
>  
>  	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
>  		unsigned nid;
> @@ -1667,15 +1736,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>  		/*
>  		 * slow path: ref counted shared policy
>  		 */
> -		struct page *page =  __alloc_pages_nodemask(gfp, 0,
> -						zl, policy_nodemask(gfp, pol));
> +		page =  __alloc_pages_nodemask(gfp, 0, zl,
> +					policy_nodemask(gfp, pol));
>  		__mpol_put(pol);
>  		return page;
>  	}
>  	/*
>  	 * fast path:  default or task policy
>  	 */
> -	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> +	page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
> +	mpol_cond_put(pol);
> +	return page;
>  }
>  
>  /**
> @@ -1692,26 +1763,36 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
>   *	Allocate a page from the kernel page pool.  When not in
>   *	interrupt context and apply the current process NUMA policy.
>   *	Returns NULL when no page can be allocated.
> - *
> - *	Don't call cpuset_update_task_memory_state() unless
> - *	1) it's ok to take cpuset_sem (can WAIT), and
> - *	2) allocating for current task (not interrupt).
>   */
>  struct page *alloc_pages_current(gfp_t gfp, unsigned order)
>  {
> -	struct mempolicy *pol = current->mempolicy;
> +	struct mempolicy *pol;
> +	struct page *page;
> +	unsigned long irqflags;
> +
> +	read_mem_lock_irqsave(current, irqflags);
> +	pol = current->mempolicy;
> +	mpol_get(pol);
> +	read_mem_unlock_irqrestore(current, irqflags);
>  
> -	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
> +	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) {
> +		mpol_put(pol);
>  		pol = &default_policy;
> +	}
>  
>  	/*
>  	 * No reference counting needed for current->mempolicy
>  	 * nor system default_policy
>  	 */
>  	if (pol->mode == MPOL_INTERLEAVE)
> -		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
> -	return __alloc_pages_nodemask(gfp, order,
> -			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
> +		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
> +	else
> +		page =  __alloc_pages_nodemask(gfp, order,
> +					policy_zonelist(gfp, pol),
> +					policy_nodemask(gfp, pol));
> +
> +	mpol_cond_put(pol);
> +	return page;
>  }
>  EXPORT_SYMBOL(alloc_pages_current);
>  
> @@ -1961,6 +2042,7 @@ restart:
>   */
>  void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
>  {
> +	unsigned long irqflags;
>  	int ret;
>  
>  	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
> @@ -1981,9 +2063,9 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
>  			return;		/* no valid nodemask intersection */
>  		}
>  
> -		task_lock(current);
> +		write_mem_lock_irqsave(current, irqflags);
>  		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
> -		task_unlock(current);
> +		write_mem_unlock_irqrestore(current, irqflags);
>  		mpol_put(mpol);	/* drop our ref on sb mpol */
>  		if (ret) {
>  			NODEMASK_SCRATCH_FREE(scratch);
> @@ -2134,6 +2216,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
>  	char *nodelist = strchr(str, ':');
>  	char *flags = strchr(str, '=');
>  	int i;
> +	unsigned long irqflags;
>  	int err = 1;
>  
>  	if (nodelist) {
> @@ -2215,9 +2298,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
>  		int ret;
>  		NODEMASK_SCRATCH(scratch);
>  		if (scratch) {
> -			task_lock(current);
> +			write_mem_lock_irqsave(current, irqflags);
>  			ret = mpol_set_nodemask(new, &nodes, scratch);
> -			task_unlock(current);
> +			write_mem_unlock_irqrestore(current, irqflags);
>  		} else
>  			ret = -ENOMEM;
>  		NODEMASK_SCRATCH_FREE(scratch);
> diff --git a/mm/slab.c b/mm/slab.c
> index 7451bda..2df5185 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -3145,14 +3145,25 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
>  static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
>  {
>  	int nid_alloc, nid_here;
> +	struct mempolicy *pol;
> +	unsigned long lflags;
>  
>  	if (in_interrupt() || (flags & __GFP_THISNODE))
>  		return NULL;
> +
> +	read_mem_lock_irqsave(current, lflags);
> +	pol = current->mempolicy;
> +	mpol_get(pol);
> +	read_mem_unlock_irqrestore(current, lflags);
> +
>  	nid_alloc = nid_here = numa_node_id();
>  	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
>  		nid_alloc = cpuset_mem_spread_node();
> -	else if (current->mempolicy)
> -		nid_alloc = slab_node(current->mempolicy);
> +	else if (pol)
> +		nid_alloc = slab_node(pol);
> +
> +	mpol_put(pol);
> +
>  	if (nid_alloc != nid_here)
>  		return ____cache_alloc_node(cachep, flags, nid_alloc);
>  	return NULL;
> @@ -3175,11 +3186,21 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
>  	enum zone_type high_zoneidx = gfp_zone(flags);
>  	void *obj = NULL;
>  	int nid;
> +	struct mempolicy *pol;
> +	unsigned long lflags;
>  
>  	if (flags & __GFP_THISNODE)
>  		return NULL;
>  
> -	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> +	read_mem_lock_irqsave(current, lflags);
> +	pol = current->mempolicy;
> +	mpol_get(pol);
> +	read_mem_unlock_irqrestore(current, lflags);
> +
> +	zonelist = node_zonelist(slab_node(pol), flags);
> +
> +	mpol_put(pol);
> +
>  	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
>  
>  retry:
> diff --git a/mm/slub.c b/mm/slub.c
> index 8d71aaf..cb533d4 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -1357,6 +1357,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
>  	struct zone *zone;
>  	enum zone_type high_zoneidx = gfp_zone(flags);
>  	struct page *page;
> +	struct mempolicy *pol;
> +	unsigned long lflags
>  
>  	/*
>  	 * The defrag ratio allows a configuration of the tradeoffs between
> @@ -1380,7 +1382,15 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
>  			get_cycles() % 1024 > s->remote_node_defrag_ratio)
>  		return NULL;
>  
> +	read_mem_lock_irqsave(current, lflags);
> +	pol = current->mempolicy;
> +	mpol_get(pol);
> +	read_mem_unlock_irqrestore(current, lflags);
> +
>  	zonelist = node_zonelist(slab_node(current->mempolicy), flags);
> +
> +	mpol_put(pol);
> +
>  	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
>  		struct kmem_cache_node *n;
>  

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-04  3:30 ` Nick Piggin
  2010-03-04  9:36   ` Miao Xie
@ 2010-03-04 14:58   ` Peter Zijlstra
  2010-03-04 16:34     ` Nick Piggin
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2010-03-04 14:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Miao Xie, David Rientjes, Lee Schermerhorn, Paul Menage,
	Linux-Kernel, Linux-MM, tglx

On Thu, 2010-03-04 at 14:30 +1100, Nick Piggin wrote:
> 
> Thanks for working on this. However, rwlocks are pretty nasty to use
> when you have short critical sections and hot read-side (they're twice
> as heavy as even spinlocks in that case). 

Should we add a checkpatch.pl warning for them? 

There really rarely is a good case for using rwlock_t, for as you say
they're a pain and often more expensive than a spinlock_t, and if
possible RCU has the best performance.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-04 14:58   ` Peter Zijlstra
@ 2010-03-04 16:34     ` Nick Piggin
  0 siblings, 0 replies; 16+ messages in thread
From: Nick Piggin @ 2010-03-04 16:34 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Miao Xie, David Rientjes, Lee Schermerhorn, Paul Menage,
	Linux-Kernel, Linux-MM, tglx

On Thu, Mar 04, 2010 at 03:58:24PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-03-04 at 14:30 +1100, Nick Piggin wrote:
> > 
> > Thanks for working on this. However, rwlocks are pretty nasty to use
> > when you have short critical sections and hot read-side (they're twice
> > as heavy as even spinlocks in that case). 
> 
> Should we add a checkpatch.pl warning for them? 

Yes I think it could be useful.

Most people agree rwlock is *almost* always the wrong thing to do. Or at
least, they can easily be used wrongly because they seem like a great
idea for read-mostly data.

> 
> There really rarely is a good case for using rwlock_t, for as you say
> they're a pain and often more expensive than a spinlock_t, and if
> possible RCU has the best performance.

Yep. Not to mention they starve writers (and don't FIFO like spinlocks).

Between normal spinlocks, RCU, percpu, and seqlocks, there's not much
room for rwlocks. Even tasklist lock should be RCUable if the effort is
put into it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
                   ` (3 preceding siblings ...)
  2010-03-04 14:31 ` Lee Schermerhorn
@ 2010-03-05 12:03 ` Paul Menage
  2010-03-07  2:33   ` Miao Xie
  4 siblings, 1 reply; 16+ messages in thread
From: Paul Menage @ 2010-03-05 12:03 UTC (permalink / raw)
  To: miaox; +Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Linux-Kernel,
	Linux-MM

On Wed, Mar 3, 2010 at 2:52 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> use a rwlock to protect them to fix this probelm.

Rather than adding locks, if the intention is just to avoid the
allocator seeing an empty nodemask couldn't we instead do the
equivalent of:

current->mems_allowed |= new_mask;
current->mems_allowed = new_mask;

i.e. effectively set all new bits in the nodemask first, and then
clear all old bits that are no longer in the new mask. The only
downside of this is that a page allocation that races with the update
could potentially allocate from any node in the union of the old and
new nodemasks - but that's the case anyway for an allocation that
races with an update, so I don't see that it's any worse.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-04 14:31 ` Lee Schermerhorn
@ 2010-03-05 13:05   ` Lee Schermerhorn
  0 siblings, 0 replies; 16+ messages in thread
From: Lee Schermerhorn @ 2010-03-05 13:05 UTC (permalink / raw)
  To: miaox
  Cc: David Rientjes, Nick Piggin, Paul Menage, Linux-Kernel, Linux-MM,
	Eric Whitney

On Thu, 2010-03-04 at 09:31 -0500, Lee Schermerhorn wrote: 
> On Wed, 2010-03-03 at 18:52 +0800, Miao Xie wrote:
> > if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
> > task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
> > mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
> > use a rwlock to protect them to fix this probelm.
> > 
> > Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
> > ---
> >  include/linux/cpuset.h    |  104 +++++++++++++++++++++++++++++-
> >  include/linux/init_task.h |    8 +++
> >  include/linux/mempolicy.h |   24 ++++++--
> >  include/linux/sched.h     |   17 ++++-
> >  kernel/cpuset.c           |  113 +++++++++++++++++++++++++++------
> >  kernel/exit.c             |    4 +
> >  kernel/fork.c             |   13 ++++-
> >  mm/hugetlb.c              |    3 +
> >  mm/mempolicy.c            |  153 ++++++++++++++++++++++++++++++++++----------
> >  mm/slab.c                 |   27 +++++++-
> >  mm/slub.c                 |   10 +++
> >  11 files changed, 403 insertions(+), 73 deletions(-)
> > 
> <snip>
> > 
<snip even more> 
> > @@ -1381,8 +1434,16 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
> >  		} else if (vma->vm_policy)
> >  			pol = vma->vm_policy;
> >  	}
> > +	if (!pol) {
> > +		read_mem_lock_irqsave(task, irqflags);
> > +		pol = task->mempolicy;
> > +		mpol_get(pol);
> > +		read_mem_unlock_irqrestore(task, irqflags);
> > +	}
> > +
> 
> Please note that this change is in the fast path of task page
> allocations.  We tried real hard when reworking the mempolicy reference
> counts not to reference count the task's mempolicy because only the task
> could change its' own task mempolicy.  cpuset rebinding breaks this
> assumption, of course.
> 
> I'll run some page fault overhead tests on this series to see whether
> the effect of the additional lock round trip and reference count is
> measurable and unacceptable.

Well, I wanted to run page fault overhead tests, but when I added this
series to the 3March mmotm [applied w/ offsets, no rejects], I got NULL
pointer derefs during boot, and then the remainder of the boot crept
along for a while with looooong pauses between chunks of console output.
Finally appeared to hang. 

Stack traces below.  

Config available on request.

Lee

-----------------------
Excerpt from console output:

Platform is 8 socket x 6 core AMD numa w/ 512GB memory.

...
Loading drivers, configuring devices: input: Power Button
as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
ACPI: Power Button [PWRF]
ipmi message handler version 39.2
hpilo 0000:00:04.2: PCI INT B -> GSI 41 (level, low) -> IRQ 41
pci_hotplug: PCI Hot Plug PCI Core version: 0.5
BUG: unable to handle kernel NULL pointer dereference at
0000000000000001
IP: [<ffffffff8106197e>] sysctl_check_table+0x277/0x35e
PGD 3049162067 PUD 3049056067 PMD 0 
input: PC Speaker as /devices/platform/pcspkr/input/input1
Oops: 0000 [#1] SMP 
last sysfs file: /sys/devices/LNXSYSTM:00/modalias
CPU 10 
Modules linked in: pcspkr cdrom(+) pci_hotplug hpilo ipmi_msghandler
i2c_core container button ohci_hcd uhci_hcd ehci_hcd usbcore edd ext3
mbcache jbd fan ide_pci_generic serverworks ide_core ata_generic
pata_serverworks libata cciss scsi_mod thermal processor thermal_sys
hwmon

Pid: 2429, comm: modprobe Not tainted
2.6.33-mmotm-100302-1838-mx-mempolicy #6 /ProLiant DL785 G6   
RIP: 0010:[<ffffffff8106197e>]  [<ffffffff8106197e>] sysctl_check_table
+0x277/0x35e
RSP: 0018:ffff8840484a3d58  EFLAGS: 00010246
RAX: 0000000000000002 RBX: ffffffffa0184340 RCX: ffff883049d090c0
RDX: ffffffff81530864 RSI: ffff88104895ca00 RDI: ffffffff816e4910
RBP: ffff8840484a3da8 R08: ffffffff810bb073 R09: 0000000000000000
R10: ffff883049d7eb40 R11: 0000000000000002 R12: 0000000000000001
R13: ffffffffa0184210 R14: 0000000000000002 R15: ffff88104895ca00
FS:  00007f8c4c7316f0(0000) GS:ffff882088240000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000001 CR3: 0000003049250000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process modprobe (pid: 2429, threadinfo ffff8840484a2000, task
ffff8840493aa140)
Stack:
0000000000000000 ffffffff816ab520 0000000200000000 ffff882050001c08
<0> 0000000000000000 ffffffffa01842c0 ffff883049d090c0 ffffffff8140aba0
<0> ffffffff816a4430 ffffffffa0184210 ffff8840484a3e08 ffffffff81061a0d
Call Trace:
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffffa017e004>] ? cdrom_dummy_generic_packet+0x4/0x3c [cdrom]
[<ffffffff8101c461>] ? do_ftrace_mod_code+0xb5/0x147
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffff81049537>] ? __register_sysctl_paths+0x4a/0x297
[<ffffffff810495ec>] __register_sysctl_paths+0xff/0x297
[<ffffffff81094a1b>] ? tracepoint_module_notify+0x2c/0x30
[<ffffffff812f05cf>] ? notifier_call_chain+0x38/0x60
[<ffffffffa00b5000>] ? cdrom_init+0x0/0x6a [cdrom]
EDAC MC: Ver: 2.1.0 Mar  4 2010
[<ffffffff810497b2>] register_sysctl_paths+0x2e/0x30
[<ffffffff810497cc>] register_sysctl_table+0x18/0x1a
[<ffffffffa00b5019>] cdrom_init+0x19/0x6a [cdrom]
[<ffffffff8100020e>] do_one_initcall+0x73/0x180
[<ffffffff810714d8>] sys_init_module+0xd5/0x22e
[<ffffffff81002b9b>] system_call_fastpath+0x16/0x1b
Code: 7c 24 18 00 74 21 49 8b 7d 00 48 85 ff 74 18 e8 3d fb 12 00 85 c0
75 0f 45 85 f6 74 2e 41 ff ce 4d 8b 64 24 18 eb b9 49 83 c4 40 <49> 8b
34 24 48 85 f6 75 c5 4c 89 fe 48 8b 7d b8 e8 b7 7e fe ff 
RIP  [<ffffffff8106197e>] sysctl_check_table+0x277/0x35e
RSP <ffff8840484a3d58>
CR2: 0000000000000001
QLogic/NetXen Network Driver v4.0.72
netxen_nic 0000:04:00.0: PCI INT A -> GSI 22 (level, low) -> IRQ 22
netxen_nic 0000:04:00.0: setting latency timer to 64
---[ end trace 42261946992ac8eb ]---

Then, another one:

...
netxen_nic: Dual XGb SFP+ LP Board S/N CM9BBK0915  Chip rev 0x42
netxen_nic 0000:04:00.0: firmware v4.0.406 [cut-through]
IPMI System Interface driver.
ipmi_si: Trying SMBIOS-specified kcs state machine at i/o address 0xca2,
slave address 0x20, irq 0
netxen_nic 0000:04:00.0: irq 72 for MSI/MSI-X
netxen_nic 0000:04:00.0: irq 73 for MSI/MSI-X
netxen_nic 0000:04:00.0: irq 74 for MSI/MSI-X
netxen_nic 0000:04:00.0: irq 75 for MSI/MSI-X
netxen_nic 0000:04:00.0: using msi-x interrupts
BUG: unable to handle kernel NULL pointer dereference at
0000000000000001
IP: [<ffffffff8106197e>] sysctl_check_table+0x277/0x35e
PGD 0 
Oops: 0000 [#2] SMP 
last sysfs
file: /sys/devices/pci0000:00/0000:00:06.1/host0/target0:0:0/0:0:0:0/type
CPU 0 
Modules linked in: ipmi_si(+) shpchp(+) rtc_cmos hid i2c_piix4 tpm_bios
amd64_edac_mod sg rtc_core rtc_lib serio_raw netxen_nic(+) edac_core
pcspkr cdrom(+) pci_hotplug hpilo ipmi_msghandler i2c_core container
button ohci_hcd uhci_hcd ehci_hcd usbcore edd ext3 mbcache jbd fan
ide_pci_generic serverworks ide_core ata_generic pata_serverworks libata
cciss scsi_mod thermal processor thermal_sys hwmon

Pid: 3105, comm: work_for_cpu Tainted: G      D
2.6.33-mmotm-100302-1838-mx-mempolicy #6 /ProLiant DL785 G6   
RIP: 0010:[<ffffffff8106197e>]  [<ffffffff8106197e>] sysctl_check_table
+0x277/0x35e
RSP: 0018:ffff8870495ff9a0  EFLAGS: 00010246
RAX: 0000000000000004 RBX: ffff881048432808 RCX: ffff8810451eb9f0
RDX: ffffffff81530869 RSI: ffff88104895ca00 RDI: ffffffff816e4910
RBP: ffff8870495ff9f0 R08: 0000000000000083 R09: ffffffff81946190
R10: ffff880001a16178 R11: ffff8870495ffc70 R12: 0000000000000001
R13: ffff8810451eb858 R14: 0000000000000004 R15: ffff88104895ca00
FS:  00007f65e914e6f0(0000) GS:ffff880001a00000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000001 CR3: 0000000001693000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process work_for_cpu (pid: 3105, threadinfo ffff8870495fe000, task
ffff8870493b8680)
Stack:
ffff8870495ffac0 ffff8870495ffc00 0000000400000000 00000000000412d0
<0> 0000000000000000 ffff8810451eb9d8 ffff8810451eb800 ffff8870495ffc70
<0> ffffffff816e4910 ffff881048432808 ffff8870495ffa50 ffffffff81061a0d
Call Trace:
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffff81061a0d>] sysctl_check_table+0x306/0x35e
[<ffffffff81048591>] ? sysctl_set_parent+0x29/0x38
[<ffffffff810495ec>] __register_sysctl_paths+0xff/0x297
[<ffffffff812d6b4b>] register_net_sysctl_table+0x50/0x55
[<ffffffff8127207e>] neigh_sysctl_register+0x1e4/0x21d
[<ffffffff810474bf>] ? local_bh_enable_ip+0xc1/0xc6
[<ffffffff812b6d40>] devinet_sysctl_register+0x29/0x44
[<ffffffff812b6e76>] inetdev_init+0x11b/0x158
[<ffffffff812b6f13>] inetdev_event+0x60/0x3d6
[<ffffffff81225706>] ? device_add+0x46e/0x541
[<ffffffff812f05cf>] notifier_call_chain+0x38/0x60
[<ffffffff8105e338>] raw_notifier_call_chain+0x14/0x16
[<ffffffff8126e142>] register_netdevice+0x346/0x3c2
[<ffffffff8126e1fd>] register_netdev+0x3f/0x4d
[<ffffffffa01a7b0e>] netxen_nic_probe+0x8b9/0xac4 [netxen_nic]
[<ffffffff81056910>] ? do_work_for_cpu+0x0/0x2a
[<ffffffff811a54b5>] local_pci_probe+0x17/0x1b
ipmi: Found new BMC (man_id: 0x00000b,  prod_id: 0x0000, dev_id: 0x11)
IPMI kcs interface initialized
ipmi_si: Trying SPMI-specified kcs state machine at i/o address 0xca2,
slave address 0x0, irq 0
ipmi_si: duplicate interface
[<ffffffff81056928>] do_work_for_cpu+0x18/0x2a
[<ffffffff81056910>] ? do_work_for_cpu+0x0/0x2a
[<ffffffff81059d54>] kthread+0x82/0x8a
[<ffffffff81003994>] kernel_thread_helper+0x4/0x10
[<ffffffff812ed5e9>] ? restore_args+0x0/0x30
[<ffffffff81059cd2>] ? kthread+0x0/0x8a
[<ffffffff81003990>] ? kernel_thread_helper+0x0/0x10
Code: 7c 24 18 00 74 21 49 8b 7d 00 48 85 ff 74 18 e8 3d fb 12 00 85 c0
75 0f 45 85 f6 74 2e 41 ff ce 4d 8b 64 24 18 eb b9 49 83 c4 40 <49> 8b
34 24 48 85 f6 75 c5 4c 89 fe 48 8b 7d b8 e8 b7 7e fe ff 
RIP  [<ffffffff8106197e>] sysctl_check_table+0x277/0x35e
RSP <ffff8870495ff9a0>
CR2: 0000000000000001
---[ end trace 42261946992ac8ec ]---

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and  mems_allowed
  2010-03-05 12:03 ` Paul Menage
@ 2010-03-07  2:33   ` Miao Xie
  2010-03-09 19:42     ` Paul Menage
  0 siblings, 1 reply; 16+ messages in thread
From: Miao Xie @ 2010-03-07  2:33 UTC (permalink / raw)
  To: Paul Menage
  Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Linux-Kernel,
	Linux-MM

on 2010-3-5 20:03, Paul Menage wrote:
> On Wed, Mar 3, 2010 at 2:52 AM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>> if MAX_NUMNODES > BITS_PER_LONG, loading/storing task->mems_allowed or mems_allowed in
>> task->mempolicy are not atomic operations, and the kernel page allocator gets an empty
>> mems_allowed when updating task->mems_allowed or mems_allowed in task->mempolicy. So we
>> use a rwlock to protect them to fix this probelm.
> 
> Rather than adding locks, if the intention is just to avoid the
> allocator seeing an empty nodemask couldn't we instead do the
> equivalent of:
> 
> current->mems_allowed |= new_mask;
> current->mems_allowed = new_mask;
> 
> i.e. effectively set all new bits in the nodemask first, and then
> clear all old bits that are no longer in the new mask. The only
> downside of this is that a page allocation that races with the update
> could potentially allocate from any node in the union of the old and
> new nodemasks - but that's the case anyway for an allocation that
> races with an update, so I don't see that it's any worse.

Before applying this patch, cpuset updates task->mems_allowed just like
what you said. But the allocator is still likely to see an empty nodemask.
This problem have been pointed out by Nick Piggin.

The problem is following:
The size of nodemask_t is greater than the size of long integer, so loading
and storing of nodemask_t are not atomic operations. If task->mems_allowed
don't intersect with new_mask, such as the first word of the mask is empty
and only the first word of new_mask is not empty. When the allocator
loads a word of the mask before

	current->mems_allowed |= new_mask;

and then loads another word of the mask after

	current->mems_allowed = new_mask;

the allocator gets an empty nodemask.

I make a new patch to fix this problem now.
Considering the change of task->mems_allowed is not frequent, so in the new
patch, I use variables as a tag to indicate whether task->mems_allowed need
be update or not. And before setting the tag, cpuset caches the new mask of
every task at somewhere. 

When the allocator want to access task->mems_allowed, it must check updated-tag
first. If the tag is set, the allocator enters the slow path and updates
task->mems_allowed.

Thanks!
Miao

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed
  2010-03-07  2:33   ` Miao Xie
@ 2010-03-09 19:42     ` Paul Menage
  2010-03-11  5:04       ` Miao Xie
  0 siblings, 1 reply; 16+ messages in thread
From: Paul Menage @ 2010-03-09 19:42 UTC (permalink / raw)
  To: miaox; +Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Linux-Kernel,
	Linux-MM

On Sat, Mar 6, 2010 at 6:33 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>
> Before applying this patch, cpuset updates task->mems_allowed just like
> what you said. But the allocator is still likely to see an empty nodemask.
> This problem have been pointed out by Nick Piggin.
>
> The problem is following:
> The size of nodemask_t is greater than the size of long integer, so loading
> and storing of nodemask_t are not atomic operations. If task->mems_allowed
> don't intersect with new_mask, such as the first word of the mask is empty
> and only the first word of new_mask is not empty. When the allocator
> loads a word of the mask before
>
>        current->mems_allowed |= new_mask;
>
> and then loads another word of the mask after
>
>        current->mems_allowed = new_mask;
>
> the allocator gets an empty nodemask.

Couldn't that be solved by having the reader read the nodemask twice
and compare them? In the normal case there's no race, so the second
read is straight from L1 cache and is very cheap. In the unlikely case
of a race, the reader would keep trying until it got two consistent
values in a row.

Paul

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and  mems_allowed
  2010-03-09 19:42     ` Paul Menage
@ 2010-03-11  5:04       ` Miao Xie
  2010-03-11  5:30         ` Nick Piggin
  0 siblings, 1 reply; 16+ messages in thread
From: Miao Xie @ 2010-03-11  5:04 UTC (permalink / raw)
  To: Paul Menage
  Cc: David Rientjes, Lee Schermerhorn, Nick Piggin, Linux-Kernel,
	Linux-MM

on 2010-3-10 3:42, Paul Menage wrote:
> On Sat, Mar 6, 2010 at 6:33 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
>>
>> Before applying this patch, cpuset updates task->mems_allowed just like
>> what you said. But the allocator is still likely to see an empty nodemask.
>> This problem have been pointed out by Nick Piggin.
>>
>> The problem is following:
>> The size of nodemask_t is greater than the size of long integer, so loading
>> and storing of nodemask_t are not atomic operations. If task->mems_allowed
>> don't intersect with new_mask, such as the first word of the mask is empty
>> and only the first word of new_mask is not empty. When the allocator
>> loads a word of the mask before
>>
>>        current->mems_allowed |= new_mask;
>>
>> and then loads another word of the mask after
>>
>>        current->mems_allowed = new_mask;
>>
>> the allocator gets an empty nodemask.
> 
> Couldn't that be solved by having the reader read the nodemask twice
> and compare them? In the normal case there's no race, so the second
> read is straight from L1 cache and is very cheap. In the unlikely case
> of a race, the reader would keep trying until it got two consistent
> values in a row.

I think this method can't fix the problem because we can guarantee the second
read is after the update of mask completes.

Thanks!
Miao

> 
> Paul
> 
> 
> 


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and  mems_allowed
  2010-03-11  5:04       ` Miao Xie
@ 2010-03-11  5:30         ` Nick Piggin
  2010-03-11  7:57           ` Miao Xie
  0 siblings, 1 reply; 16+ messages in thread
From: Nick Piggin @ 2010-03-11  5:30 UTC (permalink / raw)
  To: Miao Xie
  Cc: Paul Menage, David Rientjes, Lee Schermerhorn, Linux-Kernel,
	Linux-MM

On Thu, Mar 11, 2010 at 01:04:33PM +0800, Miao Xie wrote:
> on 2010-3-10 3:42, Paul Menage wrote:
> > On Sat, Mar 6, 2010 at 6:33 PM, Miao Xie <miaox@cn.fujitsu.com> wrote:
> >>
> >> Before applying this patch, cpuset updates task->mems_allowed just like
> >> what you said. But the allocator is still likely to see an empty nodemask.
> >> This problem have been pointed out by Nick Piggin.
> >>
> >> The problem is following:
> >> The size of nodemask_t is greater than the size of long integer, so loading
> >> and storing of nodemask_t are not atomic operations. If task->mems_allowed
> >> don't intersect with new_mask, such as the first word of the mask is empty
> >> and only the first word of new_mask is not empty. When the allocator
> >> loads a word of the mask before
> >>
> >>        current->mems_allowed |= new_mask;
> >>
> >> and then loads another word of the mask after
> >>
> >>        current->mems_allowed = new_mask;
> >>
> >> the allocator gets an empty nodemask.
> > 
> > Couldn't that be solved by having the reader read the nodemask twice
> > and compare them? In the normal case there's no race, so the second
> > read is straight from L1 cache and is very cheap. In the unlikely case
> > of a race, the reader would keep trying until it got two consistent
> > values in a row.
> 
> I think this method can't fix the problem because we can guarantee the second
> read is after the update of mask completes.

Any problem with using a seqlock?

The other thing you could do is store a pointer to the nodemask, and
allocate a new nodemask when changing it, issue a smp_wmb(), and then
store the new pointer. Read side only needs a smp_read_barrier_depends()

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and  mems_allowed
  2010-03-11  5:30         ` Nick Piggin
@ 2010-03-11  7:57           ` Miao Xie
  0 siblings, 0 replies; 16+ messages in thread
From: Miao Xie @ 2010-03-11  7:57 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Paul Menage, David Rientjes, Lee Schermerhorn, Linux-Kernel,
	Linux-MM

on 2010-3-11 13:30, Nick Piggin wrote:
>>>> The problem is following:
>>>> The size of nodemask_t is greater than the size of long integer, so loading
>>>> and storing of nodemask_t are not atomic operations. If task->mems_allowed
>>>> don't intersect with new_mask, such as the first word of the mask is empty
>>>> and only the first word of new_mask is not empty. When the allocator
>>>> loads a word of the mask before
>>>>
>>>>        current->mems_allowed |= new_mask;
>>>>
>>>> and then loads another word of the mask after
>>>>
>>>>        current->mems_allowed = new_mask;
>>>>
>>>> the allocator gets an empty nodemask.
>>>
>>> Couldn't that be solved by having the reader read the nodemask twice
>>> and compare them? In the normal case there's no race, so the second
>>> read is straight from L1 cache and is very cheap. In the unlikely case
>>> of a race, the reader would keep trying until it got two consistent
>>> values in a row.
>>
>> I think this method can't fix the problem because we can guarantee the second
>> read is after the update of mask completes.
> 
> Any problem with using a seqlock?
> 
> The other thing you could do is store a pointer to the nodemask, and
> allocate a new nodemask when changing it, issue a smp_wmb(), and then
> store the new pointer. Read side only needs a smp_read_barrier_depends()

Comparing with my second version patch, I think both of these methods will cause worse
performance and the changing of code is more.

Thanks
Miao

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-03-11  7:57 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-03-03 10:52 [PATCH 4/4] cpuset,mm: use rwlock to protect task->mempolicy and mems_allowed Miao Xie
2010-03-03 23:50 ` Andrew Morton
2010-03-04  9:03   ` Miao Xie
2010-03-04  3:30 ` Nick Piggin
2010-03-04  9:36   ` Miao Xie
2010-03-04 14:58   ` Peter Zijlstra
2010-03-04 16:34     ` Nick Piggin
2010-03-04  4:53 ` Nick Piggin
2010-03-04 14:31 ` Lee Schermerhorn
2010-03-05 13:05   ` Lee Schermerhorn
2010-03-05 12:03 ` Paul Menage
2010-03-07  2:33   ` Miao Xie
2010-03-09 19:42     ` Paul Menage
2010-03-11  5:04       ` Miao Xie
2010-03-11  5:30         ` Nick Piggin
2010-03-11  7:57           ` Miao Xie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).