[PATCH 5.2 15/20] sched/fair: Use RCU accessors consistently for ->numa_group

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Jann Horn <jannh@google.com>,
	"Peter Zijlstra (Intel)" <peterz@infradead.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Petr Mladek <pmladek@suse.com>,
	Sergey Senozhatsky <sergey.senozhatsky@gmail.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Will Deacon <will@kernel.org>, Ingo Molnar <mingo@kernel.org>
Subject: [PATCH 5.2 15/20] sched/fair: Use RCU accessors consistently for ->numa_group
Date: Fri,  2 Aug 2019 11:40:09 +0200	[thread overview]
Message-ID: <20190802092102.928273691@linuxfoundation.org> (raw)
In-Reply-To: <20190802092055.131876977@linuxfoundation.org>

From: Jann Horn <jannh@google.com>

commit cb361d8cdef69990f6b4504dc1fd9a594d983c97 upstream.

The old code used RCU annotations and accessors inconsistently for
->numa_group, which can lead to use-after-frees and NULL dereferences.

Let all accesses to ->numa_group use proper RCU helpers to prevent such
issues.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Fixes: 8c8a743c5087 ("sched/numa: Use {cpu, pid} to create task groups for shared faults")
Link: https://lkml.kernel.org/r/20190716152047.14424-3-jannh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/linux/sched.h |   10 +++-
 kernel/sched/fair.c   |  120 +++++++++++++++++++++++++++++++++-----------------
 2 files changed, 90 insertions(+), 40 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1026,7 +1026,15 @@ struct task_struct {
 	u64				last_sum_exec_runtime;
 	struct callback_head		numa_work;
 
-	struct numa_group		*numa_group;
+	/*
+	 * This pointer is only modified for current in syscall and
+	 * pagefault context (and for tasks being destroyed), so it can be read
+	 * from any of the following contexts:
+	 *  - RCU read-side critical section
+	 *  - current->numa_group from everywhere
+	 *  - task's runqueue locked, task not running
+	 */
+	struct numa_group __rcu		*numa_group;
 
 	/*
 	 * numa_faults is an array split into four regions:
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1067,6 +1067,21 @@ struct numa_group {
 	unsigned long faults[0];
 };
 
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_check(p->numa_group, p == current ||
+		(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_protected(p->numa_group, p == current);
+}
+
 static inline unsigned long group_faults_priv(struct numa_group *ng);
 static inline unsigned long group_faults_shared(struct numa_group *ng);
 
@@ -1110,10 +1125,12 @@ static unsigned int task_scan_start(stru
 {
 	unsigned long smin = task_scan_min(p);
 	unsigned long period = smin;
+	struct numa_group *ng;
 
 	/* Scale the maximum scan period with the amount of shared memory. */
-	if (p->numa_group) {
-		struct numa_group *ng = p->numa_group;
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng) {
 		unsigned long shared = group_faults_shared(ng);
 		unsigned long private = group_faults_priv(ng);
 
@@ -1121,6 +1138,7 @@ static unsigned int task_scan_start(stru
 		period *= shared + 1;
 		period /= private + shared + 1;
 	}
+	rcu_read_unlock();
 
 	return max(smin, period);
 }
@@ -1129,13 +1147,14 @@ static unsigned int task_scan_max(struct
 {
 	unsigned long smin = task_scan_min(p);
 	unsigned long smax;
+	struct numa_group *ng;
 
 	/* Watch for min being lower than max due to floor calculations */
 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 
 	/* Scale the maximum scan period with the amount of shared memory. */
-	if (p->numa_group) {
-		struct numa_group *ng = p->numa_group;
+	ng = deref_curr_numa_group(p);
+	if (ng) {
 		unsigned long shared = group_faults_shared(ng);
 		unsigned long private = group_faults_priv(ng);
 		unsigned long period = smax;
@@ -1167,7 +1186,7 @@ void init_numa_balancing(unsigned long c
 	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
 	p->numa_work.next		= &p->numa_work;
 	p->numa_faults			= NULL;
-	p->numa_group			= NULL;
+	RCU_INIT_POINTER(p->numa_group, NULL);
 	p->last_task_numa_placement	= 0;
 	p->last_sum_exec_runtime	= 0;
 
@@ -1214,7 +1233,16 @@ static void account_numa_dequeue(struct
 
 pid_t task_numa_group_id(struct task_struct *p)
 {
-	return p->numa_group ? p->numa_group->gid : 0;
+	struct numa_group *ng;
+	pid_t gid = 0;
+
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng)
+		gid = ng->gid;
+	rcu_read_unlock();
+
+	return gid;
 }
 
 /*
@@ -1239,11 +1267,13 @@ static inline unsigned long task_faults(
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)
 {
-	if (!p->numa_group)
+	struct numa_group *ng = deref_task_numa_group(p);
+
+	if (!ng)
 		return 0;
 
-	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
-		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1381,12 +1411,13 @@ static inline unsigned long task_weight(
 static inline unsigned long group_weight(struct task_struct *p, int nid,
 					 int dist)
 {
+	struct numa_group *ng = deref_task_numa_group(p);
 	unsigned long faults, total_faults;
 
-	if (!p->numa_group)
+	if (!ng)
 		return 0;
 
-	total_faults = p->numa_group->total_faults;
+	total_faults = ng->total_faults;
 
 	if (!total_faults)
 		return 0;
@@ -1400,7 +1431,7 @@ static inline unsigned long group_weight
 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 				int src_nid, int dst_cpu)
 {
-	struct numa_group *ng = p->numa_group;
+	struct numa_group *ng = deref_curr_numa_group(p);
 	int dst_nid = cpu_to_node(dst_cpu);
 	int last_cpupid, this_cpupid;
 
@@ -1583,13 +1614,14 @@ static bool load_too_imbalanced(long src
 static void task_numa_compare(struct task_numa_env *env,
 			      long taskimp, long groupimp, bool maymove)
 {
+	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
+	long imp = p_ng ? groupimp : taskimp;
 	struct task_struct *cur;
 	long src_load, dst_load;
-	long load;
-	long imp = env->p->numa_group ? groupimp : taskimp;
-	long moveimp = imp;
 	int dist = env->dist;
+	long moveimp = imp;
+	long load;
 
 	if (READ_ONCE(dst_rq->numa_migrate_on))
 		return;
@@ -1628,21 +1660,22 @@ static void task_numa_compare(struct tas
 	 * If dst and source tasks are in the same NUMA group, or not
 	 * in any group then look only at task weights.
 	 */
-	if (cur->numa_group == env->p->numa_group) {
+	cur_ng = rcu_dereference(cur->numa_group);
+	if (cur_ng == p_ng) {
 		imp = taskimp + task_weight(cur, env->src_nid, dist) -
 		      task_weight(cur, env->dst_nid, dist);
 		/*
 		 * Add some hysteresis to prevent swapping the
 		 * tasks within a group over tiny differences.
 		 */
-		if (cur->numa_group)
+		if (cur_ng)
 			imp -= imp / 16;
 	} else {
 		/*
 		 * Compare the group weights. If a task is all by itself
 		 * (not part of a group), use the task weight instead.
 		 */
-		if (cur->numa_group && env->p->numa_group)
+		if (cur_ng && p_ng)
 			imp += group_weight(cur, env->src_nid, dist) -
 			       group_weight(cur, env->dst_nid, dist);
 		else
@@ -1740,11 +1773,12 @@ static int task_numa_migrate(struct task
 		.best_imp = 0,
 		.best_cpu = -1,
 	};
+	unsigned long taskweight, groupweight;
 	struct sched_domain *sd;
+	long taskimp, groupimp;
+	struct numa_group *ng;
 	struct rq *best_rq;
-	unsigned long taskweight, groupweight;
 	int nid, ret, dist;
-	long taskimp, groupimp;
 
 	/*
 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1790,7 +1824,8 @@ static int task_numa_migrate(struct task
 	 *   multiple NUMA nodes; in order to better consolidate the group,
 	 *   we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+	ng = deref_curr_numa_group(p);
+	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1823,7 +1858,7 @@ static int task_numa_migrate(struct task
 	 * A task that migrated to a second choice node will be better off
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
-	if (p->numa_group) {
+	if (ng) {
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
@@ -2118,6 +2153,7 @@ static void task_numa_placement(struct t
 	unsigned long total_faults;
 	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
+	struct numa_group *ng;
 
 	/*
 	 * The p->mm->numa_scan_seq field gets updated without
@@ -2135,8 +2171,9 @@ static void task_numa_placement(struct t
 	runtime = numa_get_avg_runtime(p, &period);
 
 	/* If the task is part of a group prevent parallel updates to group stats */
-	if (p->numa_group) {
-		group_lock = &p->numa_group->lock;
+	ng = deref_curr_numa_group(p);
+	if (ng) {
+		group_lock = &ng->lock;
 		spin_lock_irq(group_lock);
 	}
 
@@ -2177,7 +2214,7 @@ static void task_numa_placement(struct t
 			p->numa_faults[cpu_idx] += f_diff;
 			faults += p->numa_faults[mem_idx];
 			p->total_numa_faults += diff;
-			if (p->numa_group) {
+			if (ng) {
 				/*
 				 * safe because we can only change our own group
 				 *
@@ -2185,14 +2222,14 @@ static void task_numa_placement(struct t
 				 * nid and priv in a specific region because it
 				 * is at the beginning of the numa_faults array.
 				 */
-				p->numa_group->faults[mem_idx] += diff;
-				p->numa_group->faults_cpu[mem_idx] += f_diff;
-				p->numa_group->total_faults += diff;
-				group_faults += p->numa_group->faults[mem_idx];
+				ng->faults[mem_idx] += diff;
+				ng->faults_cpu[mem_idx] += f_diff;
+				ng->total_faults += diff;
+				group_faults += ng->faults[mem_idx];
 			}
 		}
 
-		if (!p->numa_group) {
+		if (!ng) {
 			if (faults > max_faults) {
 				max_faults = faults;
 				max_nid = nid;
@@ -2203,8 +2240,8 @@ static void task_numa_placement(struct t
 		}
 	}
 
-	if (p->numa_group) {
-		numa_group_count_active_nodes(p->numa_group);
+	if (ng) {
+		numa_group_count_active_nodes(ng);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_nid);
 	}
@@ -2238,7 +2275,7 @@ static void task_numa_group(struct task_
 	int cpu = cpupid_to_cpu(cpupid);
 	int i;
 
-	if (unlikely(!p->numa_group)) {
+	if (unlikely(!deref_curr_numa_group(p))) {
 		unsigned int size = sizeof(struct numa_group) +
 				    4*nr_node_ids*sizeof(unsigned long);
 
@@ -2274,7 +2311,7 @@ static void task_numa_group(struct task_
 	if (!grp)
 		goto no_join;
 
-	my_grp = p->numa_group;
+	my_grp = deref_curr_numa_group(p);
 	if (grp == my_grp)
 		goto no_join;
 
@@ -2345,7 +2382,8 @@ no_join:
  */
 void task_numa_free(struct task_struct *p, bool final)
 {
-	struct numa_group *grp = p->numa_group;
+	/* safe: p either is current or is being freed by current */
+	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
 	unsigned long *numa_faults = p->numa_faults;
 	unsigned long flags;
 	int i;
@@ -2425,7 +2463,7 @@ void task_numa_fault(int last_cpupid, in
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	ng = p->numa_group;
+	ng = deref_curr_numa_group(p);
 	if (!priv && !local && ng && ng->active_nodes > 1 &&
 				numa_is_active_node(cpu_node, ng) &&
 				numa_is_active_node(mem_node, ng))
@@ -10724,18 +10762,22 @@ void show_numa_stats(struct task_struct
 {
 	int node;
 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+	struct numa_group *ng;
 
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
 	for_each_online_node(node) {
 		if (p->numa_faults) {
 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
-		if (p->numa_group) {
-			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
-			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+		if (ng) {
+			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
 	}
+	rcu_read_unlock();
 }
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */

next prev parent reply	other threads:[~2019-08-02  9:58 UTC|newest]

Thread overview: 40+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-02  9:39 [PATCH 5.2 00/20] 5.2.6-stable review Greg Kroah-Hartman
2019-08-02  9:39 ` [PATCH 5.2 01/20] vsock: correct removal of socket from the list Greg Kroah-Hartman
2019-08-02  9:39 ` [PATCH 5.2 02/20] ISDN: hfcsusb: checking idx of ep configuration Greg Kroah-Hartman
2019-08-02  9:39 ` [PATCH 5.2 03/20] ALSA: usb-audio: Sanity checks for each pipe and EP types Greg Kroah-Hartman
2019-08-02 13:48   ` Sasha Levin
2019-08-02 15:51     ` Greg Kroah-Hartman
2019-08-07  2:38       ` Sasha Levin
2019-08-02  9:39 ` [PATCH 5.2 04/20] bpf: fix NULL deref in btf_type_is_resolve_source_only Greg Kroah-Hartman
2019-08-02  9:39 ` [PATCH 5.2 05/20] media: au0828: fix null dereference in error path Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 06/20] ath10k: Change the warning message string Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 07/20] media: cpia2_usb: first wake up, then free in disconnect Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 08/20] media: pvrusb2: use a different format for warnings Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 09/20] NFS: Cleanup if nfs_match_client is interrupted Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 10/20] media: radio-raremono: change devm_k*alloc to k*alloc Greg Kroah-Hartman
2019-08-02 10:04   ` Joe Perches
2019-08-06  5:34     ` Luke Nowakowski-Krijger
2019-08-02  9:40 ` [PATCH 5.2 11/20] xfrm: policy: fix bydst hlist corruption on hash rebuild Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 12/20] nvme: fix multipath crash when ANA is deactivated Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 13/20] Bluetooth: hci_uart: check for missing tty operations Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 14/20] sched/fair: Dont free p->numa_faults with concurrent readers Greg Kroah-Hartman
2019-08-02  9:40 ` Greg Kroah-Hartman [this message]
2019-08-02  9:40 ` [PATCH 5.2 16/20] /proc/<pid>/cmdline: remove all the special cases Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 17/20] /proc/<pid>/cmdline: add back the setproctitle() special case Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 18/20] drivers/pps/pps.c: clear offset flags in PPS_SETPARAMS ioctl Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 19/20] Fix allyesconfig output Greg Kroah-Hartman
2019-08-02  9:40 ` [PATCH 5.2 20/20] ceph: hold i_ceph_lock when removing caps for freeing inode Greg Kroah-Hartman
2019-08-02 17:21 ` [PATCH 5.2 00/20] 5.2.6-stable review Thierry Reding
2019-08-03  7:09   ` Greg Kroah-Hartman
2019-08-05 11:48     ` Thierry Reding
2019-08-09  8:52       ` Greg Kroah-Hartman
2019-08-09 13:04         ` Thierry Reding
2019-08-09 15:49           ` Greg Kroah-Hartman
2019-08-12  9:05             ` Thierry Reding
2021-01-04 12:39               ` Greg Kroah-Hartman
2019-08-02 23:25 ` shuah
2019-08-03  7:08   ` Greg Kroah-Hartman
2019-08-03  5:50 ` Naresh Kamboju
2019-08-03  7:11   ` Greg Kroah-Hartman
2019-08-03 16:00 ` Guenter Roeck
2019-08-04  7:15   ` Greg Kroah-Hartman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190802092102.928273691@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=jannh@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pmladek@suse.com \
    --cc=sergey.senozhatsky@gmail.com \
    --cc=stable@vger.kernel.org \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.