linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
@ 2015-10-17  1:28 Stephane Eranian
  2015-10-17  9:56 ` Peter Zijlstra
  2015-10-17 10:00 ` Peter Zijlstra
  0 siblings, 2 replies; 7+ messages in thread
From: Stephane Eranian @ 2015-10-17  1:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: peterz, mingo, ak, edumazet, acme


This patch eliminates all known RCU violations detected
by the RCU checker (PROVE_RCU). The impacted code paths
were all related to cgroup mode monitoring and involved
accessing a task's cgrp.

Patch is relative to tip.git at commit:
	f8e9941 Merge branch 'x86/urgent'

Signed-off-by: Stephane Eranian <eranian@google.com>
---
 kernel/events/core.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109..65c4ffa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -435,12 +435,16 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	if (!is_cgroup_event(event))
 		return;
 
+	rcu_read_lock();
+
 	cgrp = perf_cgroup_from_task(current);
 	/*
 	 * Do not update time when cgroup is not active
 	 */
 	if (cgrp == event->cgrp)
 		__update_cgrp_time(event->cgrp);
+
+	rcu_read_unlock();
 }
 
 static inline void
@@ -458,9 +462,11 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	if (!task || !ctx->nr_cgroups)
 		return;
 
+	rcu_read_lock();
 	cgrp = perf_cgroup_from_task(task);
 	info = this_cpu_ptr(cgrp->info);
 	info->timestamp = ctx->timestamp;
+	rcu_read_unlock();
 }
 
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
@@ -489,7 +495,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 * we reschedule only in the presence of cgroup
 	 * constrained events.
 	 */
-	rcu_read_lock();
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -531,8 +536,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 		}
 	}
 
-	rcu_read_unlock();
-
 	local_irq_restore(flags);
 }
 
@@ -542,6 +545,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	struct perf_cgroup *cgrp1;
 	struct perf_cgroup *cgrp2 = NULL;
 
+	rcu_read_lock();
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
@@ -561,6 +565,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	 */
 	if (cgrp1 != cgrp2)
 		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+
+	rcu_read_unlock();
 }
 
 static inline void perf_cgroup_sched_in(struct task_struct *prev,
@@ -569,6 +575,7 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	struct perf_cgroup *cgrp1;
 	struct perf_cgroup *cgrp2 = NULL;
 
+	rcu_read_lock();
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
@@ -577,6 +584,7 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	/* prev can never be NULL */
 	cgrp2 = perf_cgroup_from_task(prev);
 
+
 	/*
 	 * only need to schedule in cgroup events if we are changing
 	 * cgroup during ctxsw. Cgroup events were not scheduled
@@ -584,6 +592,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	 */
 	if (cgrp1 != cgrp2)
 		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+
+	rcu_read_unlock();
 }
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -2094,6 +2104,7 @@ static int  __perf_install_in_context(void *info)
 		cpuctx->task_ctx = task_ctx;
 		task = task_ctx->task;
 	}
+	rcu_read_lock();
 
 	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 
@@ -2112,6 +2123,8 @@ static int  __perf_install_in_context(void *info)
 	 */
 	perf_event_sched_in(cpuctx, task_ctx, task);
 
+	rcu_read_unlock();
+
 	perf_pmu_enable(cpuctx->ctx.pmu);
 	perf_ctx_unlock(cpuctx, task_ctx);
 
@@ -2398,7 +2411,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 		return;
 
 	update_context_time(ctx);
+	rcu_read_lock();
 	update_cgrp_time_from_cpuctx(cpuctx);
+	rcu_read_unlock();
 	if (!ctx->nr_active)
 		return;
 
@@ -9442,7 +9457,9 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
+	rcu_read_lock();
 	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	rcu_read_unlock();
 	return 0;
 }
 
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-17  1:28 [PATCH] perf/core: fix RCU issues with cgroup monitoring mode Stephane Eranian
@ 2015-10-17  9:56 ` Peter Zijlstra
  2015-10-19  7:58   ` Stephane Eranian
  2015-10-17 10:00 ` Peter Zijlstra
  1 sibling, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-17  9:56 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, mingo, ak, edumazet, acme

On Sat, Oct 17, 2015 at 03:28:11AM +0200, Stephane Eranian wrote:
> 
> This patch eliminates all known RCU violations detected
> by the RCU checker (PROVE_RCU). The impacted code paths
> were all related to cgroup mode monitoring and involved
> accessing a task's cgrp.

But were they right? This patch provides no clues.

> Signed-off-by: Stephane Eranian <eranian@google.com>
> ---

> @@ -2094,6 +2104,7 @@ static int  __perf_install_in_context(void *info)
>  		cpuctx->task_ctx = task_ctx;
>  		task = task_ctx->task;
>  	}
> +	rcu_read_lock();
>  
>  	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
>  
> @@ -2112,6 +2123,8 @@ static int  __perf_install_in_context(void *info)
>  	 */
>  	perf_event_sched_in(cpuctx, task_ctx, task);
>  
> +	rcu_read_unlock();
> +
>  	perf_pmu_enable(cpuctx->ctx.pmu);
>  	perf_ctx_unlock(cpuctx, task_ctx);
>  
> @@ -2398,7 +2411,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
>  		return;
>  
>  	update_context_time(ctx);
> +	rcu_read_lock();
>  	update_cgrp_time_from_cpuctx(cpuctx);
> +	rcu_read_unlock();
>  	if (!ctx->nr_active)
>  		return;
>  

And these impact !cgroup code, and the last looks like it should be
inside update_cgrp_time_from_cpuctx() if at all.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-17  1:28 [PATCH] perf/core: fix RCU issues with cgroup monitoring mode Stephane Eranian
  2015-10-17  9:56 ` Peter Zijlstra
@ 2015-10-17 10:00 ` Peter Zijlstra
  1 sibling, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-17 10:00 UTC (permalink / raw)
  To: Stephane Eranian; +Cc: linux-kernel, mingo, ak, edumazet, acme

On Sat, Oct 17, 2015 at 03:28:11AM +0200, Stephane Eranian wrote:
>  static int __perf_cgroup_move(void *info)
>  {
>  	struct task_struct *task = info;
> +	rcu_read_lock();
>  	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
> +	rcu_read_unlock();
>  	return 0;
>  }

On second thought, how can this be right? Surely the cgroup system has
cgroups stabilized over calling ->attach() ?



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-17  9:56 ` Peter Zijlstra
@ 2015-10-19  7:58   ` Stephane Eranian
  2015-10-19  9:12     ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Stephane Eranian @ 2015-10-19  7:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, mingo@elte.hu, ak@linux.intel.com, Eric Dumazet,
	Arnaldo Carvalho de Melo

Peter,

On Sat, Oct 17, 2015 at 2:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Sat, Oct 17, 2015 at 03:28:11AM +0200, Stephane Eranian wrote:
> >
> > This patch eliminates all known RCU violations detected
> > by the RCU checker (PROVE_RCU). The impacted code paths
> > were all related to cgroup mode monitoring and involved
> > accessing a task's cgrp.
>
> But were they right? This patch provides no clues.
>
I am assuming that if the checker detects something suspicious there is likely
a problem.

Take for instance:
 perf_cgroup_sched_out()->perf_cgroup_from_task() ->task_subsys_state()

That one fires the checker. I think because we are accessing the css
state without
protection.

The other places are similar.

I am also trying to avoid double taking the rcu-read-lock(), so I have moved
the rcu_read_lock() out of some functions to place them in the caller. The
perf_cgroup_switch() is a performance sensitive path.

If this is not the way to fix this, let me know and I'll rewrite the patch.
Thanks.




> > Signed-off-by: Stephane Eranian <eranian@google.com>
> > ---
>
> > @@ -2094,6 +2104,7 @@ static int  __perf_install_in_context(void *info)
> >               cpuctx->task_ctx = task_ctx;
> >               task = task_ctx->task;
> >       }
> > +     rcu_read_lock();
> >
> >       cpu_ctx_sched_out(cpuctx, EVENT_ALL);
> >
> > @@ -2112,6 +2123,8 @@ static int  __perf_install_in_context(void *info)
> >        */
> >       perf_event_sched_in(cpuctx, task_ctx, task);
> >
> > +     rcu_read_unlock();
> > +
> >       perf_pmu_enable(cpuctx->ctx.pmu);
> >       perf_ctx_unlock(cpuctx, task_ctx);
> >
> > @@ -2398,7 +2411,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
> >               return;
> >
> >       update_context_time(ctx);
> > +     rcu_read_lock();
> >       update_cgrp_time_from_cpuctx(cpuctx);
> > +     rcu_read_unlock();
> >       if (!ctx->nr_active)
> >               return;
> >
>
> And these impact !cgroup code, and the last looks like it should be
> inside update_cgrp_time_from_cpuctx() if at all.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-19  7:58   ` Stephane Eranian
@ 2015-10-19  9:12     ` Peter Zijlstra
  2015-10-26 19:27       ` Stephane Eranian
  0 siblings, 1 reply; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-19  9:12 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: LKML, mingo@elte.hu, ak@linux.intel.com, Eric Dumazet,
	Arnaldo Carvalho de Melo

On Mon, Oct 19, 2015 at 12:58:47AM -0700, Stephane Eranian wrote:
> Peter,
> 
> On Sat, Oct 17, 2015 at 2:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Sat, Oct 17, 2015 at 03:28:11AM +0200, Stephane Eranian wrote:
> > >
> > > This patch eliminates all known RCU violations detected
> > > by the RCU checker (PROVE_RCU). The impacted code paths
> > > were all related to cgroup mode monitoring and involved
> > > accessing a task's cgrp.
> >
> > But were they right? This patch provides no clues.
> >
> I am assuming that if the checker detects something suspicious there is likely
> a problem.
> 
> Take for instance:
>  perf_cgroup_sched_out()->perf_cgroup_from_task() ->task_subsys_state()
> 
> That one fires the checker. I think because we are accessing the css
> state without
> protection.
> 
> The other places are similar.

But perf_cgroup_attach()->perf_cgroup_switch() takes ctx->lock().

Therefore; if you hold ctx->lock, the cgroup is pinned.

And the above sequence very much holds ctx->lock.

Right?

So it looks to me that we should teach perf_cgroup_from_task() about
ctx->lock or something.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-19  9:12     ` Peter Zijlstra
@ 2015-10-26 19:27       ` Stephane Eranian
  2015-10-26 20:20         ` Peter Zijlstra
  0 siblings, 1 reply; 7+ messages in thread
From: Stephane Eranian @ 2015-10-26 19:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, mingo@elte.hu, ak@linux.intel.com, Eric Dumazet,
	Arnaldo Carvalho de Melo

On Mon, Oct 19, 2015 at 2:12 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, Oct 19, 2015 at 12:58:47AM -0700, Stephane Eranian wrote:
> > Peter,
> >
> > On Sat, Oct 17, 2015 at 2:56 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Sat, Oct 17, 2015 at 03:28:11AM +0200, Stephane Eranian wrote:
> > > >
> > > > This patch eliminates all known RCU violations detected
> > > > by the RCU checker (PROVE_RCU). The impacted code paths
> > > > were all related to cgroup mode monitoring and involved
> > > > accessing a task's cgrp.
> > >
> > > But were they right? This patch provides no clues.
> > >
> > I am assuming that if the checker detects something suspicious there is likely
> > a problem.
> >
> > Take for instance:
> >  perf_cgroup_sched_out()->perf_cgroup_from_task() ->task_subsys_state()
> >
> > That one fires the checker. I think because we are accessing the css
> > state without
> > protection.
> >
> > The other places are similar.
>
> But perf_cgroup_attach()->perf_cgroup_switch() takes ctx->lock().
>
> Therefore; if you hold ctx->lock, the cgroup is pinned.
>
Ok, that one was a bad example because yes, it grabs the ctx lock and the
rcu_lock() already.

But the other path:

  __perf_event_task_sched_out() -> perf_cgroup_sched_out() ->
perf_cgroup_switch()
is accessing task_subsys_state() in perf_cgroup_sched_out() without
ctx->lock() or
rcu_read_lock(). Same thing on the sched_in path.

The one place where we already hold the ctx->lock is
__perf_install_in_context().
Same thing for __perf_event_read() -> update_cgrp_time_from_event().
So yes, we'd need a way to tell this is okay, the cgroup cannot disappear.




>
> And the above sequence very much holds ctx->lock.
>
> Right?
>
> So it looks to me that we should teach perf_cgroup_from_task() about
> ctx->lock or something.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] perf/core: fix RCU issues with cgroup monitoring mode
  2015-10-26 19:27       ` Stephane Eranian
@ 2015-10-26 20:20         ` Peter Zijlstra
  0 siblings, 0 replies; 7+ messages in thread
From: Peter Zijlstra @ 2015-10-26 20:20 UTC (permalink / raw)
  To: Stephane Eranian
  Cc: LKML, mingo@elte.hu, ak@linux.intel.com, Eric Dumazet,
	Arnaldo Carvalho de Melo

On Mon, Oct 26, 2015 at 12:27:05PM -0700, Stephane Eranian wrote:
> Ok, that one was a bad example because yes, it grabs the ctx lock and the
> rcu_lock() already.
> 
> But the other path:
> 
>   __perf_event_task_sched_out() -> perf_cgroup_sched_out() ->
> perf_cgroup_switch()
> is accessing in perf_cgroup_sched_out() task_subsys_state() without
> ctx->lock() or
> rcu_read lock. Same thing on the sched_in path.

Indeed, that part looks good.

Now if you'd done a patch per warning (roughly) with the splat included,
I could have applied some of it ;-)

> The one place where we already hold the ctx->lock is
> __perf_install_in_context().
> Same thing for __perf_event_read() -> update_cgrp_time_from_event().
> So yes, we'd need a way to tell this is okay, the cgroup cannot disappear.

Right, but this is 'hard' :/ Or at least, my jet-lagged brain isn't
currently seeing a sane way to do that.

perf_cgroup_from_task() only takes the task pointer, and we'd somehow
need to find a 'random' pmu cpuctx pointer for that.

Passing in a ctx pointer would be possible, but would not be too much
different from just telling that check to shut up.

Hmm, note that you even have comments in saying these are good because you've
done css_get(). So maybe just shut it up?

Something like the below; combine that with the bits of your patch that
move rcu_lock our from perf_cgroup_switch() and into
perf_cgroup_switch_{in,out}() and we should be good, right?

---
 include/linux/perf_event.h |  4 ++--
 kernel/events/core.c       | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index d841d33bcdc9..24f3539a85ef 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -697,9 +697,9 @@ struct perf_cgroup {
  * if there is no cgroup event for the current CPU context.
  */
 static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
+perf_cgroup_from_task(struct task_struct *task, bool safe)
 {
-	return container_of(task_css(task, perf_event_cgrp_id),
+	return container_of(task_css_check(task, perf_event_cgrp_id, safe),
 			    struct perf_cgroup, css);
 }
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109aee77..3633630a20df 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
 	if (!is_cgroup_event(event))
 		return;
 
-	cgrp = perf_cgroup_from_task(current);
+	cgrp = perf_cgroup_from_task(current, true);
 	/*
 	 * Do not update time when cgroup is not active
 	 */
@@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 	if (!task || !ctx->nr_cgroups)
 		return;
 
-	cgrp = perf_cgroup_from_task(task);
+	cgrp = perf_cgroup_from_task(task, true);
 	info = this_cpu_ptr(cgrp->info);
 	info->timestamp = ctx->timestamp;
 }
@@ -523,7 +523,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 				 * event_filter_match() to not have to pass
 				 * task around
 				 */
-				cpuctx->cgrp = perf_cgroup_from_task(task);
+				cpuctx->cgrp = perf_cgroup_from_task(task, false);
 				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 			}
 			perf_pmu_enable(cpuctx->ctx.pmu);
@@ -545,14 +545,14 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
-	cgrp1 = perf_cgroup_from_task(task);
+	cgrp1 = perf_cgroup_from_task(task, false);
 
 	/*
 	 * next is NULL when called from perf_event_enable_on_exec()
 	 * that will systematically cause a cgroup_switch()
 	 */
 	if (next)
-		cgrp2 = perf_cgroup_from_task(next);
+		cgrp2 = perf_cgroup_from_task(next, false);
 
 	/*
 	 * only schedule out current cgroup events if we know
@@ -572,10 +572,10 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
 	/*
 	 * we come here when we know perf_cgroup_events > 0
 	 */
-	cgrp1 = perf_cgroup_from_task(task);
+	cgrp1 = perf_cgroup_from_task(task, false);
 
 	/* prev can never be NULL */
-	cgrp2 = perf_cgroup_from_task(prev);
+	cgrp2 = perf_cgroup_from_task(prev, false);
 
 	/*
 	 * only need to schedule in cgroup events if we are changing


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2015-10-26 20:20 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-10-17  1:28 [PATCH] perf/core: fix RCU issues with cgroup monitoring mode Stephane Eranian
2015-10-17  9:56 ` Peter Zijlstra
2015-10-19  7:58   ` Stephane Eranian
2015-10-19  9:12     ` Peter Zijlstra
2015-10-26 19:27       ` Stephane Eranian
2015-10-26 20:20         ` Peter Zijlstra
2015-10-17 10:00 ` Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).