* [RFC PATCH] perf_counter: Fix race in attaching counters to tasks
@ 2009-05-26 12:29 Paul Mackerras
From: Paul Mackerras @ 2009-05-26 12:29 UTC
To: Ingo Molnar; +Cc: Peter Zijlstra, linux-kernel, Corey Ashford, Thomas Gleixner
Commit 564c2b21 ("perf_counter: Optimize context switch between
identical inherited contexts") introduced a race where it is possible
that a counter being attached to a task could get attached to the
wrong task, if the task is one that has inherited its context from
another task via fork. This happens because the optimized context
switch could switch the context to another task after find_get_context
has read task->perf_counter_ctxp. In fact, it's possible that the
context could then get freed, if the other task then exits.
This fixes the problem by protecting both the context switch and the
critical code in find_get_context with a spinlock. We use the
ctx->lock of the parent context for this because it will be common
between any pair of contexts that might get swapped. Thus
perf_counter_task_sched_out only needs to take one lock to exclude
find_get_context from getting the wrong context for either the old
task or the new task.
To make sure that none of the contexts being looked at in
find_get_context can get freed, this changes the context freeing code
to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no
contexts can get freed. This part of the patch is lifted from a patch
posted by Peter Zijlstra.
This also adds a check to make sure that we can't add a counter to a
task that is exiting. This solves a race between
perf_counter_exit_task and find_get_context; it ensures that
find_get_context can't attach a new context to a task after
perf_counter_exit_task has disposed of the task's context.
With this, we are now doing the unclone in find_get_context rather
than when a counter was added to or removed from a context (actually,
we were missing the unclone_ctx() call when adding a counter to a
context). We don't need to unclone when removing a counter from a
context because we have no way to remove a counter from a cloned
context.
This also takes out the smp_wmb() in find_get_context, which Peter
Zijlstra pointed out was unnecessary because the cmpxchg implies a
full barrier anyway.
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
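For reference, the problematic interleaving with the pre-patch code looks roughly like this (an illustrative sketch only, simplified from the functions changed below; it is not part of the patch):

	/* CPU 0: sys_perf_counter_open() -> find_get_context() */
	ctx = task->perf_counter_ctxp;		/* read the task's context pointer */

	/* CPU 1: scheduler -> perf_counter_task_sched_out(task, next) */
	task->perf_counter_ctxp = next_ctx;	/* optimized switch swaps the contexts */
	next->perf_counter_ctxp = ctx;
	ctx->task = next;
	next_ctx->task = task;

	/* CPU 0, continued: ctx now belongs to 'next', so the new counter
	 * ends up attached to the wrong task; worse, if 'next' exits, ctx
	 * can be freed while CPU 0 is still using it. */
	counter->ctx = ctx;
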
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b16ed3..35dc996 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -541,8 +541,9 @@ struct perf_counter_context {
* been cloned (inherited) from a common ancestor.
*/
struct perf_counter_context *parent_ctx;
- u32 parent_gen;
- u32 generation;
+ u64 parent_gen;
+ u64 generation;
+ struct rcu_head rcu_head;
};
/**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f..469ffe2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,20 @@ static void get_ctx(struct perf_counter_context *ctx)
atomic_inc(&ctx->refcount);
}
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_counter_context *ctx;
+
+ ctx = container_of(head, struct perf_counter_context, rcu_head);
+ kfree(ctx);
+}
+
static void put_ctx(struct perf_counter_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- kfree(ctx);
+ call_rcu(&ctx->rcu_head, free_ctx);
}
}
@@ -212,22 +220,6 @@ group_sched_out(struct perf_counter *group_counter,
}
/*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
- ++ctx->generation;
- if (!ctx->parent_ctx)
- return;
- put_ctx(ctx->parent_ctx);
- ctx->parent_ctx = NULL;
-}
-
-/*
* Cross CPU call to remove a performance counter
*
* We disable the counter on the hardware level first. After that we
@@ -281,13 +273,16 @@ static void __perf_counter_remove_from_context(void *info)
*
* CPU counters are removed with a smp call. For task counters we only
* call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.
*/
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
struct perf_counter_context *ctx = counter->ctx;
struct task_struct *task = ctx->task;
- unclone_ctx(ctx);
if (!task) {
/*
* Per cpu counters are removed via an smp call and
@@ -410,6 +405,10 @@ static void __perf_counter_disable(void *info)
/*
* Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.
*/
static void perf_counter_disable(struct perf_counter *counter)
{
@@ -794,6 +793,10 @@ static void __perf_counter_enable(void *info)
/*
* Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.
*/
static void perf_counter_enable(struct perf_counter *counter)
{
@@ -923,7 +926,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_counter_context *ctx = task->perf_counter_ctxp;
struct perf_counter_context *next_ctx;
+ struct perf_counter_context *parent;
struct pt_regs *regs;
+ int do_switch = 1;
regs = task_pt_regs(task);
perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +937,36 @@ void perf_counter_task_sched_out(struct task_struct *task,
return;
update_context_time(ctx);
- next_ctx = next->perf_counter_ctxp;
- if (next_ctx && context_equiv(ctx, next_ctx)) {
- task->perf_counter_ctxp = next_ctx;
- next->perf_counter_ctxp = ctx;
- ctx->task = next;
- next_ctx->task = task;
- return;
- }
- __perf_counter_sched_out(ctx, cpuctx);
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
+ next_ctx = rcu_dereference(next->perf_counter_ctxp);
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock the
+ * parent context because it represents a common point
+ * connected to both the incoming and outgoing contexts
+ * in the case where we can optimize. Then we check
+ * under the lock whether they really are clones.
+ */
+ spin_lock(&parent->lock);
+ if (context_equiv(ctx, next_ctx)) {
+ task->perf_counter_ctxp = next_ctx;
+ next->perf_counter_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+ }
+ spin_unlock(&parent->lock);
+ }
+ rcu_read_unlock();
- cpuctx->task_ctx = NULL;
+ if (do_switch) {
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+ }
}
static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1226,6 +1249,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
struct perf_cpu_context *cpuctx;
struct perf_counter_context *ctx;
struct perf_counter_context *tctx;
+ struct perf_counter_context *parent_ctx;
struct task_struct *task;
/*
@@ -1271,7 +1295,41 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
return ERR_PTR(-EACCES);
}
- ctx = task->perf_counter_ctxp;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ parent_ctx = rcu_dereference(ctx->parent_ctx);
+ if (parent_ctx) {
+ /*
+ * This context appears to be a clone of another,
+ * so it might get swapped for another underneath
+ * us by perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the parent context, which
+ * will prevent our context from getting swapped,
+ * then reload the pointers and unclone the context.
+ * Once it's not a clone things will be stable.
+ */
+ spin_lock(&parent_ctx->lock);
+ ctx = task->perf_counter_ctxp;
+ if (ctx && ctx->parent_ctx) {
+ put_ctx(ctx->parent_ctx);
+ ctx->parent_ctx = NULL;
+ }
+ spin_unlock(&parent_ctx->lock);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Can't attach counters to a dying task.
+ * Have to check this after loading ctx.
+ */
+ if (task->flags & PF_EXITING) {
+ put_task_struct(task);
+ return ERR_PTR(-ESRCH);
+ }
+
if (!ctx) {
ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
if (!ctx) {
@@ -1279,11 +1337,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
return ERR_PTR(-ENOMEM);
}
__perf_counter_init_context(ctx, task);
- /*
- * Make sure other cpus see correct values for *ctx
- * once task->perf_counter_ctxp is visible to them.
- */
- smp_wmb();
tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
if (tctx) {
/*
@@ -1295,6 +1348,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
}
}
+ ++ctx->generation;
return ctx;
}
@@ -1303,7 +1357,6 @@ static void free_counter_rcu(struct rcu_head *head)
struct perf_counter *counter;
counter = container_of(head, struct perf_counter, rcu_head);
- put_ctx(counter->ctx);
kfree(counter);
}
@@ -1324,6 +1377,7 @@ static void free_counter(struct perf_counter *counter)
if (counter->destroy)
counter->destroy(counter);
+ put_ctx(counter->ctx);
call_rcu(&counter->rcu_head, free_counter_rcu);
}
@@ -1437,6 +1491,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
mutex_unlock(&ctx->mutex);
}
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
static void perf_counter_for_each_child(struct perf_counter *counter,
void (*func)(struct perf_counter *))
{
@@ -3449,17 +3509,13 @@ void perf_counter_exit_task(struct task_struct *child)
{
struct perf_counter *child_counter, *tmp;
struct perf_counter_context *child_ctx;
- unsigned long flags;
child_ctx = child->perf_counter_ctxp;
-
if (likely(!child_ctx))
return;
- local_irq_save(flags);
__perf_counter_task_sched_out(child_ctx);
child->perf_counter_ctxp = NULL;
- local_irq_restore(flags);
mutex_lock(&child_ctx->mutex);
* Re: [RFC PATCH] perf_counter: Fix race in attaching counters to tasks
From: Peter Zijlstra @ 2009-05-26 21:20 UTC
To: Paul Mackerras; +Cc: Ingo Molnar, linux-kernel, Corey Ashford, Thomas Gleixner
On Tue, 2009-05-26 at 22:29 +1000, Paul Mackerras wrote:
> Commit 564c2b21 ("perf_counter: Optimize context switch between
> identical inherited contexts") introduced a race where it is possible
> that a counter being attached to a task could get attached to the
> wrong task, if the task is one that has inherited its context from
> another task via fork. This happens because the optimized context
> switch could switch the context to another task after find_get_context
> has read task->perf_counter_ctxp. In fact, it's possible that the
> context could then get freed, if the other task then exits.
>
> This fixes the problem by protecting both the context switch and the
> critical code in find_get_context with a spinlock. We use the
> ctx->lock of the parent context for this because it will be common
> between any pair of contexts that might get swapped. Thus
> perf_counter_task_sched_out only needs to take one lock to exclude
> find_get_context from getting the wrong context for either the old
> task or the new task.
I'm not sure this is the better approach (as opposed to your previous
patch that used a per counter/task lock), since the parent context lock
will serialize the full process tree over all cpus, whereas the per
counter/task lock will be local to the current cpu.
> To make sure that none of the contexts being looked at in
> find_get_context can get freed, this changes the context freeing code
> to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no
> contexts can get freed. This part of the patch is lifted from a patch
> posted by Peter Zijlstra.
>
> This also adds a check to make sure that we can't add a counter to a
> task that is exiting. This solves a race between
> perf_counter_exit_task and find_get_context; it ensures that
> find_get_context can't attach a new context to a task after
> perf_counter_exit_task has disposed of the task's context.
Right, this leaves us with the flush_old_exec() race. I tried solving
that by dropping the ctx reference count to 0 before iterating the ctx
counters and destroying them vs atomic_inc_not_zero() reference acquire
in find_get_context().
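The acquire side of that scheme would look something like this (a rough sketch with a hypothetical try_get_ctx() helper, not the code that was merged):

	static struct perf_counter_context *
	try_get_ctx(struct perf_counter_context *ctx)
	{
		/*
		 * Only take a reference if the context is still live; if the
		 * exit/exec path has already dropped the refcount to zero,
		 * refuse so the caller knows the context is being torn down.
		 */
		if (!atomic_inc_not_zero(&ctx->refcount))
			return NULL;
		return ctx;
	}
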
> With this, we are now doing the unclone in find_get_context rather
> than when a counter was added to or removed from a context (actually,
> we were missing the unclone_ctx() call when adding a counter to a
> context).
We destroy the parent_ctx pointer under the lock, but suppose we're
trying to attach to the parent context itself; doesn't that mean we need
to increment the generation count under the lock as well?
> We don't need to unclone when removing a counter from a
> context because we have no way to remove a counter from a cloned
> context.
Not without removing it from the whole hierarchy indeed.
> This also takes out the smp_wmb() in find_get_context, which Peter
> Zijlstra pointed out was unnecessary because the cmpxchg implies a
> full barrier anyway.
> @@ -281,13 +273,16 @@ static void __perf_counter_remove_from_context(void *info)
> *
> * CPU counters are removed with a smp call. For task counters we only
> * call when the task is on a CPU.
> + *
> + * If counter->ctx is a cloned context, callers must make sure that
> + * every task struct that counter->ctx->task could possibly point to
> + * remains valid.
> */
This comment (and its replicas) confuse me in that they only require
something but then don't expand on how this is accomplished.
> @@ -1437,6 +1491,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
> mutex_unlock(&ctx->mutex);
> }
>
> +/*
> + * Holding the top-level counter's child_mutex means that any
> + * descendant process that has inherited this counter will block
> + * in sync_child_counter if it goes to exit, thus satisfying the
> + * task existence requirements of perf_counter_enable/disable.
> + */
> static void perf_counter_for_each_child(struct perf_counter *counter,
> void (*func)(struct perf_counter *))
> {
> @@ -3449,17 +3509,13 @@ void perf_counter_exit_task(struct task_struct *child)
> {
> struct perf_counter *child_counter, *tmp;
> struct perf_counter_context *child_ctx;
> - unsigned long flags;
>
> child_ctx = child->perf_counter_ctxp;
> -
> if (likely(!child_ctx))
> return;
>
> - local_irq_save(flags);
> __perf_counter_task_sched_out(child_ctx);
> child->perf_counter_ctxp = NULL;
> - local_irq_restore(flags);
>
> mutex_lock(&child_ctx->mutex);
This change looks wrong: __perf_counter_task_sched_out() ->
__perf_counter_sched_out() -> spin_lock(&ctx->lock) wants IRQs
disabled.
* Re: [RFC PATCH] perf_counter: Fix race in attaching counters to tasks
From: Paul Mackerras @ 2009-05-27 8:27 UTC
To: Peter Zijlstra; +Cc: Ingo Molnar, linux-kernel, Corey Ashford, Thomas Gleixner
Peter Zijlstra writes:
> I'm not sure this is the better approach (as opposed to your previous
> patch that used a per counter/task lock), since the parent context lock
> will serialize the full process tree over all cpus, whereas the per
> counter/task lock will be local to the current cpu.
OK, see the patch below, which goes back to the 2-locks approach,
except now they are just the ctx->lock of each context rather than
being an extra lock in the task_struct. We can do that because we can
use RCU to guarantee the existence of the context in find_get_context.
> Right, this leaves us with the flush_old_exec() race. I tried solving
> that by dropping the ctx reference count to 0 before iterating the ctx
> counters and destroying them vs atomic_inc_not_zero() reference acquire
> in find_get_context().
I think I have solved that race now; the key observation is that it
doesn't matter if a context has some top-level (non-inherited)
counters in it after perf_counter_exit_task has finished, or if
top-level counters are added. They won't do anything and they'll get
removed when their fd is closed.
> We destroy the parent_ctx pointer under the lock, but suppose we're
> trying to attach to the parent context itself; doesn't that mean we need
> to increment the generation count under the lock as well?
OK, we now increment it under the lock.
> > We don't need to unclone when removing a counter from a
> > context because we have no way to remove a counter from a cloned
> > context.
>
> Not without removing it from the whole hierarchy indeed.
Well, I can't even see any way to remove it from the hierarchy. You
can close the fd for the top-level counter but we won't get
perf_release called until all the fput's have been done, which only
happens as descendant processes exit.
> > + *
> > + * If counter->ctx is a cloned context, callers must make sure that
> > + * every task struct that counter->ctx->task could possibly point to
> > + * remains valid.
> > */
>
> This comment (and its replicas) confuse me in that they only require
> something but then don't expand on how this is accomplished.
OK, I added some stuff to explain that.
> > - local_irq_save(flags);
> > __perf_counter_task_sched_out(child_ctx);
> > child->perf_counter_ctxp = NULL;
> > - local_irq_restore(flags);
> >
> > mutex_lock(&child_ctx->mutex);
>
> This change looks wrong: __perf_counter_task_sched_out() ->
> __perf_counter_sched_out() -> spin_lock(&ctx->lock) wants IRQs
> disabled.
Right, I took that change out.
If this all looks OK to you, I'll submit it properly with a patch
description and signoff...
Paul.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b16ed3..35dc996 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -541,8 +541,9 @@ struct perf_counter_context {
* been cloned (inherited) from a common ancestor.
*/
struct perf_counter_context *parent_ctx;
- u32 parent_gen;
- u32 generation;
+ u64 parent_gen;
+ u64 generation;
+ struct rcu_head rcu_head;
};
/**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f..f2d41be 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
atomic_inc(&ctx->refcount);
}
+static void free_ctx(struct rcu_head *head)
+{
+ struct perf_counter_context *ctx;
+
+ ctx = container_of(head, struct perf_counter_context, rcu_head);
+ if (ctx->task)
+ put_task_struct(ctx->task);
+ kfree(ctx);
+}
+
static void put_ctx(struct perf_counter_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- kfree(ctx);
+ call_rcu(&ctx->rcu_head, free_ctx);
}
}
@@ -212,22 +222,6 @@ group_sched_out(struct perf_counter *group_counter,
}
/*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
- ++ctx->generation;
- if (!ctx->parent_ctx)
- return;
- put_ctx(ctx->parent_ctx);
- ctx->parent_ctx = NULL;
-}
-
-/*
* Cross CPU call to remove a performance counter
*
* We disable the counter on the hardware level first. After that we
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
*
* CPU counters are removed with a smp call. For task counters we only
* call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
*/
static void perf_counter_remove_from_context(struct perf_counter *counter)
{
struct perf_counter_context *ctx = counter->ctx;
struct task_struct *task = ctx->task;
- unclone_ctx(ctx);
if (!task) {
/*
* Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
/*
* Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
*/
static void perf_counter_disable(struct perf_counter *counter)
{
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
/*
* Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid. This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
*/
static void perf_counter_enable(struct perf_counter *counter)
{
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_counter_context *ctx = task->perf_counter_ctxp;
struct perf_counter_context *next_ctx;
+ struct perf_counter_context *parent;
struct pt_regs *regs;
+ int do_switch = 1;
regs = task_pt_regs(task);
perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
return;
update_context_time(ctx);
+
+ rcu_read_lock();
+ parent = rcu_dereference(ctx->parent_ctx);
next_ctx = next->perf_counter_ctxp;
- if (next_ctx && context_equiv(ctx, next_ctx)) {
- task->perf_counter_ctxp = next_ctx;
- next->perf_counter_ctxp = ctx;
- ctx->task = next;
- next_ctx->task = task;
- return;
+ if (parent && next_ctx &&
+ rcu_dereference(next_ctx->parent_ctx) == parent) {
+ /*
+ * Looks like the two contexts are clones, so we might be
+ * able to optimize the context switch. We lock both
+ * contexts and check that they are clones under the
+ * lock (including re-checking that neither has been
+ * uncloned in the meantime). It doesn't matter which
+ * order we take the locks because no other cpu could
+ * be trying to lock both of these tasks.
+ */
+ spin_lock(&ctx->lock);
+ spin_lock(&next_ctx->lock);
+ if (context_equiv(ctx, next_ctx)) {
+ task->perf_counter_ctxp = next_ctx;
+ next->perf_counter_ctxp = ctx;
+ ctx->task = next;
+ next_ctx->task = task;
+ do_switch = 0;
+ }
+ spin_unlock(&next_ctx->lock);
+ spin_unlock(&ctx->lock);
}
+ rcu_read_unlock();
- __perf_counter_sched_out(ctx, cpuctx);
-
- cpuctx->task_ctx = NULL;
+ if (do_switch) {
+ __perf_counter_sched_out(ctx, cpuctx);
+ cpuctx->task_ctx = NULL;
+ }
}
static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
ctx->task = task;
}
-static void put_context(struct perf_counter_context *ctx)
-{
- if (ctx->task)
- put_task_struct(ctx->task);
-}
-
static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_counter_context *ctx;
- struct perf_counter_context *tctx;
+ struct perf_counter_context *parent_ctx;
struct task_struct *task;
+ int err;
/*
* If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
cpuctx = &per_cpu(perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
+ get_ctx(ctx);
return ctx;
}
@@ -1265,37 +1300,78 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
if (!task)
return ERR_PTR(-ESRCH);
+ /*
+ * Can't attach counters to a dying task.
+ */
+ err = -ESRCH;
+ if (task->flags & PF_EXITING)
+ goto errout;
+
/* Reuse ptrace permission checks for now. */
- if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
- put_task_struct(task);
- return ERR_PTR(-EACCES);
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto errout;
+
+ rcu_read_lock();
+ retry:
+ ctx = rcu_dereference(task->perf_counter_ctxp);
+ if (ctx) {
+ /*
+ * If this context is a clone of another, it might
+ * get swapped for another underneath us by
+ * perf_counter_task_sched_out, though the
+ * rcu_read_lock() protects us from any context
+ * getting freed. Lock the context and check if it
+ * got swapped before we could get the lock, and retry
+ * if so. If we locked the right context, then it
+ * can't get swapped on us any more and we can
+ * unclone it if necessary.
+ * Once it's not a clone things will be stable.
+ */
+ spin_lock(&ctx->lock);
+ if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+ spin_unlock(&ctx->lock);
+ goto retry;
+ }
+ parent_ctx = ctx->parent_ctx;
+ if (parent_ctx) {
+ put_ctx(parent_ctx);
+ ctx->parent_ctx = NULL; /* no longer a clone */
+ }
+ ++ctx->generation;
+ /*
+ * Get an extra reference before dropping the lock so that
+ * this context won't get freed if the task exits.
+ */
+ get_ctx(ctx);
+ spin_unlock(&ctx->lock);
}
+ rcu_read_unlock();
- ctx = task->perf_counter_ctxp;
if (!ctx) {
ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
- if (!ctx) {
- put_task_struct(task);
- return ERR_PTR(-ENOMEM);
- }
+ err = -ENOMEM;
+ if (!ctx)
+ goto errout;
__perf_counter_init_context(ctx, task);
- /*
- * Make sure other cpus see correct values for *ctx
- * once task->perf_counter_ctxp is visible to them.
- */
- smp_wmb();
- tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
- if (tctx) {
+ get_ctx(ctx);
+ if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
/*
* We raced with some other task; use
* the context they set.
*/
kfree(ctx);
- ctx = tctx;
+ goto retry;
}
+ get_task_struct(task);
}
+ put_task_struct(task);
return ctx;
+
+ errout:
+ put_task_struct(task);
+ return ERR_PTR(err);
}
static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1379,6 @@ static void free_counter_rcu(struct rcu_head *head)
struct perf_counter *counter;
counter = container_of(head, struct perf_counter, rcu_head);
- put_ctx(counter->ctx);
kfree(counter);
}
@@ -1324,6 +1399,7 @@ static void free_counter(struct perf_counter *counter)
if (counter->destroy)
counter->destroy(counter);
+ put_ctx(counter->ctx);
call_rcu(&counter->rcu_head, free_counter_rcu);
}
@@ -1347,7 +1423,6 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(counter->owner);
free_counter(counter);
- put_context(ctx);
return 0;
}
@@ -1437,6 +1512,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
mutex_unlock(&ctx->mutex);
}
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
static void perf_counter_for_each_child(struct perf_counter *counter,
void (*func)(struct perf_counter *))
{
@@ -3124,8 +3205,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
counter->ctx = ctx;
counter->oncpu = -1;
- get_ctx(ctx);
-
counter->state = PERF_COUNTER_STATE_INACTIVE;
if (hw_event->disabled)
counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3369,7 @@ err_free_put_context:
kfree(counter);
err_put_context:
- put_context(ctx);
+ put_ctx(ctx);
goto out_fput;
}
@@ -3322,6 +3401,7 @@ inherit_counter(struct perf_counter *parent_counter,
group_leader, GFP_KERNEL);
if (IS_ERR(child_counter))
return child_counter;
+ get_ctx(child_ctx);
/*
* Make the child state follow the state of the parent counter,
@@ -3439,11 +3519,6 @@ __perf_counter_exit_task(struct task_struct *child,
/*
* When a child task exits, feed back counter values to parent counters.
- *
- * Note: we may be running in child context, but the PID is not hashed
- * anymore so new counters will not be added.
- * (XXX not sure that is true when we get called from flush_old_exec.
- * -- paulus)
*/
void perf_counter_exit_task(struct task_struct *child)
{
@@ -3458,7 +3533,15 @@ void perf_counter_exit_task(struct task_struct *child)
local_irq_save(flags);
__perf_counter_task_sched_out(child_ctx);
+
+ /*
+ * Take the context lock here so that if find_get_context is
+ * reading child->perf_counter_ctxp, we wait until it has
+ * incremented the context's refcount before we do put_ctx below.
+ */
+ spin_lock(&child_ctx->lock);
child->perf_counter_ctxp = NULL;
+ spin_unlock(&child_ctx->lock);
local_irq_restore(flags);
mutex_lock(&child_ctx->mutex);