[PATCH] tracing: add trace_event_read

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] tracing: add trace_event_read_lock()
@ 2009-05-18 11:35 Lai Jiangshan
  2009-05-18 13:59 ` Frederic Weisbecker
  2009-05-27 22:34 ` [tip:tracing/core] " tip-bot for Lai Jiangshan
  0 siblings, 2 replies; 13+ messages in thread
From: Lai Jiangshan @ 2009-05-18 11:35 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Steven Rostedt, Frederic Weisbecker, LKML


I found that there is nothing to protect event_hash in
ftrace_find_event().

RCU can also protect ftrace_find_event(), but it will add
latency to kernel. So this fix converts 'trace_event_mutex'
to a read/write semaphore, and adds trace_event_read_lock()
to protect ftrace_find_event().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 665a915..4ef8052 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 	}
 
+	trace_event_read_lock();
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
@@ -3008,6 +3011,7 @@ waitagain:
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3024,6 +3028,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 	}
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 489c0e8..7136420 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DEFINE_MUTEX(trace_event_mutex);
+static DECLARE_RWSEM(trace_event_mutex);
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
 
 	key = type & (EVENT_HASHSIZE - 1);
 
-	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+	hlist_for_each_entry(event, n, &event_hash[key], node) {
 		if (event->type == type)
 			return event;
 	}
@@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
 	return last + 1;
 }
 
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+	up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	key = event->type & (EVENT_HASHSIZE - 1);
 
-	hlist_add_head_rcu(&event->node, &event_hash[key]);
+	hlist_add_head(&event->node, &event_hash[key]);
 
 	ret = event->type;
  out:
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return ret;
 }
@@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 	hlist_del(&event->node);
 	list_del(&event->list);
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 6e220a8..ac240e7 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,





^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-18 11:35 [PATCH] tracing: add trace_event_read_lock() Lai Jiangshan
@ 2009-05-18 13:59 ` Frederic Weisbecker
  2009-05-19  0:35   ` Paul E. McKenney
  2009-05-19  2:05   ` Lai Jiangshan
  2009-05-27 22:34 ` [tip:tracing/core] " tip-bot for Lai Jiangshan
  1 sibling, 2 replies; 13+ messages in thread
From: Frederic Weisbecker @ 2009-05-18 13:59 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Ingo Molnar, Steven Rostedt, LKML

On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> 
> I found that there is nothing to protect event_hash in
> ftrace_find_event().



Actually, rcu protects it, but not enough. We have neither
synchronize_rcu() nor rcu_read_lock.

So we protect against concurrent hlist accesses.
But the event can be removed when a module is unloaded,
and that can happen between the time we get the event output
callback and the time we actually use it.

 
> RCU can also protect ftrace_find_event(), but it will add
> latency to kernel. So this fix converts 'trace_event_mutex'
> to a read/write semaphore, and adds trace_event_read_lock()
> to protect ftrace_find_event().
> 
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> ---
> diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> index 665a915..4ef8052 100644
> --- a/kernel/trace/trace.c
> +++ b/kernel/trace/trace.c
> @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
>  		p = s_next(m, p, &l);
>  	}
>  
> +	trace_event_read_lock();
>  	return p;
>  }
>  
>  static void s_stop(struct seq_file *m, void *p)
>  {
>  	atomic_dec(&trace_record_cmdline_disabled);
> +	trace_event_read_unlock();
>  }
>  
>  static void print_lat_help_header(struct seq_file *m)
> @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
>  	return 1;
>  }
>  
> +/*  Called with trace_event_read_lock() held. */
>  static enum print_line_t print_trace_line(struct trace_iterator *iter)
>  {
>  	enum print_line_t ret;
> @@ -3008,6 +3011,7 @@ waitagain:
>  	       offsetof(struct trace_iterator, seq));
>  	iter->pos = -1;
>  
> +	trace_event_read_lock();
>  	while (find_next_entry_inc(iter) != NULL) {
>  		enum print_line_t ret;
>  		int len = iter->seq.len;
> @@ -3024,6 +3028,7 @@ waitagain:
>  		if (iter->seq.len >= cnt)
>  			break;
>  	}
> +	trace_event_read_unlock();
>  	/* Now copy what we have to the user */
>  	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
> @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
>  		goto out_err;
>  	}
>  
> +	trace_event_read_lock();
> +
>  	/* Fill as many pages as possible. */
>  	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
>  		pages[i] = alloc_page(GFP_KERNEL);
> @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
>  		trace_seq_init(&iter->seq);
>  	}
>  
> +	trace_event_read_unlock();
>  	mutex_unlock(&iter->mutex);
>  
>  	spd.nr_pages = i;
> diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
> index 489c0e8..7136420 100644
> --- a/kernel/trace/trace_output.c
> +++ b/kernel/trace/trace_output.c
> @@ -14,7 +14,7 @@
>  /* must be a power of 2 */
>  #define EVENT_HASHSIZE	128
>  
> -static DEFINE_MUTEX(trace_event_mutex);
> +static DECLARE_RWSEM(trace_event_mutex);
>  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
>  
>  static int next_event_type = __TRACE_LAST_TYPE + 1;
> @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
>   * @type: the type of event to look for
>   *
>   * Returns an event of type @type otherwise NULL
> + * Called with trace_event_read_lock() held.
>   */
>  struct trace_event *ftrace_find_event(int type)
>  {
> @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
>  
>  	key = type & (EVENT_HASHSIZE - 1);
>  
> -	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
> +	hlist_for_each_entry(event, n, &event_hash[key], node) {
>  		if (event->type == type)
>  			return event;
>  	}
> @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
>  	return last + 1;
>  }
>  
> +void trace_event_read_lock(void)
> +{
> +	down_read(&trace_event_mutex);
> +}
> +
> +void trace_event_read_unlock(void)
> +{
> +	up_read(&trace_event_mutex);
> +}
> +
>  /**
>   * register_ftrace_event - register output for an event type
>   * @event: the event type to register
> @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
>  	unsigned key;
>  	int ret = 0;
>  
> -	mutex_lock(&trace_event_mutex);
> +	down_write(&trace_event_mutex);
>  
>  	if (WARN_ON(!event))
>  		goto out;
> @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
>  
>  	key = event->type & (EVENT_HASHSIZE - 1);
>  
> -	hlist_add_head_rcu(&event->node, &event_hash[key]);
> +	hlist_add_head(&event->node, &event_hash[key]);
>  
>  	ret = event->type;
>   out:
> -	mutex_unlock(&trace_event_mutex);
> +	up_write(&trace_event_mutex);
>  
>  	return ret;
>  }
> @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
>   */
>  int unregister_ftrace_event(struct trace_event *event)
>  {
> -	mutex_lock(&trace_event_mutex);
> +	down_write(&trace_event_mutex);
>  	hlist_del(&event->node);
>  	list_del(&event->list);
> -	mutex_unlock(&trace_event_mutex);
> +	up_write(&trace_event_mutex);
>  
>  	return 0;
>  }
> diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
> index 6e220a8..ac240e7 100644
> --- a/kernel/trace/trace_output.h
> +++ b/kernel/trace/trace_output.h
> @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
>  extern int trace_print_context(struct trace_iterator *iter);
>  extern int trace_print_lat_context(struct trace_iterator *iter);
>  
> +extern void trace_event_read_lock(void);
> +extern void trace_event_read_unlock(void);
>  extern struct trace_event *ftrace_find_event(int type);
>  
>  extern enum print_line_t trace_nop_print(struct trace_iterator *iter,



It could be more fine grained. We could have a per event rwsem, and also place the
protected read section only in trace_print_entry() which is the only racy window.

But I'm not sure it's that worthy since event removal is a rare thing.

So I guess this patch is fine.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-18 13:59 ` Frederic Weisbecker
@ 2009-05-19  0:35   ` Paul E. McKenney
  2009-05-19  5:15     ` Lai Jiangshan
  2009-05-19  2:05   ` Lai Jiangshan
  1 sibling, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2009-05-19  0:35 UTC (permalink / raw)
  To: Frederic Weisbecker; +Cc: Lai Jiangshan, Ingo Molnar, Steven Rostedt, LKML

On Mon, May 18, 2009 at 03:59:31PM +0200, Frederic Weisbecker wrote:
> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> > 
> > I found that there is nothing to protect event_hash in
> > ftrace_find_event().
> 
> Actually, rcu protects it, but not enough. We have neither
> synchronize_rcu() nor rcu_read_lock.
> 
> So we protect against concurrent hlist accesses.
> But the event can be removed when a module is unloaded,
> and that can happen between the time we get the event output
> callback and the time we actually use it.

I will ask the stupid question...  Would invoking rcu_barrier() in the
module-exit function take care of this?  The rcu_barrier() primitive
waits for all in-flight RCU callbacks to complete execution.

							Thanx, Paul

> > RCU can also protect ftrace_find_event(), but it will add
> > latency to kernel. So this fix converts 'trace_event_mutex'
> > to a read/write semaphore, and adds trace_event_read_lock()
> > to protect ftrace_find_event().
> > 
> > Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> > ---
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index 665a915..4ef8052 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
> >  		p = s_next(m, p, &l);
> >  	}
> >  
> > +	trace_event_read_lock();
> >  	return p;
> >  }
> >  
> >  static void s_stop(struct seq_file *m, void *p)
> >  {
> >  	atomic_dec(&trace_record_cmdline_disabled);
> > +	trace_event_read_unlock();
> >  }
> >  
> >  static void print_lat_help_header(struct seq_file *m)
> > @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
> >  	return 1;
> >  }
> >  
> > +/*  Called with trace_event_read_lock() held. */
> >  static enum print_line_t print_trace_line(struct trace_iterator *iter)
> >  {
> >  	enum print_line_t ret;
> > @@ -3008,6 +3011,7 @@ waitagain:
> >  	       offsetof(struct trace_iterator, seq));
> >  	iter->pos = -1;
> >  
> > +	trace_event_read_lock();
> >  	while (find_next_entry_inc(iter) != NULL) {
> >  		enum print_line_t ret;
> >  		int len = iter->seq.len;
> > @@ -3024,6 +3028,7 @@ waitagain:
> >  		if (iter->seq.len >= cnt)
> >  			break;
> >  	}
> > +	trace_event_read_unlock();
> >  	/* Now copy what we have to the user */
> >  	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
> > @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
> >  		goto out_err;
> >  	}
> >  
> > +	trace_event_read_lock();
> > +
> >  	/* Fill as many pages as possible. */
> >  	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
> >  		pages[i] = alloc_page(GFP_KERNEL);
> > @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
> >  		trace_seq_init(&iter->seq);
> >  	}
> >  
> > +	trace_event_read_unlock();
> >  	mutex_unlock(&iter->mutex);
> >  
> >  	spd.nr_pages = i;
> > diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
> > index 489c0e8..7136420 100644
> > --- a/kernel/trace/trace_output.c
> > +++ b/kernel/trace/trace_output.c
> > @@ -14,7 +14,7 @@
> >  /* must be a power of 2 */
> >  #define EVENT_HASHSIZE	128
> >  
> > -static DEFINE_MUTEX(trace_event_mutex);
> > +static DECLARE_RWSEM(trace_event_mutex);
> >  static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
> >  
> >  static int next_event_type = __TRACE_LAST_TYPE + 1;
> > @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
> >   * @type: the type of event to look for
> >   *
> >   * Returns an event of type @type otherwise NULL
> > + * Called with trace_event_read_lock() held.
> >   */
> >  struct trace_event *ftrace_find_event(int type)
> >  {
> > @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
> >  
> >  	key = type & (EVENT_HASHSIZE - 1);
> >  
> > -	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
> > +	hlist_for_each_entry(event, n, &event_hash[key], node) {
> >  		if (event->type == type)
> >  			return event;
> >  	}
> > @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
> >  	return last + 1;
> >  }
> >  
> > +void trace_event_read_lock(void)
> > +{
> > +	down_read(&trace_event_mutex);
> > +}
> > +
> > +void trace_event_read_unlock(void)
> > +{
> > +	up_read(&trace_event_mutex);
> > +}
> > +
> >  /**
> >   * register_ftrace_event - register output for an event type
> >   * @event: the event type to register
> > @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
> >  	unsigned key;
> >  	int ret = 0;
> >  
> > -	mutex_lock(&trace_event_mutex);
> > +	down_write(&trace_event_mutex);
> >  
> >  	if (WARN_ON(!event))
> >  		goto out;
> > @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
> >  
> >  	key = event->type & (EVENT_HASHSIZE - 1);
> >  
> > -	hlist_add_head_rcu(&event->node, &event_hash[key]);
> > +	hlist_add_head(&event->node, &event_hash[key]);
> >  
> >  	ret = event->type;
> >   out:
> > -	mutex_unlock(&trace_event_mutex);
> > +	up_write(&trace_event_mutex);
> >  
> >  	return ret;
> >  }
> > @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
> >   */
> >  int unregister_ftrace_event(struct trace_event *event)
> >  {
> > -	mutex_lock(&trace_event_mutex);
> > +	down_write(&trace_event_mutex);
> >  	hlist_del(&event->node);
> >  	list_del(&event->list);
> > -	mutex_unlock(&trace_event_mutex);
> > +	up_write(&trace_event_mutex);
> >  
> >  	return 0;
> >  }
> > diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
> > index 6e220a8..ac240e7 100644
> > --- a/kernel/trace/trace_output.h
> > +++ b/kernel/trace/trace_output.h
> > @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
> >  extern int trace_print_context(struct trace_iterator *iter);
> >  extern int trace_print_lat_context(struct trace_iterator *iter);
> >  
> > +extern void trace_event_read_lock(void);
> > +extern void trace_event_read_unlock(void);
> >  extern struct trace_event *ftrace_find_event(int type);
> >  
> >  extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
> 
> 
> 
> It could be more fine grained. We could have a per event rwsem, and also place the
> protected read section only in trace_print_entry() which is the only racy window.
> 
> But I'm not sure it's that worthy since event removal is a rare thing.
> 
> So I guess this patch is fine.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-19  0:35   ` Paul E. McKenney
@ 2009-05-19  5:15     ` Lai Jiangshan
  2009-05-19 12:38       ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Lai Jiangshan @ 2009-05-19  5:15 UTC (permalink / raw)
  To: paulmck; +Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML

Paul E. McKenney wrote:
> On Mon, May 18, 2009 at 03:59:31PM +0200, Frederic Weisbecker wrote:
>> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
>>> I found that there is nothing to protect event_hash in
>>> ftrace_find_event().
>> Actually, rcu protects it, but not enough. We have neither
>> synchronize_rcu() nor rcu_read_lock.
>>
>> So we protect against concurrent hlist accesses.
>> But the event can be removed when a module is unloaded,
>> and that can happen between the time we get the event output
>> callback and the time we actually use it.
> 
> I will ask the stupid question...  Would invoking rcu_barrier() in the
> module-exit function take care of this?  The rcu_barrier() primitive
> waits for all in-flight RCU callbacks to complete execution.
> 
> 							Thanx, Paul
> 

We have no call_rcu* in it.

Thanx, Lai


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-19  5:15     ` Lai Jiangshan
@ 2009-05-19 12:38       ` Paul E. McKenney
  2009-05-20  0:59         ` Frederic Weisbecker
  0 siblings, 1 reply; 13+ messages in thread
From: Paul E. McKenney @ 2009-05-19 12:38 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML

On Tue, May 19, 2009 at 01:15:44PM +0800, Lai Jiangshan wrote:
> Paul E. McKenney wrote:
> > On Mon, May 18, 2009 at 03:59:31PM +0200, Frederic Weisbecker wrote:
> >> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> >>> I found that there is nothing to protect event_hash in
> >>> ftrace_find_event().
> >> Actually, rcu protects it, but not enough. We have neither
> >> synchronize_rcu() nor rcu_read_lock.
> >>
> >> So we protect against concurrent hlist accesses.
> >> But the event can be removed when a module is unloaded,
> >> and that can happen between the time we get the event output
> >> callback and the time we actually use it.
> > 
> > I will ask the stupid question...  Would invoking rcu_barrier() in the
> > module-exit function take care of this?  The rcu_barrier() primitive
> > waits for all in-flight RCU callbacks to complete execution.
> > 
> > 							Thanx, Paul
> > 
> 
> We have no call_rcu* in it.

OK, I will ask the next stupid question...  Is it possible to use the
same trick rcu_barrier() uses, but for your events?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-19 12:38       ` Paul E. McKenney
@ 2009-05-20  0:59         ` Frederic Weisbecker
  2009-05-20  4:38           ` Paul E. McKenney
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2009-05-20  0:59 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Lai Jiangshan, Ingo Molnar, Steven Rostedt, LKML

On Tue, May 19, 2009 at 05:38:53AM -0700, Paul E. McKenney wrote:
> On Tue, May 19, 2009 at 01:15:44PM +0800, Lai Jiangshan wrote:
> > Paul E. McKenney wrote:
> > > On Mon, May 18, 2009 at 03:59:31PM +0200, Frederic Weisbecker wrote:
> > >> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> > >>> I found that there is nothing to protect event_hash in
> > >>> ftrace_find_event().
> > >> Actually, rcu protects it, but not enough. We have neither
> > >> synchronize_rcu() nor rcu_read_lock.
> > >>
> > >> So we protect against concurrent hlist accesses.
> > >> But the event can be removed when a module is unloaded,
> > >> and that can happen between the time we get the event output
> > >> callback and the time we actually use it.
> > > 
> > > I will ask the stupid question...  Would invoking rcu_barrier() in the
> > > module-exit function take care of this?  The rcu_barrier() primitive
> > > waits for all in-flight RCU callbacks to complete execution.
> > > 
> > > 							Thanx, Paul
> > > 
> > 
> > We have no call_rcu* in it.
> 
> OK, I will ask the next stupid question...  Is it possible to use the
> same trick rcu_barrier() uses, but for your events?
> 
> 							Thanx, Paul

Hi Paul,

I think you're right, we could use RCU to solve this race.

The current race window to solve is as follows:

--Task A--

print_trace_line(trace) {
	event = find_ftrace_event(trace) {
			hlist_for_each_entry_rcu(....) {
				...
				return found;
		}
	}

--Task B--

/*
 * Can be quite confusing.
 * But we have "trace events" which are global objects
 * handling tracing, output, everything.
 * And we have also "ftrace events" which are unique
 * identifiers on traces than make them able to retrieve  
 * their associated output callbacks.
 * So a "trace event" has an associated "ftrace event" to output
 * itself.
 */

trace_module_remove_events(mod) {
	list_trace_events_module(ev, mod) {
		unregister_ftrace_event(ev->event) {
			// mutex protected
			hlist_del(ev->event->node)
			list_del(....)
			//
		}
	}
}
//module removed

-- Task A--

event->print(trace) 

KABOOM! print callback was on the module. event doesn't even exist anymore.

So Lai's solution is correct to fix it.
But rcu is also valid.

Currently rcu only protects the hashlist but not the callback until
the job is complete.

What we could do is:

(Read side)

rcu_read_lock();

event = find_ftrace_event();
event->print(trace);

rcu_read_unlock()

(Write side)

unregister_ftrace_event(e)
{
	mutex_lock(&writer);
	hlist_del(e);
	list_del(e);
	mutex_unlock(&writer);

	// Wait for every printing grace periods
	synchronize_rcu();
}

Then the only overhead is a preempt_disable()/preempt_enable()
for each trace to be printed. Just a sub/add pair and quite few
preemption disabled latency, just the time for a hashlist search
and some work done to format a trace to be printed.
But if latency does really matters, we have RCU_PREEMPT.

On the other side, the down_read() is an extra call but it is called
only once for a whole loop of traces seeking.

I guess both solutions are valid.
What do you think?

Thanks,

Frederic.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-20  0:59         ` Frederic Weisbecker
@ 2009-05-20  4:38           ` Paul E. McKenney
  0 siblings, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2009-05-20  4:38 UTC (permalink / raw)
  To: Frederic Weisbecker

On Wed, May 20, 2009 at 02:59:50AM +0200, Frederic Weisbecker wrote:
> On Tue, May 19, 2009 at 05:38:53AM -0700, Paul E. McKenney wrote:
> > On Tue, May 19, 2009 at 01:15:44PM +0800, Lai Jiangshan wrote:
> > > Paul E. McKenney wrote:
> > > > On Mon, May 18, 2009 at 03:59:31PM +0200, Frederic Weisbecker wrote:
> > > >> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> > > >>> I found that there is nothing to protect event_hash in
> > > >>> ftrace_find_event().
> > > >> Actually, rcu protects it, but not enough. We have neither
> > > >> synchronize_rcu() nor rcu_read_lock.
> > > >>
> > > >> So we protect against concurrent hlist accesses.
> > > >> But the event can be removed when a module is unloaded,
> > > >> and that can happen between the time we get the event output
> > > >> callback and the time we actually use it.
> > > > 
> > > > I will ask the stupid question...  Would invoking rcu_barrier() in the
> > > > module-exit function take care of this?  The rcu_barrier() primitive
> > > > waits for all in-flight RCU callbacks to complete execution.
> > > > 
> > > > 							Thanx, Paul
> > > > 
> > > 
> > > We have no call_rcu* in it.
> > 
> > OK, I will ask the next stupid question...  Is it possible to use the
> > same trick rcu_barrier() uses, but for your events?
> > 
> > 							Thanx, Paul
> 
> 
> Hi Paul,
> 
> I think you're right, we could use RCU to solve this race.
> 
> The current race window to solve is as follows:
> 
> 
> --Task A--
> 
> print_trace_line(trace) {
> 	event = find_ftrace_event(trace) {
> 			hlist_for_each_entry_rcu(....) {
> 				...
> 				return found;
> 		}
> 	}
> 
> 
> --Task B--
> 
> /*
>  * Can be quite confusing.
>  * But we have "trace events" which are global objects
>  * handling tracing, output, everything.
>  * And we have also "ftrace events" which are unique
>  * identifiers on traces than make them able to retrieve  
>  * their associated output callbacks.
>  * So a "trace event" has an associated "ftrace event" to output
>  * itself.
>  */
> 
> trace_module_remove_events(mod) {
> 	list_trace_events_module(ev, mod) {
> 		unregister_ftrace_event(ev->event) {
> 			// mutex protected
> 			hlist_del(ev->event->node)
> 			list_del(....)
> 			//
> 		}
> 	}
> }
> //module removed

Ouch!!!  ;-)

> -- Task A--
> 
> event->print(trace) 
> 
> KABOOM! print callback was on the module. event doesn't even exist anymore.
> 
> So Lai's solution is correct to fix it.
> But rcu is also valid.
> 
> Currently rcu only protects the hashlist but not the callback until
> the job is complete.
> 
> What we could do is:
> 
> (Read side)
> 
> rcu_read_lock();
> 
> event = find_ftrace_event();
> event->print(trace);

As long as event->print() never blocks...

Though you could use SRCU if it might block:

	rcu_read_lock()		->	i = srcu_read_lock(&my_srcu_struct);
	rcu_read_unlock()	->	srcu_read_unlock(&my_srcu_struct, i);
	synchronize_rcu()	->	synchronize_srcu(&my_srcu_struct);

> rcu_read_unlock()
> 
> (Write side)
> 
> unregister_ftrace_event(e)
> {
> 	mutex_lock(&writer);
> 	hlist_del(e);
> 	list_del(e);
> 	mutex_unlock(&writer);
> 
> 	// Wait for every printing grace periods
> 	synchronize_rcu();
> }
> 
> 
> Then the only overhead is a preempt_disable()/preempt_enable()
> for each trace to be printed. Just a sub/add pair and quite few
> preemption disabled latency, just the time for a hashlist search
> and some work done to format a trace to be printed.
> But if latency does really matters, we have RCU_PREEMPT.
> 
> On the other side, the down_read() is an extra call but it is called
> only once for a whole loop of traces seeking.
> 
> I guess both solutions are valid.
> What do you think?

It could go either way.  As a very rough rule of thumb, if the down_read()
happens seldom, there is little advantage to RCU.  On the other hand,
if the down_read() could happen often enough that a new reader shows up
before the previous down_read() completes, then you really need RCU or
something like it.

On most systems, if down_read() was happening more than a few million
times per second, life might be hard.  On the other hand, if down_read()
never happened more than a few tens of thousand times per second or so,
should be no problem.

Since it sounds like a single down_read() substitutes for a potentially
large number of RCU read-side critical sections, I do not believe that
contention-free read-side overhead would be a problem.

Does that help?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-18 13:59 ` Frederic Weisbecker
  2009-05-19  0:35   ` Paul E. McKenney
@ 2009-05-19  2:05   ` Lai Jiangshan
  2009-05-20  0:24     ` Frederic Weisbecker
  1 sibling, 1 reply; 13+ messages in thread
From: Lai Jiangshan @ 2009-05-19  2:05 UTC (permalink / raw)
  To: Frederic Weisbecker; +Cc: Ingo Molnar, Steven Rostedt, LKML

Frederic Weisbecker wrote:
> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
>> I found that there is nothing to protect event_hash in
>> ftrace_find_event().
> 
> 
> 
> Actually, rcu protects it, but not enough. We have neither
> synchronize_rcu() nor rcu_read_lock.

We have no rcu_read_lock(), RCU can not protects it.

> 
> So we protect against concurrent hlist accesses.
> But the event can be removed when a module is unloaded,
> and that can happen between the time we get the event output
> callback and the time we actually use it.
> 

[...]

> It could be more fine grained.

I think it's fine-grained enough, write-side(modules loading/unloading)
is happened rarely. trace_event_read_lock() will not sleep very likely.

Thoughts?

> We could have a per event rwsem, and also place the
> protected read section only in trace_print_entry() which is the only racy window.
> 

print_trace_line() is the only racy window.
So I just protect print_trace_line()(except __ftrace_dump())

I protect loops which call print_trace_line(), it
reduces invoke-times:

trace_event_read_lock();
while (...) {
	...
	print_trace_line();
	...
}
trace_event_read_unlock();

Thanks!
Lai

> But I'm not sure it's that worthy since event removal is a rare thing.
> 
> So I guess this patch is fine.
> 
> 
> 
> 



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-19  2:05   ` Lai Jiangshan
@ 2009-05-20  0:24     ` Frederic Weisbecker
  2009-05-20  2:25       ` Lai Jiangshan
  0 siblings, 1 reply; 13+ messages in thread
From: Frederic Weisbecker @ 2009-05-20  0:24 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Ingo Molnar, Steven Rostedt, LKML, Paul E. McKenney

On Tue, May 19, 2009 at 10:05:21AM +0800, Lai Jiangshan wrote:
> Frederic Weisbecker wrote:
> > On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> >> I found that there is nothing to protect event_hash in
> >> ftrace_find_event().
> > 
> > 
> > 
> > Actually, rcu protects it, but not enough. We have neither
> > synchronize_rcu() nor rcu_read_lock.
> 
> We have no rcu_read_lock(), RCU can not protects it.
> 
> > 
> > So we protect against concurrent hlist accesses.
> > But the event can be removed when a module is unloaded,
> > and that can happen between the time we get the event output
> > callback and the time we actually use it.
> > 
> 
> [...]
> 
> > It could be more fine grained.
> 
> I think it's fine-grained enough, write-side(modules loading/unloading)
> is happened rarely. trace_event_read_lock() will not sleep very likely.
> 
> Thoughts?


Yeah, the write lock is a rare event, that's why I think
it's enough fine grained.

 
> > We could have a per event rwsem, and also place the
> > protected read section only in trace_print_entry() which is the only racy window.
> > 
> 
> print_trace_line() is the only racy window.
> So I just protect print_trace_line()(except __ftrace_dump())
> 
> I protect loops which call print_trace_line(), it
> reduces invoke-times:
> 
> trace_event_read_lock();
> while (...) {
> 	...
> 	print_trace_line();
> 	...
> }
> trace_event_read_unlock();



Yeah, I meant it could have been:

trace_event_read_lock();
print_trace_line();
trace_event_read_unlock();

It's more fine grained, but:

- the write lock path is rarely taken
- it would add more extra calls then more overhead

IMO this is fine as an rwsem design point of view.

But I have mixed feelings when I consider it could be
done using rcu. I will explain that in my next answer to
Paul and will wait for your comments.

Thanks!

Frederic.


> Thanks!
> Lai
> 
> > But I'm not sure it's that worthy since event removal is a rare thing.
> > 
> > So I guess this patch is fine.
> > 
> > 
> > 
> > 
> 
> 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-20  0:24     ` Frederic Weisbecker
@ 2009-05-20  2:25       ` Lai Jiangshan
  2009-05-20 15:41         ` Paul E. McKenney
  2009-05-20 16:04         ` Steven Rostedt
  0 siblings, 2 replies; 13+ messages in thread
From: Lai Jiangshan @ 2009-05-20  2:25 UTC (permalink / raw)
  To: Frederic Weisbecker; +Cc: Ingo Molnar, Steven Rostedt, LKML, Paul E. McKenney

Frederic Weisbecker wrote:
> On Tue, May 19, 2009 at 10:05:21AM +0800, Lai Jiangshan wrote:
>> Frederic Weisbecker wrote:
>>> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
>>>> I found that there is nothing to protect event_hash in
>>>> ftrace_find_event().
>>>
>>>
>>> Actually, rcu protects it, but not enough. We have neither
>>> synchronize_rcu() nor rcu_read_lock.
>> We have no rcu_read_lock(), RCU can not protects it.
>>
>>> So we protect against concurrent hlist accesses.
>>> But the event can be removed when a module is unloaded,
>>> and that can happen between the time we get the event output
>>> callback and the time we actually use it.
>>>
>> [...]
>>
>>> It could be more fine grained.
>> I think it's fine-grained enough, write-side(modules loading/unloading)
>> is happened rarely. trace_event_read_lock() will not sleep very likely.
>>
>> Thoughts?
> 
> 
> Yeah, the write lock is a rare event, that's why I think
> it's enough fine grained.
> 
>  
>>> We could have a per event rwsem, and also place the
>>> protected read section only in trace_print_entry() which is the only racy window.
>>>
>> print_trace_line() is the only racy window.
>> So I just protect print_trace_line()(except __ftrace_dump())
>>
>> I protect loops which call print_trace_line(), it
>> reduces invoke-times:
>>
>> trace_event_read_lock();
>> while (...) {
>> 	...
>> 	print_trace_line();
>> 	...
>> }
>> trace_event_read_unlock();
> 
> 
> 
> Yeah, I meant it could have been:
> 
> trace_event_read_lock();
> print_trace_line();
> trace_event_read_unlock();
> 
> It's more fine grained, but:
> 
> - the write lock path is rarely taken
> - it would add more extra calls then more overhead
> 
> IMO this is fine as an rwsem design point of view.
> 
> But I have mixed feelings when I consider it could be
> done using rcu. I will explain that in my next answer to
> Paul and will wait for your comments.
> 

rcu_read_lock() will disable preempt for im-preemptable RCU,
it will add latency to kernel, because print_trace_line() is not
a short function.


The smallest window is:
(print_trace_line() calls ftrace_find_event() by several paths)

	XXX_read_lock();
	event = ftrace_find_event(entry->type);
	if (event)
		event->YYYY();
	XXX_read_unlock();

but event->YYYY() is not a short function neither.

Since write-side is rarely taken, sleep-able read-side(rwsem)
will not block each other. So I use trace_event_read_lock()
protects the biggest window(the loops).


In LTTng, the tracing code(trace_NAME()) accesses to
event type list, so RCU is needed in LTTng for event type list.

But Ftrace's tracing code does not accesses to event type list,
I don't know this logic is still true in future. Steven may
give me an answer.

Lai.


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-20  2:25       ` Lai Jiangshan
@ 2009-05-20 15:41         ` Paul E. McKenney
  2009-05-20 16:04         ` Steven Rostedt
  1 sibling, 0 replies; 13+ messages in thread
From: Paul E. McKenney @ 2009-05-20 15:41 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Frederic Weisbecker, Ingo Molnar, Steven Rostedt, LKML

On Wed, May 20, 2009 at 10:25:39AM +0800, Lai Jiangshan wrote:
> Frederic Weisbecker wrote:
> > On Tue, May 19, 2009 at 10:05:21AM +0800, Lai Jiangshan wrote:
> >> Frederic Weisbecker wrote:
> >>> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> >>>> I found that there is nothing to protect event_hash in
> >>>> ftrace_find_event().
> >>>
> >>>
> >>> Actually, rcu protects it, but not enough. We have neither
> >>> synchronize_rcu() nor rcu_read_lock.
> >> We have no rcu_read_lock(), RCU can not protects it.
> >>
> >>> So we protect against concurrent hlist accesses.
> >>> But the event can be removed when a module is unloaded,
> >>> and that can happen between the time we get the event output
> >>> callback and the time we actually use it.
> >>>
> >> [...]
> >>
> >>> It could be more fine grained.
> >> I think it's fine-grained enough, write-side(modules loading/unloading)
> >> is happened rarely. trace_event_read_lock() will not sleep very likely.
> >>
> >> Thoughts?
> > 
> > 
> > Yeah, the write lock is a rare event, that's why I think
> > it's enough fine grained.
> > 
> >  
> >>> We could have a per event rwsem, and also place the
> >>> protected read section only in trace_print_entry() which is the only racy window.
> >>>
> >> print_trace_line() is the only racy window.
> >> So I just protect print_trace_line()(except __ftrace_dump())
> >>
> >> I protect loops which call print_trace_line(), it
> >> reduces invoke-times:
> >>
> >> trace_event_read_lock();
> >> while (...) {
> >> 	...
> >> 	print_trace_line();
> >> 	...
> >> }
> >> trace_event_read_unlock();
> > 
> > 
> > 
> > Yeah, I meant it could have been:
> > 
> > trace_event_read_lock();
> > print_trace_line();
> > trace_event_read_unlock();
> > 
> > It's more fine grained, but:
> > 
> > - the write lock path is rarely taken
> > - it would add more extra calls then more overhead
> > 
> > IMO this is fine as an rwsem design point of view.
> > 
> > But I have mixed feelings when I consider it could be
> > done using rcu. I will explain that in my next answer to
> > Paul and will wait for your comments.
> > 
> 
> rcu_read_lock() will disable preempt for im-preemptable RCU,
> it will add latency to kernel, because print_trace_line() is not
> a short function.
> 
> The smallest window is:
> (print_trace_line() calls ftrace_find_event() by several paths)
> 
> 	XXX_read_lock();
> 	event = ftrace_find_event(entry->type);
> 	if (event)
> 		event->YYYY();
> 	XXX_read_unlock();
> 
> but event->YYYY() is not a short function neither.

SRCU could be used instead, as noted in a separate message.

> Since write-side is rarely taken, sleep-able read-side(rwsem)
> will not block each other. So I use trace_event_read_lock()
> protects the biggest window(the loops).

But if simple locking works, I have no problem with it.  As always,
use the right tool for the job.  And even I will tell you that RCU is
not always the right tool.  ;-)

						Thanx, Paul

> In LTTng, the tracing code(trace_NAME()) accesses to
> event type list, so RCU is needed in LTTng for event type list.
> 
> But Ftrace's tracing code does not accesses to event type list,
> I don't know this logic is still true in future. Steven may
> give me an answer.
> 
> Lai.
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] tracing: add trace_event_read_lock()
  2009-05-20  2:25       ` Lai Jiangshan
  2009-05-20 15:41         ` Paul E. McKenney
@ 2009-05-20 16:04         ` Steven Rostedt
  1 sibling, 0 replies; 13+ messages in thread
From: Steven Rostedt @ 2009-05-20 16:04 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: Frederic Weisbecker, Ingo Molnar, LKML, Paul E. McKenney


Sorry for coming in late.

On Wed, 20 May 2009, Lai Jiangshan wrote:

> Frederic Weisbecker wrote:
> > On Tue, May 19, 2009 at 10:05:21AM +0800, Lai Jiangshan wrote:
> >> Frederic Weisbecker wrote:
> >>> On Mon, May 18, 2009 at 07:35:34PM +0800, Lai Jiangshan wrote:
> >>>> I found that there is nothing to protect event_hash in
> >>>> ftrace_find_event().
> >>>
> >>>
> >>> Actually, rcu protects it, but not enough. We have neither
> >>> synchronize_rcu() nor rcu_read_lock.
> >> We have no rcu_read_lock(), RCU can not protects it.
> >>
> >>> So we protect against concurrent hlist accesses.
> >>> But the event can be removed when a module is unloaded,
> >>> and that can happen between the time we get the event output
> >>> callback and the time we actually use it.
> >>>
> >> [...]
> >>
> >>> It could be more fine grained.
> >> I think it's fine-grained enough, write-side(modules loading/unloading)
> >> is happened rarely. trace_event_read_lock() will not sleep very likely.
> >>
> >> Thoughts?
> > 
> > 
> > Yeah, the write lock is a rare event, that's why I think
> > it's enough fine grained.
> > 
> >  
> >>> We could have a per event rwsem, and also place the
> >>> protected read section only in trace_print_entry() which is the only racy window.
> >>>
> >> print_trace_line() is the only racy window.
> >> So I just protect print_trace_line()(except __ftrace_dump())
> >>
> >> I protect loops which call print_trace_line(), it
> >> reduces invoke-times:
> >>
> >> trace_event_read_lock();
> >> while (...) {
> >> 	...
> >> 	print_trace_line();
> >> 	...
> >> }
> >> trace_event_read_unlock();
> > 
> > 
> > 
> > Yeah, I meant it could have been:
> > 
> > trace_event_read_lock();
> > print_trace_line();
> > trace_event_read_unlock();
> > 
> > It's more fine grained, but:
> > 
> > - the write lock path is rarely taken
> > - it would add more extra calls then more overhead
> > 
> > IMO this is fine as an rwsem design point of view.
> > 
> > But I have mixed feelings when I consider it could be
> > done using rcu. I will explain that in my next answer to
> > Paul and will wait for your comments.
> > 
> 
> rcu_read_lock() will disable preempt for im-preemptable RCU,
> it will add latency to kernel, because print_trace_line() is not
> a short function.
> 
> 
> The smallest window is:
> (print_trace_line() calls ftrace_find_event() by several paths)
> 
> 	XXX_read_lock();
> 	event = ftrace_find_event(entry->type);
> 	if (event)
> 		event->YYYY();
> 	XXX_read_unlock();
> 
> but event->YYYY() is not a short function neither.
> 
> Since write-side is rarely taken, sleep-able read-side(rwsem)
> will not block each other. So I use trace_event_read_lock()
> protects the biggest window(the loops).
> 
> 
> In LTTng, the tracing code(trace_NAME()) accesses to
> event type list, so RCU is needed in LTTng for event type list.
> 
> But Ftrace's tracing code does not accesses to event type list,
> I don't know this logic is still true in future. Steven may
> give me an answer.

My first thought is to agree with Frederic and say that this is fine with 
RCU locking.

	rcu_read_lock();
	event = ftrace_find_event(entry->type);
	if (event)
		ret = event->XXXX(iter, 0);
	rcu_read_unlock();

and then do the rcu_synchronize in the module unload (if it is not done 
after the callback).

But this limits the event->XXXX() function from sleeping, because we can 
not call a sleeping function in a rcu_read_lock() area. This code is not a 
fast path. It is the output in text form to the user, so it will be slow 
to begin with.

Thus, I agree with Lai's rwsem approach, and

Acked-by: Steven Rostedt <rostedt@goodmis.org>

-- Steve


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tip:tracing/core] tracing: add trace_event_read_lock()
  2009-05-18 11:35 [PATCH] tracing: add trace_event_read_lock() Lai Jiangshan
  2009-05-18 13:59 ` Frederic Weisbecker
@ 2009-05-27 22:34 ` tip-bot for Lai Jiangshan
  1 sibling, 0 replies; 13+ messages in thread
From: tip-bot for Lai Jiangshan @ 2009-05-27 22:34 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, fweisbec, rostedt, tglx, laijs

Commit-ID:  4f5359685af6de7dca101393dc606620adbe963f
Gitweb:     http://git.kernel.org/tip/4f5359685af6de7dca101393dc606620adbe963f
Author:     Lai Jiangshan <laijs@cn.fujitsu.com>
AuthorDate: Mon, 18 May 2009 19:35:34 +0800
Committer:  Frederic Weisbecker <fweisbec@gmail.com>
CommitDate: Mon, 25 May 2009 23:53:41 +0200

tracing: add trace_event_read_lock()

I found that there is nothing to protect event_hash in
ftrace_find_event(). Rcu protects the event hashlist
but not the event itself while we use it after its extraction
through ftrace_find_event().

This lack of a proper locking in this spot opens a race
window between any event dereferencing and module removal.

Eg:

--Task A--

print_trace_line(trace) {
  event = find_ftrace_event(trace)

--Task B--

trace_module_remove_events(mod) {
  list_trace_events_module(ev, mod) {
    unregister_ftrace_event(ev->event) {
      hlist_del(ev->event->node)
        list_del(....)
    }
  }
}
|--> module removed, the event has been dropped

--Task A--

  event->print(trace); // Dereferencing freed memory

If the event retrieved belongs to a module and this module
is concurrently removed, we may end up dereferencing a data
from a freed module.

RCU could solve this, but it would add latency to the kernel and
forbid tracers output callbacks to call any sleepable code.
So this fix converts 'trace_event_mutex' to a read/write semaphore,
and adds trace_event_read_lock() to protect ftrace_find_event().

[ Impact: fix possible freed memory dereference in ftrace ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A114806.7090302@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>


---
 kernel/trace/trace.c        |    8 ++++++++
 kernel/trace/trace_output.c |   25 ++++++++++++++++++-------
 kernel/trace/trace_output.h |    2 ++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd40d23..02d32ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 	}
 
+	trace_event_read_lock();
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
@@ -3008,6 +3011,7 @@ waitagain:
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3024,6 +3028,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 	}
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 489c0e8..7136420 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DEFINE_MUTEX(trace_event_mutex);
+static DECLARE_RWSEM(trace_event_mutex);
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
 
 	key = type & (EVENT_HASHSIZE - 1);
 
-	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+	hlist_for_each_entry(event, n, &event_hash[key], node) {
 		if (event->type == type)
 			return event;
 	}
@@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
 	return last + 1;
 }
 
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+	up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	key = event->type & (EVENT_HASHSIZE - 1);
 
-	hlist_add_head_rcu(&event->node, &event_hash[key]);
+	hlist_add_head(&event->node, &event_hash[key]);
 
 	ret = event->type;
  out:
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return ret;
 }
@@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 	hlist_del(&event->node);
 	list_del(&event->list);
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 6e220a8..ac240e7 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-05-27 22:36 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-18 11:35 [PATCH] tracing: add trace_event_read_lock() Lai Jiangshan
2009-05-18 13:59 ` Frederic Weisbecker
2009-05-19  0:35   ` Paul E. McKenney
2009-05-19  5:15     ` Lai Jiangshan
2009-05-19 12:38       ` Paul E. McKenney
2009-05-20  0:59         ` Frederic Weisbecker
2009-05-20  4:38           ` Paul E. McKenney
2009-05-19  2:05   ` Lai Jiangshan
2009-05-20  0:24     ` Frederic Weisbecker
2009-05-20  2:25       ` Lai Jiangshan
2009-05-20 15:41         ` Paul E. McKenney
2009-05-20 16:04         ` Steven Rostedt
2009-05-27 22:34 ` [tip:tracing/core] " tip-bot for Lai Jiangshan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).