From: Frederic Weisbecker <fweisbec@gmail.com>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>, Paul Mackerras <paulus@samba.org>,
Arnaldo Carvalho de Melo <acme@infradead.org>,
Steven Rostedt <rostedt@goodmis.org>,
LKML <linux-kernel@vger.kernel.org>
Subject: Re: [PATCH 02/10] perf, trace: Use per-tracepoint-per-cpu hlist to track events
Date: Fri, 21 May 2010 11:40:16 +0200 [thread overview]
Message-ID: <20100521094014.GA30108@nowhere> (raw)
In-Reply-To: <20100521090710.473188012@chello.nl>
On Fri, May 21, 2010 at 11:02:03AM +0200, Peter Zijlstra wrote:
> Avoid the swevent hash-table by using per-tracepoint hlists.
>
> Also, avoid conditionals on the fast path by ordering with probe unregister
> so that we should never get on the callback path without the data being there.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> include/linux/ftrace_event.h | 16 ++---
> include/linux/perf_event.h | 6 +
> include/trace/ftrace.h | 4 -
> kernel/perf_event.c | 94 +++++++++++++++--------------
> kernel/trace/trace_event_perf.c | 127 +++++++++++++++++++++-------------------
> kernel/trace/trace_kprobe.c | 9 +-
> kernel/trace/trace_syscalls.c | 11 ++-
> 7 files changed, 143 insertions(+), 124 deletions(-)
>
> Index: linux-2.6/include/linux/ftrace_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/ftrace_event.h
> +++ linux-2.6/include/linux/ftrace_event.h
> @@ -133,7 +133,7 @@ struct ftrace_event_call {
> void *data;
>
> int perf_refcount;
> - void *perf_data;
> + struct hlist_head *perf_events;
> int (*perf_event_enable)(struct ftrace_event_call *);
> void (*perf_event_disable)(struct ftrace_event_call *);
> };
> @@ -192,9 +192,11 @@ struct perf_event;
>
> DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
>
> -extern int perf_trace_enable(int event_id, void *data);
> -extern void perf_trace_disable(int event_id);
> -extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> +extern int perf_trace_init(struct perf_event *event);
> +extern void perf_trace_destroy(struct perf_event *event);
> +extern int perf_trace_enable(struct perf_event *event);
> +extern void perf_trace_disable(struct perf_event *event);
> +extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> char *filter_str);
> extern void ftrace_profile_free_filter(struct perf_event *event);
> extern void *perf_trace_buf_prepare(int size, unsigned short type,
> @@ -202,11 +204,9 @@ extern void *perf_trace_buf_prepare(int
>
> static inline void
> perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
> - u64 count, struct pt_regs *regs, void *event)
> + u64 count, struct pt_regs *regs, void *head)
> {
> - struct trace_entry *entry = raw_data;
> -
> - perf_tp_event(entry->type, addr, count, raw_data, size, regs, event);
> + perf_tp_event(addr, count, raw_data, size, regs, head);
> perf_swevent_put_recursion_context(rctx);
> }
> #endif
> Index: linux-2.6/include/trace/ftrace.h
> ===================================================================
> --- linux-2.6.orig/include/trace/ftrace.h
> +++ linux-2.6/include/trace/ftrace.h
> @@ -768,6 +768,7 @@ perf_trace_templ_##call(struct ftrace_ev
> struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
> struct ftrace_raw_##call *entry; \
> u64 __addr = 0, __count = 1; \
> + struct hlist_head *head; \
> int __entry_size; \
> int __data_size; \
> int rctx; \
> @@ -790,8 +791,9 @@ perf_trace_templ_##call(struct ftrace_ev
> \
> { assign; } \
> \
> + head = per_cpu_ptr(event_call->perf_events, smp_processor_id());\
Should be rcu_dereference_sched ?
> perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
> - __count, __regs, event_call->perf_data); \
> + __count, __regs, head); \
> }
>
> #undef DEFINE_EVENT
> Index: linux-2.6/kernel/trace/trace_event_perf.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_event_perf.c
> +++ linux-2.6/kernel/trace/trace_event_perf.c
> @@ -23,14 +23,25 @@ typedef typeof(unsigned long [PERF_MAX_T
> /* Count the events in use (per event id, not per instance) */
> static int total_ref_count;
>
> -static int perf_trace_event_enable(struct ftrace_event_call *event, void *data)
> +static int perf_trace_event_init(struct ftrace_event_call *tp_event,
> + struct perf_event *p_event)
> {
> + struct hlist_head *list;
> int ret = -ENOMEM;
> + int cpu;
>
> - if (event->perf_refcount++ > 0) {
> - event->perf_data = NULL;
> + p_event->tp_event = tp_event;
> + if (tp_event->perf_refcount++ > 0)
> return 0;
> - }
> +
> + list = alloc_percpu(struct hlist_head);
> + if (!list)
> + goto fail;
> +
> + for_each_possible_cpu(cpu)
> + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
> +
> + tp_event->perf_events = list;
I suspect this must be rcu_assign_pointer.
>
> if (!total_ref_count) {
> char *buf;
> @@ -39,20 +50,20 @@ static int perf_trace_event_enable(struc
> for (i = 0; i < 4; i++) {
> buf = (char *)alloc_percpu(perf_trace_t);
> if (!buf)
> - goto fail_buf;
> + goto fail;
>
> - rcu_assign_pointer(perf_trace_buf[i], buf);
> + perf_trace_buf[i] = buf;
> }
> }
>
> - ret = event->perf_event_enable(event);
> - if (!ret) {
> - event->perf_data = data;
> - total_ref_count++;
> - return 0;
> - }
> + ret = tp_event->perf_event_enable(tp_event);
> + if (ret)
> + goto fail;
>
> -fail_buf:
> + total_ref_count++;
> + return 0;
> +
> +fail:
> if (!total_ref_count) {
> int i;
>
> @@ -61,21 +72,26 @@ fail_buf:
> perf_trace_buf[i] = NULL;
> }
> }
> - event->perf_refcount--;
> +
> + if (!--tp_event->perf_refcount) {
> + free_percpu(tp_event->perf_events);
> + tp_event->perf_events = NULL;
> + }
>
> return ret;
> }
>
> -int perf_trace_enable(int event_id, void *data)
> +int perf_trace_init(struct perf_event *p_event)
> {
> - struct ftrace_event_call *event;
> + struct ftrace_event_call *tp_event;
> + int event_id = p_event->attr.config;
> int ret = -EINVAL;
>
> mutex_lock(&event_mutex);
> - list_for_each_entry(event, &ftrace_events, list) {
> - if (event->id == event_id && event->perf_event_enable &&
> - try_module_get(event->mod)) {
> - ret = perf_trace_event_enable(event, data);
> + list_for_each_entry(tp_event, &ftrace_events, list) {
> + if (tp_event->id == event_id && tp_event->perf_event_enable &&
> + try_module_get(tp_event->mod)) {
> + ret = perf_trace_event_init(tp_event, p_event);
> break;
> }
> }
> @@ -84,53 +100,52 @@ int perf_trace_enable(int event_id, void
> return ret;
> }
>
> -static void perf_trace_event_disable(struct ftrace_event_call *event)
> +int perf_trace_enable(struct perf_event *p_event)
> {
> - if (--event->perf_refcount > 0)
> - return;
> + struct ftrace_event_call *tp_event = p_event->tp_event;
> + struct hlist_head *list;
>
> - event->perf_event_disable(event);
> + list = tp_event->perf_events;
> + if (WARN_ON_ONCE(!list))
> + return -EINVAL;
>
> - if (!--total_ref_count) {
> - char *buf[4];
> - int i;
> -
> - for (i = 0; i < 4; i++) {
> - buf[i] = perf_trace_buf[i];
> - rcu_assign_pointer(perf_trace_buf[i], NULL);
> - }
> + list = per_cpu_ptr(list, smp_processor_id());
> + hlist_add_head_rcu(&p_event->hlist_entry, list);
Ah and may be small comment, because using the hlist api here
may puzzle more people than just me ;)
> - /*
> - * Ensure every events in profiling have finished before
> - * releasing the buffers
> - */
> - synchronize_sched();
> + return 0;
> +}
>
> - for (i = 0; i < 4; i++)
> - free_percpu(buf[i]);
> - }
> +void perf_trace_disable(struct perf_event *p_event)
> +{
> + hlist_del_rcu(&p_event->hlist_entry);
> }
>
> -void perf_trace_disable(int event_id)
> +void perf_trace_destroy(struct perf_event *p_event)
> {
> - struct ftrace_event_call *event;
> + struct ftrace_event_call *tp_event = p_event->tp_event;
> + int i;
>
> - mutex_lock(&event_mutex);
> - list_for_each_entry(event, &ftrace_events, list) {
> - if (event->id == event_id) {
> - perf_trace_event_disable(event);
> - module_put(event->mod);
> - break;
> + if (--tp_event->perf_refcount > 0)
> + return;
> +
> + tp_event->perf_event_disable(tp_event);
Don't we need a rcu_synchronize_sched() here?
> + free_percpu(tp_event->perf_events);
> + tp_event->perf_events = NULL;
And rcu_assign?
> +
> + if (!--total_ref_count) {
> + for (i = 0; i < 4; i++) {
> + free_percpu(perf_trace_buf[i]);
> + perf_trace_buf[i] = NULL;
> }
> }
> - mutex_unlock(&event_mutex);
> }
>
> __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
> struct pt_regs *regs, int *rctxp)
> {
> struct trace_entry *entry;
> - char *trace_buf, *raw_data;
> + char *raw_data;
> int pc;
>
> BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
> @@ -139,13 +154,9 @@ __kprobes void *perf_trace_buf_prepare(i
>
> *rctxp = perf_swevent_get_recursion_context();
> if (*rctxp < 0)
> - goto err_recursion;
> -
> - trace_buf = rcu_dereference_sched(perf_trace_buf[*rctxp]);
> - if (!trace_buf)
> - goto err;
> + return NULL;
>
> - raw_data = per_cpu_ptr(trace_buf, smp_processor_id());
> + raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());
Needs rcu_dereference_sched too. And this could be __this_cpu_var()
>
> /* zero the dead bytes from align to not leak stack to user */
> memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
> @@ -155,9 +166,5 @@ __kprobes void *perf_trace_buf_prepare(i
> entry->type = type;
>
> return raw_data;
> -err:
> - perf_swevent_put_recursion_context(*rctxp);
> -err_recursion:
> - return NULL;
> }
> EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
> Index: linux-2.6/kernel/trace/trace_kprobe.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_kprobe.c
> +++ linux-2.6/kernel/trace/trace_kprobe.c
> @@ -1341,6 +1341,7 @@ static __kprobes void kprobe_perf_func(s
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> struct ftrace_event_call *call = &tp->call;
> struct kprobe_trace_entry_head *entry;
> + struct hlist_head *head;
> u8 *data;
> int size, __size, i;
> int rctx;
> @@ -1361,7 +1362,8 @@ static __kprobes void kprobe_perf_func(s
> for (i = 0; i < tp->nr_args; i++)
> call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
>
> - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, call->perf_data);
> + head = per_cpu_ptr(call->perf_events, smp_processor_id());
Same
> + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
> }
>
> /* Kretprobe profile handler */
> @@ -1371,6 +1373,7 @@ static __kprobes void kretprobe_perf_fun
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> struct ftrace_event_call *call = &tp->call;
> struct kretprobe_trace_entry_head *entry;
> + struct hlist_head *head;
> u8 *data;
> int size, __size, i;
> int rctx;
> @@ -1392,8 +1395,8 @@ static __kprobes void kretprobe_perf_fun
> for (i = 0; i < tp->nr_args; i++)
> call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
>
> - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
> - regs, call->perf_data);
> + head = per_cpu_ptr(call->perf_events, smp_processor_id());
ditto
> + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
> }
>
> static int probe_perf_enable(struct ftrace_event_call *call)
> Index: linux-2.6/kernel/trace/trace_syscalls.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_syscalls.c
> +++ linux-2.6/kernel/trace/trace_syscalls.c
> @@ -438,6 +438,7 @@ static void perf_syscall_enter(struct pt
> {
> struct syscall_metadata *sys_data;
> struct syscall_trace_enter *rec;
> + struct hlist_head *head;
> int syscall_nr;
> int rctx;
> int size;
> @@ -467,8 +468,9 @@ static void perf_syscall_enter(struct pt
> rec->nr = syscall_nr;
> syscall_get_arguments(current, regs, 0, sys_data->nb_args,
> (unsigned long *)&rec->args);
> - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs,
> - sys_data->enter_event->perf_data);
> +
> + head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());
ditto
> + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
> }
>
> int perf_sysenter_enable(struct ftrace_event_call *call)
> @@ -510,6 +512,7 @@ static void perf_syscall_exit(struct pt_
> {
> struct syscall_metadata *sys_data;
> struct syscall_trace_exit *rec;
> + struct hlist_head *head;
> int syscall_nr;
> int rctx;
> int size;
> @@ -542,8 +545,8 @@ static void perf_syscall_exit(struct pt_
> rec->nr = syscall_nr;
> rec->ret = syscall_get_return_value(current, regs);
>
> - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs,
> - sys_data->exit_event->perf_data);
> + head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());
and ditto :)
> + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
> }
>
> int perf_sysexit_enable(struct ftrace_event_call *call)
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -4004,9 +4004,6 @@ static void perf_swevent_add(struct perf
> perf_swevent_overflow(event, 0, nmi, data, regs);
> }
>
> -static int perf_tp_event_match(struct perf_event *event,
> - struct perf_sample_data *data);
> -
> static int perf_exclude_event(struct perf_event *event,
> struct pt_regs *regs)
> {
> @@ -4036,10 +4033,6 @@ static int perf_swevent_match(struct per
> if (perf_exclude_event(event, regs))
> return 0;
>
> - if (event->attr.type == PERF_TYPE_TRACEPOINT &&
> - !perf_tp_event_match(event, data))
> - return 0;
> -
> return 1;
> }
>
> @@ -4094,7 +4087,7 @@ end:
>
> int perf_swevent_get_recursion_context(void)
> {
> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> int rctx;
>
> if (in_nmi())
> @@ -4106,10 +4099,8 @@ int perf_swevent_get_recursion_context(v
> else
> rctx = 0;
>
> - if (cpuctx->recursion[rctx]) {
> - put_cpu_var(perf_cpu_context);
> + if (cpuctx->recursion[rctx])
> return -1;
> - }
>
> cpuctx->recursion[rctx]++;
> barrier();
> @@ -4123,7 +4114,6 @@ void perf_swevent_put_recursion_context(
> struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> barrier();
> cpuctx->recursion[rctx]--;
> - put_cpu_var(perf_cpu_context);
> }
> EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
>
> @@ -4134,6 +4124,7 @@ void __perf_sw_event(u32 event_id, u64 n
> struct perf_sample_data data;
> int rctx;
>
> + preempt_disable_notrace();
Why is this needed. We have the recursion context protection already.
Ok, otherwise looks good.
next prev parent reply other threads:[~2010-05-21 9:40 UTC|newest]
Thread overview: 53+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-05-21 9:02 [PATCH 00/10] perf tracepoint and output optimizations Peter Zijlstra
2010-05-21 9:02 ` [PATCH 01/10] perf, trace: Remove IRQ-disable from perf/tracepoint interaction Peter Zijlstra
2010-05-21 17:43 ` Frank Ch. Eigler
2010-05-21 17:53 ` Steven Rostedt
2010-05-21 18:07 ` Frank Ch. Eigler
2010-05-23 12:11 ` Paul Mackerras
2010-05-23 18:16 ` Peter Zijlstra
2010-05-24 4:29 ` Paul Mackerras
2010-05-25 8:06 ` [tip:perf/core] perf, trace: Fix IRQ-disable removal " tip-bot for Peter Zijlstra
2010-05-25 9:02 ` Peter Zijlstra
2010-05-25 9:30 ` [tip:perf/core] perf, trace: Fix !x86 build bug tip-bot for Peter Zijlstra
2010-05-24 11:31 ` [PATCH 01/10] perf, trace: Remove IRQ-disable from perf/tracepoint interaction Frederic Weisbecker
2010-05-25 7:30 ` [PATCH 01a/10] perf, trace: Fix !x86 build issue Peter Zijlstra
2010-05-21 9:02 ` [PATCH 02/10] perf, trace: Use per-tracepoint-per-cpu hlist to track events Peter Zijlstra
2010-05-21 9:40 ` Frederic Weisbecker [this message]
2010-05-21 10:02 ` Peter Zijlstra
2010-05-21 10:13 ` Frederic Weisbecker
2010-05-21 10:15 ` Peter Zijlstra
2010-05-21 10:19 ` Frederic Weisbecker
2010-05-21 10:38 ` Ingo Molnar
2010-05-21 10:51 ` Ingo Molnar
2010-05-21 10:19 ` Peter Zijlstra
2010-05-21 10:21 ` Frederic Weisbecker
2010-05-21 10:34 ` Peter Zijlstra
2010-05-21 10:38 ` Frederic Weisbecker
2010-05-21 10:41 ` [PATCH 02b/10] perf, trace: Fix probe unregister race Peter Zijlstra
2010-05-21 10:43 ` Frederic Weisbecker
2010-05-31 7:19 ` [tip:perf/urgent] perf_events, " tip-bot for Peter Zijlstra
2010-05-21 11:28 ` [tip:perf/core] perf, trace: Optimize tracepoints by using per-tracepoint-per-cpu hlist to track events tip-bot for Peter Zijlstra
2010-05-21 14:04 ` [PATCH 02/10] perf, trace: Use " Steven Rostedt
2010-05-21 14:18 ` Peter Zijlstra
2010-05-21 14:25 ` Peter Zijlstra
2010-05-31 7:20 ` [tip:perf/urgent] perf_events, trace: Fix perf_trace_destroy(), mutex went missing tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 03/10] perf: Ensure IOC_OUTPUT isnt used to create multi-writer buffers Peter Zijlstra
2010-05-21 11:28 ` [tip:perf/core] perf: Ensure that IOC_OUTPUT isn't " tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 04/10] perf-record: Remove -M Peter Zijlstra
2010-05-21 11:28 ` [tip:perf/core] " tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 05/10] perf-record: Share per-cpu buffers Peter Zijlstra
2010-05-21 9:44 ` Frederic Weisbecker
2010-05-21 10:03 ` Peter Zijlstra
2010-05-21 11:29 ` [tip:perf/core] " tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 06/10] perf: Fix wakeup storm for RO mmap()s Peter Zijlstra
2010-05-21 11:29 ` [tip:perf/core] " tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 07/10] perf: Optimize perf_output_copy Peter Zijlstra
2010-05-21 11:29 ` [tip:perf/core] perf: Optimize perf_output_copy() tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 08/10] perf: Optimize the !vmalloc backed buffer Peter Zijlstra
2010-05-21 11:29 ` [tip:perf/core] " tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 09/10] perf: Remove more fastpath code Peter Zijlstra
2010-05-21 11:15 ` Steven Rostedt
2010-05-21 11:18 ` Peter Zijlstra
2010-05-21 11:30 ` [tip:perf/core] perf: Remove more code from the fastpath tip-bot for Peter Zijlstra
2010-05-21 9:02 ` [PATCH 10/10] perf: Optimize perf_tp_event_match Peter Zijlstra
2010-05-21 11:30 ` [tip:perf/core] perf: Optimize perf_tp_event_match() tip-bot for Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100521094014.GA30108@nowhere \
--to=fweisbec@gmail.com \
--cc=a.p.zijlstra@chello.nl \
--cc=acme@infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@elte.hu \
--cc=paulus@samba.org \
--cc=rostedt@goodmis.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.