* [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
@ 2010-01-18 13:42 Xiao Guangrong
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
` (2 more replies)
0 siblings, 3 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-18 13:42 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras, LKML
It only disables preemption in perf_swevent_get_recursion_context(),
which can't avoid the race between hard-irq and NMI contexts.
In this patch, we use an atomic operation to avoid that race and reduce
the size of cpu_ctx->recursion; it also means this patch no longer needs
to disable preemption
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
include/linux/perf_event.h | 2 +-
kernel/perf_event.c | 14 ++++----------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c6f812e..8474ab0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -730,7 +730,7 @@ struct perf_cpu_context {
*
* task, softirq, irq, nmi context
*/
- int recursion[4];
+ unsigned long recursion;
};
struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eae6ff6..77ef16e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3921,7 +3921,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;
if (in_nmi())
@@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;
- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (test_and_set_bit(rctx, &cpuctx->recursion))
return -1;
- }
-
- cpuctx->recursion[rctx]++;
- barrier();
return rctx;
}
@@ -3948,9 +3943,8 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- barrier();
- cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
+
+ clear_bit(rctx, &cpuctx->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 13:42 [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
@ 2010-01-18 13:44 ` Xiao Guangrong
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
` (2 more replies)
2010-01-18 13:55 ` [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Peter Zijlstra
2010-01-18 16:41 ` Frederic Weisbecker
2 siblings, 3 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-18 13:44 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
operate on the event profile buffer, cleaning up redundant code
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
include/linux/ftrace_event.h | 9 +++-
include/trace/ftrace.h | 48 +++-----------------
kernel/trace/trace_event_profile.c | 60 +++++++++++++++++++++++--
kernel/trace/trace_kprobe.c | 86 ++++-------------------------------
kernel/trace/trace_syscalls.c | 71 ++++-------------------------
5 files changed, 87 insertions(+), 187 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3ca9485..5ab4b81 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -137,9 +137,6 @@ struct ftrace_event_call {
#define FTRACE_MAX_PROFILE_SIZE 2048
-extern char *perf_trace_buf;
-extern char *perf_trace_buf_nmi;
-
#define MAX_FILTER_PRED 32
#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
@@ -194,6 +191,12 @@ extern void ftrace_profile_disable(int event_id);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
+extern void *
+ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags);
+extern void
+ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags);
#endif
#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5b46cf9..1007302 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -755,22 +755,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
proto) \
{ \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
- extern int perf_swevent_get_recursion_context(void); \
- extern void perf_swevent_put_recursion_context(int rctx); \
- extern void perf_tp_event(int, u64, u64, void *, int); \
struct ftrace_raw_##call *entry; \
u64 __addr = 0, __count = 1; \
unsigned long irq_flags; \
- struct trace_entry *ent; \
int __entry_size; \
int __data_size; \
- char *trace_buf; \
- char *raw_data; \
- int __cpu; \
int rctx; \
- int pc; \
- \
- pc = preempt_count(); \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
@@ -780,42 +770,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \
"profile buffer not large enough")) \
return; \
- \
- local_irq_save(irq_flags); \
- \
- rctx = perf_swevent_get_recursion_context(); \
- if (rctx < 0) \
- goto end_recursion; \
- \
- __cpu = smp_processor_id(); \
- \
- if (in_nmi()) \
- trace_buf = rcu_dereference(perf_trace_buf_nmi); \
- else \
- trace_buf = rcu_dereference(perf_trace_buf); \
- \
- if (!trace_buf) \
- goto end; \
- \
- raw_data = per_cpu_ptr(trace_buf, __cpu); \
- \
- *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
- entry = (struct ftrace_raw_##call *)raw_data; \
- ent = &entry->ent; \
- tracing_generic_entry_update(ent, irq_flags, pc); \
- ent->type = event_call->id; \
- \
+ entry = (struct ftrace_raw_##call *)ftrace_profile_buf_begin( \
+ __entry_size, event_call->id, &rctx, &irq_flags); \
+ if (!entry) \
+ return; \
tstruct \
\
{ assign; } \
\
- perf_tp_event(event_call->id, __addr, __count, entry, \
- __entry_size); \
- \
-end: \
- perf_swevent_put_recursion_context(rctx); \
-end_recursion: \
- local_irq_restore(irq_flags); \
+ ftrace_profile_buf_end(entry, __entry_size, rctx, __addr, \
+ __count, irq_flags); \
}
#undef DEFINE_EVENT
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573..f0fa16b 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -9,11 +9,8 @@
#include "trace.h"
-char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
-
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;
typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
@@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
}
mutex_unlock(&event_mutex);
}
+
+void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags)
+{
+ struct trace_entry *entry;
+ char *trace_buf, *raw_data;
+ int pc, cpu;
+
+ pc = preempt_count();
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(*irq_flags);
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ goto err_recursion;
+
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto err;
+
+ raw_data = per_cpu_ptr(trace_buf, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ entry = (struct trace_entry *)raw_data;
+ tracing_generic_entry_update(entry, *irq_flags, pc);
+ entry->type = type;
+
+ return raw_data;
+err:
+ perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+ local_irq_restore(*irq_flags);
+ return NULL;
+}
+
+void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags)
+{
+ struct trace_entry *entry = raw_data;
+
+ perf_tp_event(entry->type, addr, count, raw_data, size);
+ perf_swevent_put_recursion_context(rctx);
+ local_irq_restore(irq_flags);
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aa19b6a..6d478c1 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1223,14 +1223,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;
- pc = preempt_count();
__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1238,45 +1234,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
"profile buffer not large enough"))
return 0;
- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;
- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ip, 1, entry, size);
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);
return 0;
}
@@ -1288,14 +1255,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;
- pc = preempt_count();
__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1303,46 +1266,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return 0;
- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kretprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;
- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);
return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f6b0712..fb40e91 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
- char *trace_buf;
- char *raw_data;
int syscall_nr;
int rctx;
int size;
- int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
"profile buffer not large enough"))
return;
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ rec = (struct syscall_trace_enter *)ftrace_profile_buf_begin(size,
+ sys_data->enter_event->id, &rctx, &flags);
+ if (!rec)
+ return;
- rec = (struct syscall_trace_enter *) raw_data;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_event->id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_profile_buf_end(rec, size, rctx, 0, 1, flags);
}
int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
- char *trace_buf;
- char *raw_data;
int rctx;
int size;
- int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
"exit event has grown above profile buffer size"))
return;
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
- rec = (struct syscall_trace_exit *)raw_data;
+ rec = (struct syscall_trace_exit *)ftrace_profile_buf_begin(size,
+ sys_data->exit_event->id, &rctx, &flags);
+ if (!rec)
+ return;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->exit_event->id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_profile_buf_end(rec, size, rctx, 0, 1, flags);
}
int prof_sysexit_enable(struct ftrace_event_call *call)
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 3/3] tracing/kprobe: cleanup unused return value of function
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
@ 2010-01-18 13:46 ` Xiao Guangrong
2010-01-18 16:16 ` Masami Hiramatsu
` (3 more replies)
2010-01-18 16:21 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Masami Hiramatsu
2010-01-18 17:11 ` Frederic Weisbecker
2 siblings, 4 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-18 13:46 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
Those functions' return values are meaningless
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6d478c1..5869819 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
};
/* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry *entry;
@@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;
entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}
/* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;
entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}
/* Event entry printers */
@@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;
entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
@@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);
- return 0;
+ return ;
}
/* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;
entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
@@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);
- return 0;
+ return ;
}
static int probe_profile_enable(struct ftrace_event_call *call)
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-18 13:42 [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
@ 2010-01-18 13:55 ` Peter Zijlstra
2010-01-19 7:36 ` Xiao Guangrong
2010-01-18 16:41 ` Frederic Weisbecker
2 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2010-01-18 13:55 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Ingo Molnar, Frederic Weisbecker, Paul Mackerras, LKML
On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
> It only disable preemption in perf_swevent_get_recursion_context()
> it can't avoid race of hard-irq and NMI
>
> In this patch, we use atomic operation to avoid it and reduce
> cpu_ctx->recursion size, it also make this patch no need diable
> preemption
Uhm why?
This patch looks terminally broken
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 3/3] tracing/kprobe: cleanup unused return value of function
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
@ 2010-01-18 16:16 ` Masami Hiramatsu
2010-01-19 8:37 ` [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
` (2 subsequent siblings)
3 siblings, 0 replies; 25+ messages in thread
From: Masami Hiramatsu @ 2010-01-18 16:16 UTC (permalink / raw)
To: Xiao Guangrong
Cc: Ingo Molnar, Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Jason Baron, LKML
Xiao Guangrong wrote:
> Those function's return value is meaningless
Right, now these functions are called from dispatchers,
and return values are wasted.
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Thanks!
> ---
> kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
> 1 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 6d478c1..5869819 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
> };
>
> /* Kprobe handler */
> -static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> +static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> struct kprobe_trace_entry *entry;
> @@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
> event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
> irq_flags, pc);
> if (!event)
> - return 0;
> + return ;
>
> entry = ring_buffer_event_data(event);
> entry->nargs = tp->nr_args;
> @@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
>
> if (!filter_current_check_discard(buffer, call, entry, event))
> trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
> - return 0;
> + return ;
> }
>
> /* Kretprobe handler */
> -static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> +static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> @@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
> irq_flags, pc);
> if (!event)
> - return 0;
> + return ;
>
> entry = ring_buffer_event_data(event);
> entry->nargs = tp->nr_args;
> @@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
> if (!filter_current_check_discard(buffer, call, entry, event))
> trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
>
> - return 0;
> + return ;
> }
>
> /* Event entry printers */
> @@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
> #ifdef CONFIG_PERF_EVENTS
>
> /* Kprobe profile handler */
> -static __kprobes int kprobe_profile_func(struct kprobe *kp,
> +static __kprobes void kprobe_profile_func(struct kprobe *kp,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> @@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
> size -= sizeof(u32);
> if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
> "profile buffer not large enough"))
> - return 0;
> + return ;
>
> entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
> if (!entry)
> - return 0;
> + return ;
>
> entry->nargs = tp->nr_args;
> entry->ip = (unsigned long)kp->addr;
> @@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
>
> ftrace_profile_buf_end(entry, size, rctx, entry->ip, 1, irq_flags);
>
> - return 0;
> + return ;
> }
>
> /* Kretprobe profile handler */
> -static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
> +static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
> struct pt_regs *regs)
> {
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> @@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
> size -= sizeof(u32);
> if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
> "profile buffer not large enough"))
> - return 0;
> + return ;
>
> entry = ftrace_profile_buf_begin(size, call->id, &rctx, &irq_flags);
> if (!entry)
> - return 0;
> + return ;
>
> entry->nargs = tp->nr_args;
> entry->func = (unsigned long)tp->rp.kp.addr;
> @@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
>
> ftrace_profile_buf_end(entry, size, rctx, entry->ret_ip, 1, irq_flags);
>
> - return 0;
> + return ;
> }
>
> static int probe_profile_enable(struct ftrace_event_call *call)
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhiramat@redhat.com
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
@ 2010-01-18 16:21 ` Masami Hiramatsu
2010-01-18 17:20 ` Frederic Weisbecker
2010-01-18 17:11 ` Frederic Weisbecker
2 siblings, 1 reply; 25+ messages in thread
From: Masami Hiramatsu @ 2010-01-18 16:21 UTC (permalink / raw)
To: Xiao Guangrong
Cc: Ingo Molnar, Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Jason Baron, LKML
Xiao Guangrong wrote:
> Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> operate event profile buffer, clean up redundant code
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
[...]
> diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
> index 9e25573..f0fa16b 100644
> --- a/kernel/trace/trace_event_profile.c
> +++ b/kernel/trace/trace_event_profile.c
> @@ -9,11 +9,8 @@
> #include "trace.h"
>
>
> -char *perf_trace_buf;
> -EXPORT_SYMBOL_GPL(perf_trace_buf);
> -
> -char *perf_trace_buf_nmi;
> -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
> +static char *perf_trace_buf;
> +static char *perf_trace_buf_nmi;
>
> typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
>
> @@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
> }
> mutex_unlock(&event_mutex);
> }
> +
> +void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
> + unsigned long *irq_flags)
> +{
> + struct trace_entry *entry;
> + char *trace_buf, *raw_data;
> + int pc, cpu;
> +
> + pc = preempt_count();
> +
> + /* Protect the per cpu buffer, begin the rcu read side */
> + local_irq_save(*irq_flags);
> +
> + *rctxp = perf_swevent_get_recursion_context();
> + if (*rctxp < 0)
> + goto err_recursion;
> +
> + cpu = smp_processor_id();
> +
> + if (in_nmi())
> + trace_buf = rcu_dereference(perf_trace_buf_nmi);
> + else
> + trace_buf = rcu_dereference(perf_trace_buf);
> +
> + if (!trace_buf)
> + goto err;
> +
> + raw_data = per_cpu_ptr(trace_buf, cpu);
> +
> + /* zero the dead bytes from align to not leak stack to user */
> + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
> +
> + entry = (struct trace_entry *)raw_data;
> + tracing_generic_entry_update(entry, *irq_flags, pc);
> + entry->type = type;
> +
> + return raw_data;
> +err:
> + perf_swevent_put_recursion_context(*rctxp);
> +err_recursion:
> + local_irq_restore(*irq_flags);
> + return NULL;
> +}
> +
> +void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
> + u64 count, unsigned long irq_flags)
> +{
> + struct trace_entry *entry = raw_data;
> +
> + perf_tp_event(entry->type, addr, count, raw_data, size);
> + perf_swevent_put_recursion_context(rctx);
> + local_irq_restore(irq_flags);
> +}
Hmm, could you make it inline-functions or add __kprobes?
Because it is called from kprobes, we don't want to probe
the function which will be called from kprobes handlers itself.
(IMHO, from the viewpoint of performance, inline-function
could be better.)
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhiramat@redhat.com
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-18 13:42 [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-18 13:55 ` [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Peter Zijlstra
@ 2010-01-18 16:41 ` Frederic Weisbecker
2010-01-19 1:19 ` Xiao Guangrong
2 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-18 16:41 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Ingo Molnar, Peter Zijlstra, Paul Mackerras, LKML
On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
> It only disable preemption in perf_swevent_get_recursion_context()
> it can't avoid race of hard-irq and NMI
>
> In this patch, we use atomic operation to avoid it and reduce
> cpu_ctx->recursion size, it also make this patch no need diable
> preemption
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
I don't understand what is racy in what we have currently.
> int perf_swevent_get_recursion_context(void)
> {
> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> int rctx;
>
> if (in_nmi())
> @@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
> else
> rctx = 0;
>
> - if (cpuctx->recursion[rctx]) {
> - put_cpu_var(perf_cpu_context);
> + if (test_and_set_bit(rctx, &cpuctx->recursion))
> return -1;
This looks broken. We don't call back perf_swevent_put_recursion_context
in fail case, so the bit won't ever be cleared once we recurse.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
2010-01-18 16:21 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Masami Hiramatsu
@ 2010-01-18 17:11 ` Frederic Weisbecker
2 siblings, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-18 17:11 UTC (permalink / raw)
To: Xiao Guangrong
Cc: Ingo Molnar, Peter Zijlstra, Paul Mackerras, Masami Hiramatsu,
Jason Baron, LKML
On Mon, Jan 18, 2010 at 09:44:56PM +0800, Xiao Guangrong wrote:
> Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> operate event profile buffer, clean up redundant code
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> ---
Looks good, nice and desired cleanup, thanks!
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 16:21 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Masami Hiramatsu
@ 2010-01-18 17:20 ` Frederic Weisbecker
2010-01-18 17:48 ` Masami Hiramatsu
0 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-18 17:20 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Xiao Guangrong, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
> Xiao Guangrong wrote:
> > Introduce ftrace_profile_buf_begin() and ftrace_profile_buf_end() to
> > operate event profile buffer, clean up redundant code
> >
> > Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> [...]
>
> > diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
> > index 9e25573..f0fa16b 100644
> > --- a/kernel/trace/trace_event_profile.c
> > +++ b/kernel/trace/trace_event_profile.c
> > @@ -9,11 +9,8 @@
> > #include "trace.h"
> >
> >
> > -char *perf_trace_buf;
> > -EXPORT_SYMBOL_GPL(perf_trace_buf);
> > -
> > -char *perf_trace_buf_nmi;
> > -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
> > +static char *perf_trace_buf;
> > +static char *perf_trace_buf_nmi;
> >
> > typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
> >
> > @@ -120,3 +117,56 @@ void ftrace_profile_disable(int event_id)
> > }
> > mutex_unlock(&event_mutex);
> > }
> > +
> > +void *ftrace_profile_buf_begin(int size, unsigned short type, int *rctxp,
> > + unsigned long *irq_flags)
> > +{
> > + struct trace_entry *entry;
> > + char *trace_buf, *raw_data;
> > + int pc, cpu;
> > +
> > + pc = preempt_count();
> > +
> > + /* Protect the per cpu buffer, begin the rcu read side */
> > + local_irq_save(*irq_flags);
> > +
> > + *rctxp = perf_swevent_get_recursion_context();
> > + if (*rctxp < 0)
> > + goto err_recursion;
> > +
> > + cpu = smp_processor_id();
> > +
> > + if (in_nmi())
> > + trace_buf = rcu_dereference(perf_trace_buf_nmi);
> > + else
> > + trace_buf = rcu_dereference(perf_trace_buf);
> > +
> > + if (!trace_buf)
> > + goto err;
> > +
> > + raw_data = per_cpu_ptr(trace_buf, cpu);
> > +
> > + /* zero the dead bytes from align to not leak stack to user */
> > + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
> > +
> > + entry = (struct trace_entry *)raw_data;
> > + tracing_generic_entry_update(entry, *irq_flags, pc);
> > + entry->type = type;
> > +
> > + return raw_data;
> > +err:
> > + perf_swevent_put_recursion_context(*rctxp);
> > +err_recursion:
> > + local_irq_restore(*irq_flags);
> > + return NULL;
> > +}
> > +
> > +void ftrace_profile_buf_end(void *raw_data, int size, int rctx, u64 addr,
> > + u64 count, unsigned long irq_flags)
> > +{
> > + struct trace_entry *entry = raw_data;
> > +
> > + perf_tp_event(entry->type, addr, count, raw_data, size);
> > + perf_swevent_put_recursion_context(rctx);
> > + local_irq_restore(irq_flags);
> > +}
>
> Hmm, could you make it inline-functions or add __kprobes?
> Because it is called from kprobes, we don't want to probe
> the function which will be called from kprobes handlers itself.
>
> (IMHO, from the viewpoint of performance, inline-function
> could be better.)
>
> Thank you,
Yeah, may be inline ftrace_profile_buf_end, would be better.
But we shouldn't inline ftrace_profile_buf_begin() I guess,
considering its size.
While at it, may be let's choose more verbose names
like
ftrace_profile_buf_fill() and ftrace_profile_buf_submit().
Also, profile is a bit of a misnomer. Not a problem since
ftrace_profile_templ_##call() is already a misnomer, but
we should start a bit of a rename. Sometimes, perf only
profiles trace events as counters and sometimes it records
the raw samples too.
So, as more generic names, I would suggest:
ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
At least it's unambiguous, I hope...
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 17:20 ` Frederic Weisbecker
@ 2010-01-18 17:48 ` Masami Hiramatsu
2010-01-18 18:02 ` Frederic Weisbecker
2010-01-19 1:26 ` Xiao Guangrong
0 siblings, 2 replies; 25+ messages in thread
From: Masami Hiramatsu @ 2010-01-18 17:48 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Xiao Guangrong, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
>> Hmm, could you make it inline-functions or add __kprobes?
>> Because it is called from kprobes, we don't want to probe
>> the function which will be called from kprobes handlers itself.
>>
>> (IMHO, from the viewpoint of performance, inline-function
>> could be better.)
>>
>> Thank you,
>
>
>
> Yeah, may be inline ftrace_profile_buf_end, would be better.
> But we shouldn't inline ftrace_profile_buf_begin() I guess,
> considering its size.
Indeed, especially for events...
> While at it, may be let's choose more verbose names
> like
>
> ftrace_profile_buf_fill() and ftrace_profile_buf_submit().
>
> Also, profile is a bit of a misnomer. Not a problem since
> ftrace_profile_templ_##call() is already a misnomer, but
> we should start a bit of a rename. Sometimes, perf only
> profiles trace events as counters and sometimes it records
> the raw samples too.
>
> So, as more generic names, I would suggest:
>
> ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
The actual buffer filling is done in the profile handlers,
so I think ftrace_perf_buf_prepare() may be better :-)
ftrace_perf_buf_submit() is fine with me :-)
Thank you,
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhiramat@redhat.com
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 17:48 ` Masami Hiramatsu
@ 2010-01-18 18:02 ` Frederic Weisbecker
2010-01-19 1:26 ` Xiao Guangrong
1 sibling, 0 replies; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-18 18:02 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Xiao Guangrong, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
On Mon, Jan 18, 2010 at 12:48:48PM -0500, Masami Hiramatsu wrote:
> > So, as more generic names, I would suggest:
> >
> > ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
>
>
> Actual filling buffer is done in the profile handlers,
> so I think ftrace_perf_buf_prepare() may be better :-)
> ftrace_perf_buf_submit is good to me:-)
Oh you're right :)
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-18 16:41 ` Frederic Weisbecker
@ 2010-01-19 1:19 ` Xiao Guangrong
2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 8:58 ` Frederic Weisbecker
0 siblings, 2 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 1:19 UTC (permalink / raw)
To: Frederic Weisbecker; +Cc: Ingo Molnar, Peter Zijlstra, Paul Mackerras, LKML
Frederic Weisbecker wrote:
> On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
>> It only disable preemption in perf_swevent_get_recursion_context()
>> it can't avoid race of hard-irq and NMI
>>
>> In this patch, we use atomic operation to avoid it and reduce
>> cpu_ctx->recursion size, it also make this patch no need diable
>> preemption
>>
>> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
>
>
>
> I don't understand what is racy in what we have currently.
>
It's because hard-irq handlers (an interrupt can be handled with interrupts
enabled) and NMIs can nest, for example:
int perf_swevent_get_recursion_context(void)
{
......
if (cpuctx->recursion[rctx]) {
put_cpu_var(perf_cpu_context);
return -1;
}
/*
* Another interruption handler/NMI will re-enter there if it
* happed, it make the recursion value chaotic
*/
cpuctx->recursion[rctx]++;
......
}
>
>
>> int perf_swevent_get_recursion_context(void)
>> {
>> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
>> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
>> int rctx;
>>
>> if (in_nmi())
>> @@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
>> else
>> rctx = 0;
>>
>> - if (cpuctx->recursion[rctx]) {
>> - put_cpu_var(perf_cpu_context);
>> + if (test_and_set_bit(rctx, &cpuctx->recursion))
>> return -1;
>
>
>
> This looks broken. We don't call back perf_swevent_put_recursion_context
> in fail case, so the bit won't ever be cleared once we recurse.
>
Um, I think we can't clear the bit in this failure case; consider the below
sequence:
path A: path B
set bit but find the bit already set
atomic set bit |
| |
V |
handle SW event |
| V
V exit and not clear the bit
atomic clear bit
After A and B, the bit is still zero
Right? :-)
Thanks,
Xiao
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-18 17:48 ` Masami Hiramatsu
2010-01-18 18:02 ` Frederic Weisbecker
@ 2010-01-19 1:26 ` Xiao Guangrong
2010-01-19 9:00 ` Frederic Weisbecker
1 sibling, 1 reply; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 1:26 UTC (permalink / raw)
To: Masami Hiramatsu
Cc: Frederic Weisbecker, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
Masami Hiramatsu wrote:
> Frederic Weisbecker wrote:
>> On Mon, Jan 18, 2010 at 11:21:46AM -0500, Masami Hiramatsu wrote:
>>> Hmm, could you make it inline-functions or add __kprobes?
>>> Because it is called from kprobes, we don't want to probe
>>> the function which will be called from kprobes handlers itself.
>>>
>>> (IMHO, from the viewpoint of performance, inline-function
>>> could be better.)
>>>
>>> Thank you,
>>
>>
>> Yeah, may be inline ftrace_profile_buf_end, would be better.
>> But we shouldn't inline ftrace_profile_buf_begin() I guess,
>> considering its size.
>
> Indeed, especially for events...
>
Thanks Masami and Frederic, I'll fix it in the next version
>> While at it, may be let's choose more verbose names
>> like
>>
>> ftrace_profile_buf_fill() and ftrace_profile_buf_submit().
>>
>> Also, profile is a bit of a misnomer. Not a problem since
>> ftrace_profile_templ_##call() is already a misnomer, but
>> we should start a bit of a rename. Sometimes, perf only
>> profiles trace events as counters and sometimes it records
>> the raw samples too.
>>
>> So, as more generic names, I would suggest:
>>
>> ftrace_perf_buf_fill() and ftrace_perf_buf_submit().
>
>
> Actual filling buffer is done in the profile handlers,
> so I think ftrace_perf_buf_prepare() may be better :-)
> ftrace_perf_buf_submit is good to me:-)
>
OK, I'll rename those functions and add __kprobes to ftrace_perf_buf_prepare();
I guess I can add your Acked-by :-)
Thanks,
Xiao
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-18 13:55 ` [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Peter Zijlstra
@ 2010-01-19 7:36 ` Xiao Guangrong
2010-01-19 8:41 ` Peter Zijlstra
0 siblings, 1 reply; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 7:36 UTC (permalink / raw)
To: Peter Zijlstra; +Cc: Ingo Molnar, Frederic Weisbecker, Paul Mackerras, LKML
Hi Peter,
Peter Zijlstra wrote:
> On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
>> It only disable preemption in perf_swevent_get_recursion_context()
>> it can't avoid race of hard-irq and NMI
>>
>> In this patch, we use atomic operation to avoid it and reduce
>> cpu_ctx->recursion size, it also make this patch no need diable
>> preemption
>
> Uhm why?
>
> This patch looks terminally broken
Please see my explanation in another mail:
http://lkml.org/lkml/2010/1/18/501
Thanks,
Xiao
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
2010-01-18 16:16 ` Masami Hiramatsu
@ 2010-01-19 8:37 ` Xiao Guangrong
2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 8:39 ` [PATCH 2/3 v2] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-19 8:41 ` [PATCH 3/3 v2] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
3 siblings, 1 reply; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 8:37 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
perf_swevent_get_recursion_context() only disables preemption, which can't
avoid races between hard-irq and NMI contexts, since they nest, re-enter
this path and make the recursion value chaotic
In this patch, we use an atomic operation to avoid the race and reduce the
size of cpu_ctx->recursion; this also removes the need to disable
preemption
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
include/linux/perf_event.h | 2 +-
kernel/perf_event.c | 14 ++++----------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c6f812e..8474ab0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -730,7 +730,7 @@ struct perf_cpu_context {
*
* task, softirq, irq, nmi context
*/
- int recursion[4];
+ unsigned long recursion;
};
struct perf_output_handle {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eae6ff6..77ef16e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3921,7 +3921,7 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
int perf_swevent_get_recursion_context(void)
{
- struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
int rctx;
if (in_nmi())
@@ -3933,13 +3933,8 @@ int perf_swevent_get_recursion_context(void)
else
rctx = 0;
- if (cpuctx->recursion[rctx]) {
- put_cpu_var(perf_cpu_context);
+ if (test_and_set_bit(rctx, &cpuctx->recursion))
return -1;
- }
-
- cpuctx->recursion[rctx]++;
- barrier();
return rctx;
}
@@ -3948,9 +3943,8 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
void perf_swevent_put_recursion_context(int rctx)
{
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
- barrier();
- cpuctx->recursion[rctx]--;
- put_cpu_var(perf_cpu_context);
+
+ clear_bit(rctx, &cpuctx->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* [PATCH 2/3 v2] perf_event: cleanup for event profile buffer operation
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
2010-01-18 16:16 ` Masami Hiramatsu
2010-01-19 8:37 ` [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
@ 2010-01-19 8:39 ` Xiao Guangrong
2010-01-19 8:41 ` [PATCH 3/3 v2] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
3 siblings, 0 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 8:39 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to
operate event profile buffer, clean up redundant code
Changelog v1->v2:
- Rename the functions to address Masami's and Frederic's suggestions
- Add __kprobes to ftrace_perf_buf_prepare() and make
ftrace_perf_buf_submit() inline, addressing Masami's suggestion
- Export ftrace_perf_buf_prepare() since modules will use it
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
include/linux/ftrace_event.h | 18 ++++++-
include/trace/ftrace.h | 48 +++-----------------
kernel/trace/trace_event_profile.c | 52 +++++++++++++++++++--
kernel/trace/trace_kprobe.c | 86 ++++-------------------------------
kernel/trace/trace_syscalls.c | 71 ++++-------------------------
5 files changed, 88 insertions(+), 187 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 3ca9485..6b7c444 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -5,6 +5,7 @@
#include <linux/trace_seq.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/perf_event.h>
struct trace_array;
struct tracer;
@@ -137,9 +138,6 @@ struct ftrace_event_call {
#define FTRACE_MAX_PROFILE_SIZE 2048
-extern char *perf_trace_buf;
-extern char *perf_trace_buf_nmi;
-
#define MAX_FILTER_PRED 32
#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
@@ -194,6 +192,20 @@ extern void ftrace_profile_disable(int event_id);
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
extern void ftrace_profile_free_filter(struct perf_event *event);
+extern void *
+ftrace_perf_buf_prepare(int size, unsigned short type, int *rctxp,
+ unsigned long *irq_flags);
+
+static inline void
+ftrace_perf_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+ u64 count, unsigned long irq_flags)
+{
+ struct trace_entry *entry = raw_data;
+
+ perf_tp_event(entry->type, addr, count, raw_data, size);
+ perf_swevent_put_recursion_context(rctx);
+ local_irq_restore(irq_flags);
+}
#endif
#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5b46cf9..fb2c5bd 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -755,22 +755,12 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
proto) \
{ \
struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
- extern int perf_swevent_get_recursion_context(void); \
- extern void perf_swevent_put_recursion_context(int rctx); \
- extern void perf_tp_event(int, u64, u64, void *, int); \
struct ftrace_raw_##call *entry; \
u64 __addr = 0, __count = 1; \
unsigned long irq_flags; \
- struct trace_entry *ent; \
int __entry_size; \
int __data_size; \
- char *trace_buf; \
- char *raw_data; \
- int __cpu; \
int rctx; \
- int pc; \
- \
- pc = preempt_count(); \
\
__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
@@ -780,42 +770,16 @@ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \
if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \
"profile buffer not large enough")) \
return; \
- \
- local_irq_save(irq_flags); \
- \
- rctx = perf_swevent_get_recursion_context(); \
- if (rctx < 0) \
- goto end_recursion; \
- \
- __cpu = smp_processor_id(); \
- \
- if (in_nmi()) \
- trace_buf = rcu_dereference(perf_trace_buf_nmi); \
- else \
- trace_buf = rcu_dereference(perf_trace_buf); \
- \
- if (!trace_buf) \
- goto end; \
- \
- raw_data = per_cpu_ptr(trace_buf, __cpu); \
- \
- *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
- entry = (struct ftrace_raw_##call *)raw_data; \
- ent = &entry->ent; \
- tracing_generic_entry_update(ent, irq_flags, pc); \
- ent->type = event_call->id; \
- \
+ entry = (struct ftrace_raw_##call *)ftrace_perf_buf_prepare( \
+ __entry_size, event_call->id, &rctx, &irq_flags); \
+ if (!entry) \
+ return; \
tstruct \
\
{ assign; } \
\
- perf_tp_event(event_call->id, __addr, __count, entry, \
- __entry_size); \
- \
-end: \
- perf_swevent_put_recursion_context(rctx); \
-end_recursion: \
- local_irq_restore(irq_flags); \
+ ftrace_perf_buf_submit(entry, __entry_size, rctx, __addr, \
+ __count, irq_flags); \
}
#undef DEFINE_EVENT
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573..f0d6930 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
*/
#include <linux/module.h>
+#include <linux/kprobes.h>
#include "trace.h"
-char *perf_trace_buf;
-EXPORT_SYMBOL_GPL(perf_trace_buf);
-
-char *perf_trace_buf_nmi;
-EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
+static char *perf_trace_buf;
+static char *perf_trace_buf_nmi;
typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
}
mutex_unlock(&event_mutex);
}
+
+__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
+ int *rctxp, unsigned long *irq_flags)
+{
+ struct trace_entry *entry;
+ char *trace_buf, *raw_data;
+ int pc, cpu;
+
+ pc = preempt_count();
+
+ /* Protect the per cpu buffer, begin the rcu read side */
+ local_irq_save(*irq_flags);
+
+ *rctxp = perf_swevent_get_recursion_context();
+ if (*rctxp < 0)
+ goto err_recursion;
+
+ cpu = smp_processor_id();
+
+ if (in_nmi())
+ trace_buf = rcu_dereference(perf_trace_buf_nmi);
+ else
+ trace_buf = rcu_dereference(perf_trace_buf);
+
+ if (!trace_buf)
+ goto err;
+
+ raw_data = per_cpu_ptr(trace_buf, cpu);
+
+ /* zero the dead bytes from align to not leak stack to user */
+ *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+ entry = (struct trace_entry *)raw_data;
+ tracing_generic_entry_update(entry, *irq_flags, pc);
+ entry->type = type;
+
+ return raw_data;
+err:
+ perf_swevent_put_recursion_context(*rctxp);
+err_recursion:
+ local_irq_restore(*irq_flags);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aa19b6a..2714d23 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1223,14 +1223,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct ftrace_event_call *call = &tp->call;
struct kprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;
- pc = preempt_count();
__size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1238,45 +1234,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
"profile buffer not large enough"))
return 0;
- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;
- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ip, 1, entry, size);
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
return 0;
}
@@ -1288,14 +1255,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
struct ftrace_event_call *call = &tp->call;
struct kretprobe_trace_entry *entry;
- struct trace_entry *ent;
- int size, __size, i, pc, __cpu;
+ int size, __size, i;
unsigned long irq_flags;
- char *trace_buf;
- char *raw_data;
int rctx;
- pc = preempt_count();
__size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
@@ -1303,46 +1266,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
"profile buffer not large enough"))
return 0;
- /*
- * Protect the non nmi buffer
- * This also protects the rcu read side
- */
- local_irq_save(irq_flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- __cpu = smp_processor_id();
-
- if (in_nmi())
- trace_buf = rcu_dereference(perf_trace_buf_nmi);
- else
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, __cpu);
-
- /* Zero dead bytes from alignment to avoid buffer leak to userspace */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
- entry = (struct kretprobe_trace_entry *)raw_data;
- ent = &entry->ent;
+ entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
+ if (!entry)
+ return 0;
- tracing_generic_entry_update(ent, irq_flags, pc);
- ent->type = call->id;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
for (i = 0; i < tp->nr_args; i++)
entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
- perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(irq_flags);
+ ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f6b0712..6cce6a8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
unsigned long flags;
- char *trace_buf;
- char *raw_data;
int syscall_nr;
int rctx;
int size;
- int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
"profile buffer not large enough"))
return;
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+ rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
+ sys_data->enter_event->id, &rctx, &flags);
+ if (!rec)
+ return;
- rec = (struct syscall_trace_enter *) raw_data;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->enter_event->id;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
}
int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
struct syscall_trace_exit *rec;
unsigned long flags;
int syscall_nr;
- char *trace_buf;
- char *raw_data;
int rctx;
int size;
- int cpu;
syscall_nr = syscall_get_nr(current, regs);
if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
"exit event has grown above profile buffer size"))
return;
- /* Protect the per cpu buffer, begin the rcu read side */
- local_irq_save(flags);
-
- rctx = perf_swevent_get_recursion_context();
- if (rctx < 0)
- goto end_recursion;
-
- cpu = smp_processor_id();
-
- trace_buf = rcu_dereference(perf_trace_buf);
-
- if (!trace_buf)
- goto end;
-
- raw_data = per_cpu_ptr(trace_buf, cpu);
-
- /* zero the dead bytes from align to not leak stack to user */
- *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
-
- rec = (struct syscall_trace_exit *)raw_data;
+ rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
+ sys_data->exit_event->id, &rctx, &flags);
+ if (!rec)
+ return;
- tracing_generic_entry_update(&rec->ent, 0, 0);
- rec->ent.type = sys_data->exit_event->id;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size);
-
-end:
- perf_swevent_put_recursion_context(rctx);
-end_recursion:
- local_irq_restore(flags);
+ ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
}
int prof_sysexit_enable(struct ftrace_event_call *call)
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 7:36 ` Xiao Guangrong
@ 2010-01-19 8:41 ` Peter Zijlstra
0 siblings, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2010-01-19 8:41 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Ingo Molnar, Frederic Weisbecker, Paul Mackerras, LKML
On Tue, 2010-01-19 at 15:36 +0800, Xiao Guangrong wrote:
> Hi Peter,
>
> Peter Zijlstra wrote:
> > On Mon, 2010-01-18 at 21:42 +0800, Xiao Guangrong wrote:
> >> It only disable preemption in perf_swevent_get_recursion_context()
> >> it can't avoid race of hard-irq and NMI
> >>
> >> In this patch, we use atomic operation to avoid it and reduce
> >> cpu_ctx->recursion size, it also make this patch no need diable
> >> preemption
> >
> > Uhm why?
> >
> > This patch looks terminally broken
>
> Please see my explanation in another mail:
> http://lkml.org/lkml/2010/1/18/501
Still its not going to happen, we need those 4 recursion contexts.
Otherwise we could not receive a software event from an IRQ while we are
processing a software event from process context, etc.
^ permalink raw reply [flat|nested] 25+ messages in thread
* [PATCH 3/3 v2] tracing/kprobe: cleanup unused return value of function
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
` (2 preceding siblings ...)
2010-01-19 8:39 ` [PATCH 2/3 v2] perf_event: cleanup for event profile buffer operation Xiao Guangrong
@ 2010-01-19 8:41 ` Xiao Guangrong
3 siblings, 0 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 8:41 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
Those functions' return values are meaningless
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
---
kernel/trace/trace_kprobe.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2714d23..de2cf85 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
};
/* Kprobe handler */
-static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
struct kprobe_trace_entry *entry;
@@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;
entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -972,11 +972,11 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}
/* Kretprobe handler */
-static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -995,7 +995,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
irq_flags, pc);
if (!event)
- return 0;
+ return ;
entry = ring_buffer_event_data(event);
entry->nargs = tp->nr_args;
@@ -1007,7 +1007,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
if (!filter_current_check_discard(buffer, call, entry, event))
trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
- return 0;
+ return ;
}
/* Event entry printers */
@@ -1217,7 +1217,7 @@ static int set_print_fmt(struct trace_probe *tp)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes int kprobe_profile_func(struct kprobe *kp,
+static __kprobes void kprobe_profile_func(struct kprobe *kp,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
@@ -1232,11 +1232,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;
entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;
entry->nargs = tp->nr_args;
entry->ip = (unsigned long)kp->addr;
@@ -1245,11 +1245,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
- return 0;
+ return ;
}
/* Kretprobe profile handler */
-static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
+static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1264,11 +1264,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
size -= sizeof(u32);
if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
"profile buffer not large enough"))
- return 0;
+ return ;
entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
if (!entry)
- return 0;
+ return ;
entry->nargs = tp->nr_args;
entry->func = (unsigned long)tp->rp.kp.addr;
@@ -1278,7 +1278,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
- return 0;
+ return ;
}
static int probe_profile_enable(struct ftrace_event_call *call)
--
1.6.1.2
^ permalink raw reply related [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 1:19 ` Xiao Guangrong
@ 2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 8:58 ` Frederic Weisbecker
1 sibling, 0 replies; 25+ messages in thread
From: Peter Zijlstra @ 2010-01-19 8:46 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Frederic Weisbecker, Ingo Molnar, Paul Mackerras, LKML
On Tue, 2010-01-19 at 09:19 +0800, Xiao Guangrong wrote:
>
>
> It's because hard-irq(we can handle interruption with interruption enabled)
> and NMI are nested, for example:
>
> int perf_swevent_get_recursion_context(void)
> {
> ......
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> /*
> * Another interruption handler/NMI will re-enter there if it
> * happed, it make the recursion value chaotic
> */
> cpuctx->recursion[rctx]++;
> ......
> }
This doesn't make any sense at all, if IRQs are nested (bad bad bad)
then it will still end up with the same recursion context and detect
that recursion and bail on the nested one. So what's the problem?
NMIs are not allowed to nest, ever.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 8:37 ` [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
@ 2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 9:06 ` Xiao Guangrong
0 siblings, 1 reply; 25+ messages in thread
From: Peter Zijlstra @ 2010-01-19 8:46 UTC (permalink / raw)
To: Xiao Guangrong
Cc: Ingo Molnar, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
On Tue, 2010-01-19 at 16:37 +0800, Xiao Guangrong wrote:
> It only disable preemption in perf_swevent_get_recursion_context(),
> it can't avoid race of hard-irq and NMI since they are nested that
> will re-enter this path and make the recursion value chaotic
>
> In this patch, we use atomic operation to avoid it and reduce
> cpu_ctx->recursion size, it also make this patch no need disable
> preemption
NAK
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 1:19 ` Xiao Guangrong
2010-01-19 8:46 ` Peter Zijlstra
@ 2010-01-19 8:58 ` Frederic Weisbecker
2010-01-19 9:09 ` Xiao Guangrong
1 sibling, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-19 8:58 UTC (permalink / raw)
To: Xiao Guangrong; +Cc: Ingo Molnar, Peter Zijlstra, Paul Mackerras, LKML
On Tue, Jan 19, 2010 at 09:19:43AM +0800, Xiao Guangrong wrote:
>
>
> Frederic Weisbecker wrote:
> > On Mon, Jan 18, 2010 at 09:42:34PM +0800, Xiao Guangrong wrote:
> >> It only disable preemption in perf_swevent_get_recursion_context()
> >> it can't avoid race of hard-irq and NMI
> >>
> >> In this patch, we use atomic operation to avoid it and reduce
> >> cpu_ctx->recursion size, it also make this patch no need diable
> >> preemption
> >>
> >> Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
> >
> >
> >
> > I don't understand what is racy in what we have currently.
> >
>
> It's because hard-irq(we can handle interruption with interruption enabled)
> and NMI are nested, for example:
>
> int perf_swevent_get_recursion_context(void)
> {
> ......
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> /*
> * Another interruption handler/NMI will re-enter there if it
> * happed, it make the recursion value chaotic
> */
> cpuctx->recursion[rctx]++;
> ......
I still don't understand the problem.
It's not like a fight between different cpus, it's a local per cpu
fight.
NMIs can't nest other NMIs but hardirq can nest another hardirqs,
we don't care much about these though.
So let's imagine the following sequence, a fight between nested
hardirqs:
cpuctx->recursion[irq] initially = 0
Interrupt (level 0):
if (cpuctx->recursion[rctx]) {
put_cpu_var(perf_cpu_context);
return -1;
}
Interrupt (level 1):
cpuctx->recursion[rctx]++; // = 1
...
do something
...
cpuctx->recursion[rctx]--; // = 0
End Interrupt (level 1)
cpuctx->recursion[rctx]++; // = 1
...
do something
...
cpuctx->recursion[rctx]--; // = 0
End interrupt (level 0)
Another sequence could be Interrupt level 0 has
already incremented recursion and we are interrupted by
irq level 1 which then won't be able to get the recursion
context. But that's not a big deal I think.
> > This looks broken. We don't call back perf_swevent_put_recursion_context
> > in fail case, so the bit won't ever be cleared once we recurse.
> >
>
> Um, i think we can't clear the bit in this fail case, consider below
> sequence:
>
> path A: path B
>
> set bit but find the bit already set
> atomic set bit |
> | |
> V |
> handle SW event |
> | V
> V exit and not clear the bit
> atomic clear bit
>
> After A and B, the bit is still zero
>
> Right? :-)
Ah indeed, it will be cleared by the interrupted path.
I still don't understand what this patch brings us though.
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-19 1:26 ` Xiao Guangrong
@ 2010-01-19 9:00 ` Frederic Weisbecker
2010-01-19 14:26 ` Masami Hiramatsu
0 siblings, 1 reply; 25+ messages in thread
From: Frederic Weisbecker @ 2010-01-19 9:00 UTC (permalink / raw)
To: Xiao Guangrong
Cc: Masami Hiramatsu, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
On Tue, Jan 19, 2010 at 09:26:10AM +0800, Xiao Guangrong wrote:
> > Actual filling buffer is done in the profile handlers,
> > so I think ftrace_perf_buf_prepare() may be better :-)
> > ftrace_perf_buf_submit is good to me:-)
> >
>
> OK, i'll rename those functions and add __kprobe for ftrace_perf_buf_prepare(),
> i guess i can add you Acked-by :-)
You can add mine yeah (couldn't tell for Masami).
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 8:46 ` Peter Zijlstra
@ 2010-01-19 9:06 ` Xiao Guangrong
0 siblings, 0 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 9:06 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Frederic Weisbecker, Paul Mackerras,
Masami Hiramatsu, Jason Baron, LKML
Peter Zijlstra wrote:
> On Tue, 2010-01-19 at 16:37 +0800, Xiao Guangrong wrote:
>> It only disable preemption in perf_swevent_get_recursion_context(),
>> it can't avoid race of hard-irq and NMI since they are nested that
>> will re-enter this path and make the recursion value chaotic
>>
>> In this patch, we use atomic operation to avoid it and reduce
>> cpu_ctx->recursion size, it also make this patch no need disable
>> preemption
>
> NAK
>
Sorry, I forgot that nesting of hard-irqs is sequential and does not pollute
the recursion value; thanks for pointing that out.
Hi Ingo,
Please ignore this patch; it does not affect the other 2 patches in this
patchset
Thanks,
Xiao
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context()
2010-01-19 8:58 ` Frederic Weisbecker
@ 2010-01-19 9:09 ` Xiao Guangrong
0 siblings, 0 replies; 25+ messages in thread
From: Xiao Guangrong @ 2010-01-19 9:09 UTC (permalink / raw)
To: Frederic Weisbecker; +Cc: Ingo Molnar, Peter Zijlstra, Paul Mackerras, LKML
Frederic Weisbecker wrote:
>
> I still don't understand the problem.
>
> It's not like a fight between different cpus, it's a local per cpu
> fight.
>
> NMIs can't nest other NMIs but hardirq can nest another hardirqs,
> we don't care much about these though.
> So let's imagine the following sequence, a fight between nested
> hardirqs:
>
> cpuctx->recursion[irq] initially = 0
>
> Interrupt (level 0):
>
> if (cpuctx->recursion[rctx]) {
> put_cpu_var(perf_cpu_context);
> return -1;
> }
>
> Interrupt (level 1):
>
>
> cpuctx->recursion[rctx]++; // = 1
>
> ...
> do something
> ...
> cpuctx->recursion[rctx]--; // = 0
>
> End Interrupt (level 1)
>
> cpuctx->recursion[rctx]++; // = 1
>
> ...
> do something
> ...
> cpuctx->recursion[rctx]--; // = 0
>
> End interrupt (level 0)
>
> Another sequence could be Interrupt level 0 has
> already incremented recursion and we are interrupted by
> irq level 1 which then won't be able to get the recursion
> context. But that's not a big deal I think.
>
Thanks Frederic, I forgot this behavior of nested hard-irqs :-(
Sorry for disturbing you all
- Xiao
^ permalink raw reply [flat|nested] 25+ messages in thread
* Re: [PATCH 2/3] perf_event: cleanup for event profile buffer operation
2010-01-19 9:00 ` Frederic Weisbecker
@ 2010-01-19 14:26 ` Masami Hiramatsu
0 siblings, 0 replies; 25+ messages in thread
From: Masami Hiramatsu @ 2010-01-19 14:26 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Xiao Guangrong, Ingo Molnar, Peter Zijlstra, Paul Mackerras,
Jason Baron, LKML
Frederic Weisbecker wrote:
> On Tue, Jan 19, 2010 at 09:26:10AM +0800, Xiao Guangrong wrote:
>>> Actual filling buffer is done in the profile handlers,
>>> so I think ftrace_perf_buf_prepare() may be better :-)
>>> ftrace_perf_buf_submit is good to me:-)
>>>
>>
>> OK, i'll rename those functions and add __kprobe for ftrace_perf_buf_prepare(),
>> i guess i can add you Acked-by :-)
>
>
> You can add mine yeah (couldn't tell for Masami).
Sure, add mine too, because other parts look good for me. :-)
--
Masami Hiramatsu
Software Engineer
Hitachi Computer Products (America), Inc.
Software Solutions Division
e-mail: mhiramat@redhat.com
^ permalink raw reply [flat|nested] 25+ messages in thread
end of thread, other threads:[~2010-01-19 14:24 UTC | newest]
Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-18 13:42 [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
2010-01-18 13:44 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-18 13:46 ` [PATCH 3/3] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
2010-01-18 16:16 ` Masami Hiramatsu
2010-01-19 8:37 ` [PATCH 1/3 v2] perf_event: fix race in perf_swevent_get_recursion_context() Xiao Guangrong
2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 9:06 ` Xiao Guangrong
2010-01-19 8:39 ` [PATCH 2/3 v2] perf_event: cleanup for event profile buffer operation Xiao Guangrong
2010-01-19 8:41 ` [PATCH 3/3 v2] tracing/kprobe: cleanup unused return value of function Xiao Guangrong
2010-01-18 16:21 ` [PATCH 2/3] perf_event: cleanup for event profile buffer operation Masami Hiramatsu
2010-01-18 17:20 ` Frederic Weisbecker
2010-01-18 17:48 ` Masami Hiramatsu
2010-01-18 18:02 ` Frederic Weisbecker
2010-01-19 1:26 ` Xiao Guangrong
2010-01-19 9:00 ` Frederic Weisbecker
2010-01-19 14:26 ` Masami Hiramatsu
2010-01-18 17:11 ` Frederic Weisbecker
2010-01-18 13:55 ` [PATCH 1/3] perf_event: fix race in perf_swevent_get_recursion_context() Peter Zijlstra
2010-01-19 7:36 ` Xiao Guangrong
2010-01-19 8:41 ` Peter Zijlstra
2010-01-18 16:41 ` Frederic Weisbecker
2010-01-19 1:19 ` Xiao Guangrong
2010-01-19 8:46 ` Peter Zijlstra
2010-01-19 8:58 ` Frederic Weisbecker
2010-01-19 9:09 ` Xiao Guangrong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox