* [PATCH 0/4] perf counter bits
@ 2009-08-19 9:18 Peter Zijlstra
2009-08-19 9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
` (3 more replies)
0 siblings, 4 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 9:18 UTC (permalink / raw)
To: Ingo Molnar, Paul Mackerras
Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Mike Galbraith,
linux-kernel, Peter Zijlstra
Some perf counter patches for your consideration ;-)
--
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-19 9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
@ 2009-08-19 9:18 ` Peter Zijlstra
2009-08-19 14:07 ` Peter Zijlstra
2009-08-19 9:18 ` [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits Peter Zijlstra
` (2 subsequent siblings)
3 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 9:18 UTC (permalink / raw)
To: Ingo Molnar, Paul Mackerras
Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Mike Galbraith,
linux-kernel, Jens Axboe, Peter Zijlstra
[-- Attachment #1: perf-sekure.patch --]
[-- Type: text/plain, Size: 3901 bytes --]
Change the default permissions on perf counters.
The new default will disallow regular users from creating cpu-wide
counters, and will anonymize kernel IPs for task samples.
This will allow a user to profile his own applications and still know
the proportion of the kernel events, but does not expose kernel IPs.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
arch/x86/kernel/cpu/perf_counter.c | 6 ++++++
include/linux/perf_counter.h | 1 +
kernel/perf_counter.c | 27 ++++++++++++++++++++-------
3 files changed, 27 insertions(+), 7 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/perf_counter.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/perf_counter.c
+++ linux-2.6/arch/x86/kernel/cpu/perf_counter.c
@@ -2153,7 +2153,13 @@ static const struct stacktrace_ops backt
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
+ u64 anon_ip = perf_paranoid_anon_ip();
+
callchain_store(entry, PERF_CONTEXT_KERNEL);
+ if (anon_ip) {
+ callchain_store(entry, anon_ip);
+ return;
+ }
callchain_store(entry, regs->ip);
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -754,6 +754,7 @@ static inline void perf_counter_mmap(str
extern void perf_counter_comm(struct task_struct *tsk);
extern void perf_counter_fork(struct task_struct *tsk);
+extern unsigned long perf_paranoid_anon_ip(void);
extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
extern int sysctl_perf_counter_paranoid;
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -48,18 +48,29 @@ static atomic_t nr_task_counters __read_
* perf counter paranoia level:
* 0 - not paranoid
* 1 - disallow cpu counters to unpriv
- * 2 - disallow kernel profiling to unpriv
+ * 2 - anonymize kernel RIPs to unpriv
+ * 3 - disallow kernel profiling to unpriv
*/
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 2;
static inline bool perf_paranoid_cpu(void)
{
- return sysctl_perf_counter_paranoid > 0;
+ return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_anon(void)
+{
+ return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
}
static inline bool perf_paranoid_kernel(void)
{
- return sysctl_perf_counter_paranoid > 1;
+ return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
+}
+
+unsigned long perf_paranoid_anon_ip(void)
+{
+ return perf_paranoid_anon() ? _THIS_IP_ : 0;
}
int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1571,7 +1582,7 @@ static struct perf_counter_context *find
*/
if (cpu != -1) {
/* Must be root to operate on a CPU counter: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu())
return ERR_PTR(-EACCES);
if (cpu < 0 || cpu > num_possible_cpus())
@@ -2841,7 +2852,9 @@ void perf_counter_output(struct perf_cou
header.misc |= perf_misc_flags(data->regs);
if (sample_type & PERF_SAMPLE_IP) {
- ip = perf_instruction_pointer(data->regs);
+ ip = perf_paranoid_anon_ip();
+ if (!ip || user_mode(data->regs))
+ ip = perf_instruction_pointer(data->regs);
header.size += sizeof(ip);
}
@@ -4227,7 +4240,7 @@ SYSCALL_DEFINE5(perf_counter_open,
return ret;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_kernel())
return -EACCES;
}
--
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits
2009-08-19 9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
2009-08-19 9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
@ 2009-08-19 9:18 ` Peter Zijlstra
2009-08-19 13:30 ` [tip:perfcounters/core] perf_counter: powerpc: Support the anonimized " tip-bot for Peter Zijlstra
2009-08-19 9:18 ` [PATCH 3/4] perf tools: Check perf.data owner Peter Zijlstra
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
3 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 9:18 UTC (permalink / raw)
To: Ingo Molnar, Paul Mackerras
Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Mike Galbraith,
linux-kernel, Peter Zijlstra
[-- Attachment #1: perf-sekure-ppc.patch --]
[-- Type: text/plain, Size: 969 bytes --]
Adds support for anonymized kernel callchains to the powerpc callchain
code.
This patch is not folded into the patch that introduces this feature
because the powerpc callchain code isn't upstream yet, and this allows
re-ordering the patches.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
arch/powerpc/kernel/perf_callchain.c | 5 +++++
1 file changed, 5 insertions(+)
Index: linux-2.6/arch/powerpc/kernel/perf_callchain.c
===================================================================
--- linux-2.6.orig/arch/powerpc/kernel/perf_callchain.c
+++ linux-2.6/arch/powerpc/kernel/perf_callchain.c
@@ -70,6 +70,11 @@ static void perf_callchain_kernel(struct
lr = regs->link;
sp = regs->gpr[1];
callchain_store(entry, PERF_CONTEXT_KERNEL);
+ next_ip = perf_paranoid_anon_ip();
+ if (next_ip) {
+ callchain_store(entry, next_ip);
+ return;
+ }
callchain_store(entry, regs->nip);
if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
--
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 3/4] perf tools: Check perf.data owner
2009-08-19 9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
2009-08-19 9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
2009-08-19 9:18 ` [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits Peter Zijlstra
@ 2009-08-19 9:18 ` Peter Zijlstra
2009-08-19 13:32 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
3 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 9:18 UTC (permalink / raw)
To: Ingo Molnar, Paul Mackerras
Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Mike Galbraith,
linux-kernel, Peter Zijlstra
[-- Attachment #1: perf-tool-check-uid.patch --]
[-- Type: text/plain, Size: 2900 bytes --]
Add an owner check to opening perf.data files and a switch to silence
it.
Because perf-report/perf-annotate are binary parsers reading another
user's perf.data file could be a security risk if the file were
explicitly engineered to trigger bugs in the parser (we hope of course
there are no such bugs, but you never know).
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
tools/perf/builtin-annotate.c | 7 +++++++
tools/perf/builtin-report.c | 7 +++++++
2 files changed, 14 insertions(+)
Index: linux-2.6/tools/perf/builtin-annotate.c
===================================================================
--- linux-2.6.orig/tools/perf/builtin-annotate.c
+++ linux-2.6/tools/perf/builtin-annotate.c
@@ -28,6 +28,7 @@ static char const *input_name = "perf.d
static char default_sort_order[] = "comm,symbol";
static char *sort_order = default_sort_order;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -976,6 +977,11 @@ static int __cmd_annotate(void)
exit(-1);
}
+ if (!force && (input_stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user\n", input_name);
+ exit(-1);
+ }
+
if (!input_stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -1081,6 +1087,7 @@ static const struct option options[] = {
"input file name"),
OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
"symbol to annotate"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
Index: linux-2.6/tools/perf/builtin-report.c
===================================================================
--- linux-2.6.orig/tools/perf/builtin-report.c
+++ linux-2.6/tools/perf/builtin-report.c
@@ -37,6 +37,7 @@ static char *dso_list_str, *comm_list_s
static struct strlist *dso_list, *comm_list, *sym_list;
static char *field_sep;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -1383,6 +1384,11 @@ static int __cmd_report(void)
exit(-1);
}
+ if (!force && (input_stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user\n", input_name);
+ exit(-1);
+ }
+
if (!input_stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -1594,6 +1600,7 @@ static const struct option options[] = {
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &vmlinux_name, "file", "vmlinux pathname"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
--
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
` (2 preceding siblings ...)
2009-08-19 9:18 ` [PATCH 3/4] perf tools: Check perf.data owner Peter Zijlstra
@ 2009-08-19 9:18 ` Peter Zijlstra
2009-08-19 10:58 ` Ingo Molnar
` (3 more replies)
3 siblings, 4 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 9:18 UTC (permalink / raw)
To: Ingo Molnar, Paul Mackerras
Cc: Arnaldo Carvalho de Melo, Frederic Weisbecker, Mike Galbraith,
linux-kernel, stephane eranian, Peter Zijlstra
[-- Attachment #1: perf-output-channel.patch --]
[-- Type: text/plain, Size: 5613 bytes --]
Provide the ability to configure a counter to send its output to
another (already existing) counter's output stream.
[ compile tested only ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stephane eranian <eranian@googlemail.com>
---
include/linux/perf_counter.h | 5 ++
kernel/perf_counter.c | 83 +++++++++++++++++++++++++++++++++++++++++--
2 files changed, 85 insertions(+), 3 deletions(-)
Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
PERF_CONTEXT_MAX = (__u64)-4095,
};
+#define PERF_FLAG_FD_NO_GROUP (1U << 0)
+#define PERF_FLAG_FD_OUTPUT (1U << 1)
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
struct list_head sibling_list;
int nr_siblings;
struct perf_counter *group_leader;
+ struct perf_counter *output;
const struct pmu *pmu;
enum perf_counter_active_state state;
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1686,6 +1686,11 @@ static void free_counter(struct perf_cou
atomic_dec(&nr_task_counters);
}
+ if (counter->output) {
+ fput(counter->output->filp);
+ counter->output = NULL;
+ }
+
if (counter->destroy)
counter->destroy(counter);
@@ -1971,6 +1976,8 @@ unlock:
return ret;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_counter *counter = file->private_data;
@@ -1994,6 +2001,9 @@ static long perf_ioctl(struct file *file
case PERF_COUNTER_IOC_PERIOD:
return perf_counter_period(counter, (u64 __user *)arg);
+ case PERF_COUNTER_IOC_SET_OUTPUT:
+ return perf_counter_set_output(counter, arg);
+
default:
return -ENOTTY;
}
@@ -2264,6 +2274,11 @@ static int perf_mmap(struct file *file,
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->mmap_mutex);
+ if (counter->output) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
if (atomic_inc_not_zero(&counter->mmap_count)) {
if (nr_pages != counter->data->nr_pages)
ret = -EINVAL;
@@ -2649,6 +2664,7 @@ static int perf_output_begin(struct perf
struct perf_counter *counter, unsigned int size,
int nmi, int sample)
{
+ struct perf_counter *output_counter;
struct perf_mmap_data *data;
unsigned int offset, head;
int have_lost;
@@ -2658,13 +2674,17 @@ static int perf_output_begin(struct perf
u64 lost;
} lost_event;
+ rcu_read_lock();
/*
* For inherited counters we send all the output towards the parent.
*/
if (counter->parent)
counter = counter->parent;
- rcu_read_lock();
+ output_counter = rcu_dereference(counter->output);
+ if (output_counter)
+ counter = output_counter;
+
data = rcu_dereference(counter->data);
if (!data)
goto out;
@@ -4214,6 +4234,57 @@ err_size:
goto out;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+ struct perf_counter *output_counter = NULL;
+ struct file *output_file = NULL;
+ struct perf_counter *old_output;
+ int fput_needed = 0;
+ int ret = -EINVAL;
+
+ if (!output_fd)
+ goto set;
+
+ output_file = fget_light(output_fd, &fput_needed);
+ if (!output_file)
+ return -EBADF;
+
+ if (output_file->f_op != &perf_fops)
+ goto out;
+
+ output_counter = output_file->private_data;
+
+ /* Don't chain output fds */
+ if (output_counter->output)
+ goto out;
+
+ /* Don't set an output fd when we already have an output channel */
+ if (counter->data)
+ goto out;
+
+ atomic_long_inc(&output_file->f_count);
+
+set:
+ mutex_lock(&counter->mmap_mutex);
+ old_output = counter->output;
+ rcu_assign_pointer(counter->output, output_counter);
+ mutex_unlock(&counter->mmap_mutex);
+
+ if (old_output) {
+ /*
+ * we need to make sure no existing perf_output_*()
+ * is still referencing this counter.
+ */
+ synchronize_rcu();
+ fput(old_output->filp);
+ }
+
+ ret = 0;
+out:
+ fput_light(output_file, fput_needed);
+ return ret;
+}
+
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
@@ -4236,7 +4307,7 @@ SYSCALL_DEFINE5(perf_counter_open,
int ret;
/* for future expandability... */
- if (flags)
+ if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
return -EINVAL;
ret = perf_copy_attr(attr_uptr, &attr);
@@ -4264,7 +4335,7 @@ SYSCALL_DEFINE5(perf_counter_open,
* Look up the group leader (we will attach this counter to it):
*/
group_leader = NULL;
- if (group_fd != -1) {
+ if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
ret = -EINVAL;
group_file = fget_light(group_fd, &fput_needed);
if (!group_file)
@@ -4306,6 +4377,12 @@ SYSCALL_DEFINE5(perf_counter_open,
if (!counter_file)
goto err_free_put_context;
+ if (flags & PERF_FLAG_FD_OUTPUT) {
+ ret = perf_counter_set_output(counter, group_fd);
+ if (ret)
+ goto err_free_put_context;
+ }
+
counter->filp = counter_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
--
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
@ 2009-08-19 10:58 ` Ingo Molnar
2009-08-19 11:07 ` Peter Zijlstra
2009-08-19 12:41 ` Paul Mackerras
2009-08-19 12:36 ` Paul Mackerras
` (2 subsequent siblings)
3 siblings, 2 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-08-19 10:58 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> Provide the ability to configure a counter to send its output to
> another (already existing) counter's output stream.
>
> [ compile tested only ]
very nice!
two comments:
> Index: linux-2.6/include/linux/perf_counter.h
> ===================================================================
> --- linux-2.6.orig/include/linux/perf_counter.h
> +++ linux-2.6/include/linux/perf_counter.h
> @@ -216,6 +216,7 @@ struct perf_counter_attr {
> #define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
> #define PERF_COUNTER_IOC_RESET _IO ('$', 3)
> #define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
> +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
Time to add a new sys_perf_counter_chattr() syscall and deprecate
the ioctls?
> @@ -415,6 +416,9 @@ enum perf_callchain_context {
> PERF_CONTEXT_MAX = (__u64)-4095,
> };
>
> +#define PERF_FLAG_FD_NO_GROUP (1U << 0)
> +#define PERF_FLAG_FD_OUTPUT (1U << 1)
Why not extend the size of perf_counter_attr and add an output_fd
parameter? Zero would mean no fd (and this is also backwards
compatible behavior).
FD_NO_GROUP is a bit unclean API as it aliases group_fd to two
purposes: the real group_fd and this new output_fd. I think we
should move output_fd to the attribute structure.
That way one could also create a group counter which shares the
output channel with another counter, in the same
perf_counter_open() call.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 10:58 ` Ingo Molnar
@ 2009-08-19 11:07 ` Peter Zijlstra
2009-08-19 12:41 ` Paul Mackerras
1 sibling, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 11:07 UTC (permalink / raw)
To: Ingo Molnar
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
On Wed, 2009-08-19 at 12:58 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> > Provide the ability to configure a counter to send its output to
> > another (already existing) counter's output stream.
> >
> > [ compile tested only ]
>
> very nice!
>
> two comments:
>
> > Index: linux-2.6/include/linux/perf_counter.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/perf_counter.h
> > +++ linux-2.6/include/linux/perf_counter.h
> > @@ -216,6 +216,7 @@ struct perf_counter_attr {
> > #define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
> > #define PERF_COUNTER_IOC_RESET _IO ('$', 3)
> > #define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
> > +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
>
> Time to add a new sys_perf_counter_chattr() syscall and deprecate
> the ioctls?
Could do I guess.. in order to support things like refresh and reset we
need a few new offset fields in the attr struct, but that's doable.
> > @@ -415,6 +416,9 @@ enum perf_callchain_context {
> > PERF_CONTEXT_MAX = (__u64)-4095,
> > };
> >
> > +#define PERF_FLAG_FD_NO_GROUP (1U << 0)
> > +#define PERF_FLAG_FD_OUTPUT (1U << 1)
>
> Why not extend the size of perf_counter_attr and add an output_fd
> parameter? Zero would mean no fd (and this is also backwards
> compatible behavior).
>
> FD_NO_GROUP is a bit unclean API as it aliases group_fd to two
> purposes: the real group_fd and this new output_fd. I think we
> should move output_fd to the attribute structure.
Yeah, the NO_GROUP thing is ugly..
I'll wait for some more comments before reworking this.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
2009-08-19 10:58 ` Ingo Molnar
@ 2009-08-19 12:36 ` Paul Mackerras
2009-08-19 12:56 ` Ingo Molnar
2009-08-19 12:56 ` Peter Zijlstra
2009-08-19 16:19 ` Frederic Weisbecker
2009-08-25 7:39 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
3 siblings, 2 replies; 27+ messages in thread
From: Paul Mackerras @ 2009-08-19 12:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
Peter Zijlstra writes:
> Provide the ability to configure a counter to send its output to
> another (already existing) counter's output stream.
What sort of thing might this be useful for?
Does this only apply to sampling counters?
Paul.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 10:58 ` Ingo Molnar
2009-08-19 11:07 ` Peter Zijlstra
@ 2009-08-19 12:41 ` Paul Mackerras
1 sibling, 0 replies; 27+ messages in thread
From: Paul Mackerras @ 2009-08-19 12:41 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
Ingo Molnar writes:
> Time to add a new sys_perf_counter_chattr() syscall and deprecate
> the ioctls?
Hmmm... if 2.6.31 goes out with the ioctls, I think we should leave
them in for a long time. I've never seen what the problem with ioctl
is anyway. At least, I don't see how a new multi-function syscall
would be any better.
Paul.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 12:36 ` Paul Mackerras
@ 2009-08-19 12:56 ` Ingo Molnar
2009-08-19 12:56 ` Peter Zijlstra
1 sibling, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-08-19 12:56 UTC (permalink / raw)
To: Paul Mackerras
Cc: Peter Zijlstra, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
* Paul Mackerras <paulus@samba.org> wrote:
> Peter Zijlstra writes:
>
> > Provide the ability to configure a counter to send its output
> > to another (already existing) counter's output stream.
>
> What sort of thing might this be useful for?
To share the output buffer - say you have a number of tracepoint
counters enabled and want a single (coherent) stream of events
without a post-processing sorting pass.
Right now each counter has to be opened, mmap-ed separately and
then the result stuffed into the output file.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 12:36 ` Paul Mackerras
2009-08-19 12:56 ` Ingo Molnar
@ 2009-08-19 12:56 ` Peter Zijlstra
2009-08-19 13:00 ` Ingo Molnar
2009-08-20 10:13 ` stephane eranian
1 sibling, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 12:56 UTC (permalink / raw)
To: Paul Mackerras
Cc: Ingo Molnar, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
On Wed, 2009-08-19 at 22:36 +1000, Paul Mackerras wrote:
> Peter Zijlstra writes:
>
> > Provide the ability to configure a counter to send its output to
> > another (already existing) counter's output stream.
>
> What sort of thing might this be useful for?
Some people complained that its tedious to mmap() for every counter and
would like to share the mmap() output buffer between counters.
This saves on address space and mlock budget and I guess fd management
logic.
As long as you're not mixing counters for different tasks/cpus there
should be no performance penalty, but even if you do that it might work
well enough on slow samples/small systems..
> Does this only apply to sampling counters?
Yeah, everything that would otherwise go through the mmap() buffer.
I'm not sure there's anything to be done about the read(2) thing.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 12:56 ` Peter Zijlstra
@ 2009-08-19 13:00 ` Ingo Molnar
2009-08-20 10:13 ` stephane eranian
1 sibling, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-08-19 13:00 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, stephane eranian
* Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> > Does this only apply to sampling counters?
>
> Yeah, everything that would otherwise go through the mmap()
> buffer.
Note that this can include sample-driven normal counts as well,
i.e. PERF_SAMPLE_READ and perf record -s/--stat.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* [tip:perfcounters/core] perf_counter: powerpc: Support the anonimized kernel callchain bits
2009-08-19 9:18 ` [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits Peter Zijlstra
@ 2009-08-19 13:30 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 27+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-19 13:30 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, acme, hpa, mingo, a.p.zijlstra, efault,
fweisbec, tglx, mingo
Commit-ID: 512cec5d6f2a71e04464bf4fd76a50571bd0dea1
Gitweb: http://git.kernel.org/tip/512cec5d6f2a71e04464bf4fd76a50571bd0dea1
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Wed, 19 Aug 2009 11:18:25 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 19 Aug 2009 14:49:03 +0200
perf_counter: powerpc: Support the anonimized kernel callchain bits
Adds support for anonimized kernel callchains to the powerpc
callchain code.
This patch is not folded into the patch that introduces this
feature because the powerpc callchain code isn't upstream yet,
and this allows re-ordering the patches.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090819092023.812428023@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/powerpc/kernel/perf_callchain.c | 5 +++++
1 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index f74b62c..21fb1ad 100644
--- a/arch/powerpc/kernel/perf_callchain.c
+++ b/arch/powerpc/kernel/perf_callchain.c
@@ -70,6 +70,11 @@ static void perf_callchain_kernel(struct pt_regs *regs,
lr = regs->link;
sp = regs->gpr[1];
callchain_store(entry, PERF_CONTEXT_KERNEL);
+ next_ip = perf_paranoid_anon_ip();
+ if (next_ip) {
+ callchain_store(entry, next_ip);
+ return;
+ }
callchain_store(entry, regs->nip);
if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD))
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [tip:perfcounters/core] perf tools: Check perf.data owner
2009-08-19 9:18 ` [PATCH 3/4] perf tools: Check perf.data owner Peter Zijlstra
@ 2009-08-19 13:32 ` tip-bot for Peter Zijlstra
0 siblings, 0 replies; 27+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-19 13:32 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, acme, hpa, mingo, a.p.zijlstra, efault,
fweisbec, tglx, mingo
Commit-ID: fa6963b2481beff8b11f76006fbb63fdbbf2d2d7
Gitweb: http://git.kernel.org/tip/fa6963b2481beff8b11f76006fbb63fdbbf2d2d7
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Wed, 19 Aug 2009 11:18:26 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 19 Aug 2009 15:25:51 +0200
perf tools: Check perf.data owner
Add an owner check to opening perf.data files and a switch to
silence it.
Because perf-report/perf-annotate are binary parsers reading
another user's perf.data file could be a security risk if the
file were explicitly engineered to trigger bugs in the parser
(we hope of course there are no such bugs, but you never
know).
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090819092023.896648538@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
tools/perf/builtin-annotate.c | 7 +++++++
tools/perf/builtin-report.c | 7 +++++++
2 files changed, 14 insertions(+), 0 deletions(-)
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 343e7b1..5e17de9 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -31,6 +31,7 @@ static char *vmlinux = "vmlinux";
static char default_sort_order[] = "comm,symbol";
static char *sort_order = default_sort_order;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -1334,6 +1335,11 @@ static int __cmd_annotate(void)
exit(-1);
}
+ if (!force && (stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user\n", input_name);
+ exit(-1);
+ }
+
if (!stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -1439,6 +1445,7 @@ static const struct option options[] = {
"input file name"),
OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
"symbol to annotate"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index b53a60f..8b2ec88 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -38,6 +38,7 @@ static char *dso_list_str, *comm_list_str, *sym_list_str,
static struct strlist *dso_list, *comm_list, *sym_list;
static char *field_sep;
+static int force;
static int input;
static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -1856,6 +1857,11 @@ static int __cmd_report(void)
exit(-1);
}
+ if (!force && (stat.st_uid != geteuid())) {
+ fprintf(stderr, "file: %s not owned by current user\n", input_name);
+ exit(-1);
+ }
+
if (!stat.st_size) {
fprintf(stderr, "zero-sized file, nothing to do!\n");
exit(0);
@@ -2064,6 +2070,7 @@ static const struct option options[] = {
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-19 9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
@ 2009-08-19 14:07 ` Peter Zijlstra
2009-08-19 16:04 ` Frederic Weisbecker
2009-08-20 12:00 ` Peter Zijlstra
0 siblings, 2 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 14:07 UTC (permalink / raw)
To: Ingo Molnar
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris
On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
> +static inline bool perf_paranoid_anon(void)
> +{
> + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> }
>
> static inline bool perf_paranoid_kernel(void)
> {
> - return sysctl_perf_counter_paranoid > 1;
> + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> +}
OK, this is buggy:
- capable() uses current, which is unlikely to be counter->owner,
- but even security_real_capable(counter->owner, ...) wouldn't
work, since the ->capable() callback isn't NMI safe
(selinux takes locks and does allocations in that path).
This puts a severe strain on more complex anonymizers since it's
basically impossible to tell if counter->owner has permissions on
current from NMI context.
I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
counter based on creation time state, unless somebody has a better idea.
I could possibly only anonymize IRQ context (SoftIRQ context is
difficult since in_softirq() means both in-softirq and
softirq-disabled).
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-19 14:07 ` Peter Zijlstra
@ 2009-08-19 16:04 ` Frederic Weisbecker
2009-08-20 12:00 ` Peter Zijlstra
1 sibling, 0 replies; 27+ messages in thread
From: Frederic Weisbecker @ 2009-08-19 16:04 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Paul Mackerras, Arnaldo Carvalho de Melo,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris
On Wed, Aug 19, 2009 at 04:07:33PM +0200, Peter Zijlstra wrote:
> On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
>
> > +static inline bool perf_paranoid_anon(void)
> > +{
> > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> > }
> >
> > static inline bool perf_paranoid_kernel(void)
> > {
> > - return sysctl_perf_counter_paranoid > 1;
> > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> > +}
>
> OK, this is buggy:
>
> - capable() uses current, which is unlikely to be counter->owner,
> - but even security_real_capable(counter->owner, ...) wouldn't
> work, since the ->capable() callback isn't NMI safe
> (selinux takes locks and does allocations in that path).
>
> This puts a severe strain on more complex anonymizers since its
> basically impossible to tell if counter->owner has permissions on
> current from NMI context.
>
> I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
> counter based on creation time state, unless somebody has a better idea.
Something I don't understand there: it's about cpu-wide profiling,
so tasks that were created before the counter can also be
profiled — then how is the creation-time state useful here?
> I could possibly only anonymize IRQ context (SoftIRQ context is
> difficult since in_softirq() means both in-softirq and
> softirq-disabled).
I don't understand why we need to set this paranoid level concerning
kernel RIPS.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
2009-08-19 10:58 ` Ingo Molnar
2009-08-19 12:36 ` Paul Mackerras
@ 2009-08-19 16:19 ` Frederic Weisbecker
2009-08-19 16:24 ` Peter Zijlstra
2009-08-25 7:39 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
3 siblings, 1 reply; 27+ messages in thread
From: Frederic Weisbecker @ 2009-08-19 16:19 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Paul Mackerras, Arnaldo Carvalho de Melo,
Mike Galbraith, linux-kernel, stephane eranian
On Wed, Aug 19, 2009 at 11:18:27AM +0200, Peter Zijlstra wrote:
> Provide the ability to configure a counter to send its output to
> another (already existing) counter's output stream.
>
> [ compile tested only ]
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Cc: stephane eranian <eranian@googlemail.com>
> +int perf_counter_set_output(struct perf_counter *counter, int output_fd)
> +{
> + struct perf_counter *output_counter = NULL;
> + struct file *output_file = NULL;
> + struct perf_counter *old_output;
> + int fput_needed = 0;
> + int ret = -EINVAL;
> +
> + if (!output_fd)
> + goto set;
> +
> + output_file = fget_light(output_fd, &fput_needed);
> + if (!output_file)
> + return -EBADF;
> +
> + if (output_file->f_op != &perf_fops)
> + goto out;
> +
> + output_counter = output_file->private_data;
> +
> + /* Don't chain output fds */
> + if (output_counter->output)
> + goto out;
If you don't chain them, how do you propagate more than one
path of output redirected?
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 16:19 ` Frederic Weisbecker
@ 2009-08-19 16:24 ` Peter Zijlstra
2009-08-19 16:27 ` Frederic Weisbecker
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-19 16:24 UTC (permalink / raw)
To: Frederic Weisbecker
Cc: Ingo Molnar, Paul Mackerras, Arnaldo Carvalho de Melo,
Mike Galbraith, linux-kernel, stephane eranian
On Wed, 2009-08-19 at 18:19 +0200, Frederic Weisbecker wrote:
> On Wed, Aug 19, 2009 at 11:18:27AM +0200, Peter Zijlstra wrote:
> > Provide the ability to configure a counter to send its output to
> > another (already existing) counter's output stream.
> >
> > [ compile tested only ]
> >
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > Cc: stephane eranian <eranian@googlemail.com>
> > +int perf_counter_set_output(struct perf_counter *counter, int output_fd)
> > +{
> > + struct perf_counter *output_counter = NULL;
> > + struct file *output_file = NULL;
> > + struct perf_counter *old_output;
> > + int fput_needed = 0;
> > + int ret = -EINVAL;
> > +
> > + if (!output_fd)
> > + goto set;
> > +
> > + output_file = fget_light(output_fd, &fput_needed);
> > + if (!output_file)
> > + return -EBADF;
> > +
> > + if (output_file->f_op != &perf_fops)
> > + goto out;
> > +
> > + output_counter = output_file->private_data;
> > +
> > + /* Don't chain output fds */
> > + if (output_counter->output)
> > + goto out;
>
>
>
> If you don't chain them, how do you propagate more than one
> path of output redirected?
you mean, fd2->fd1->fd0 ?
You don't, that sounds like a silly thing to do, do fd1->fd0, fd2->fd0
instead.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 16:24 ` Peter Zijlstra
@ 2009-08-19 16:27 ` Frederic Weisbecker
0 siblings, 0 replies; 27+ messages in thread
From: Frederic Weisbecker @ 2009-08-19 16:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, Paul Mackerras, Arnaldo Carvalho de Melo,
Mike Galbraith, linux-kernel, stephane eranian
On Wed, Aug 19, 2009 at 06:24:28PM +0200, Peter Zijlstra wrote:
> On Wed, 2009-08-19 at 18:19 +0200, Frederic Weisbecker wrote:
> > On Wed, Aug 19, 2009 at 11:18:27AM +0200, Peter Zijlstra wrote:
> > > Provide the ability to configure a counter to send its output to
> > > another (already existing) counter's output stream.
> > >
> > > [ compile tested only ]
> > >
> > > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > > Cc: stephane eranian <eranian@googlemail.com>
> > > +int perf_counter_set_output(struct perf_counter *counter, int output_fd)
> > > +{
> > > + struct perf_counter *output_counter = NULL;
> > > + struct file *output_file = NULL;
> > > + struct perf_counter *old_output;
> > > + int fput_needed = 0;
> > > + int ret = -EINVAL;
> > > +
> > > + if (!output_fd)
> > > + goto set;
> > > +
> > > + output_file = fget_light(output_fd, &fput_needed);
> > > + if (!output_file)
> > > + return -EBADF;
> > > +
> > > + if (output_file->f_op != &perf_fops)
> > > + goto out;
> > > +
> > > + output_counter = output_file->private_data;
> > > +
> > > + /* Don't chain output fds */
> > > + if (output_counter->output)
> > > + goto out;
> >
> >
> >
> > If you don't chain them, how do you propagate more than one
> > path of output redirected?
>
> you mean, fd2->fd1->fd0 ?
>
> You don't, that sounds like a silly thing to do, do fd1->fd0, fd2->fd0
> instead.
>
Ok.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-19 12:56 ` Peter Zijlstra
2009-08-19 13:00 ` Ingo Molnar
@ 2009-08-20 10:13 ` stephane eranian
2009-08-20 10:24 ` Peter Zijlstra
2009-08-20 10:28 ` Ingo Molnar
1 sibling, 2 replies; 27+ messages in thread
From: stephane eranian @ 2009-08-20 10:13 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul Mackerras, Ingo Molnar, Arnaldo Carvalho de Melo,
Frederic Weisbecker, Mike Galbraith, linux-kernel
On Wed, Aug 19, 2009 at 2:56 PM, Peter Zijlstra<a.p.zijlstra@chello.nl> wrote:
> On Wed, 2009-08-19 at 22:36 +1000, Paul Mackerras wrote:
>> Peter Zijlstra writes:
>>
>> > Provide the ability to configure a counter to send its output to
>> > another (already existing) counter's output stream.
>>
>> What sort of thing might this be useful for?
>
> Some people complained that its tedious to mmap() for every counter and
> would like to share the mmap() output buffer between counters.
>
> This saves on address space and mlock budget and I guess fd management
> logic.
>
Interesting to see, you seem to have changed your mind on this.
I recall pointing this out in my early comments.
But anyway, here are some more comments:
- how does this work with the remapped counts?
Probably only see the count for the target, i.e., output, event
- if samples from multiple events end up in the same buffer, how do
I tell them apart,
i.e., how do I know sample X came from event A, sample Y from event B?
This may be useful to detect patterns.
> As long as you're not mixing counters for different tasks/cpus there
> should be no performance penalty, but even if you do that it might work
> well enough on slow samples/small systems..
>
>> Does this only apply to sampling counters?
>
> Yeah, everything that would otherwise go through the mmap() buffer.
>
> I'm not sure there's anything to be done about the read(2) thing.
>
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-20 10:13 ` stephane eranian
@ 2009-08-20 10:24 ` Peter Zijlstra
2009-08-20 10:28 ` Ingo Molnar
1 sibling, 0 replies; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-20 10:24 UTC (permalink / raw)
To: eranian
Cc: Paul Mackerras, Ingo Molnar, Arnaldo Carvalho de Melo,
Frederic Weisbecker, Mike Galbraith, linux-kernel
On Thu, 2009-08-20 at 12:13 +0200, stephane eranian wrote:
> On Wed, Aug 19, 2009 at 2:56 PM, Peter Zijlstra<a.p.zijlstra@chello.nl> wrote:
> > On Wed, 2009-08-19 at 22:36 +1000, Paul Mackerras wrote:
> >> Peter Zijlstra writes:
> >>
> >> > Provide the ability to configure a counter to send its output to
> >> > another (already existing) counter's output stream.
> >>
> >> What sort of thing might this be useful for?
> >
> > Some people complained that its tedious to mmap() for every counter and
> > would like to share the mmap() output buffer between counters.
> >
>
> > This saves on address space and mlock budget and I guess fd management
> > logic.
> >
> Interesting to see, you seem to have changed your mind on this.
Let's say I haven't quite made up my mind yet, but people seem to want
it.
> I recall pointing this out in my early comments.
Yeah :-)
> But anyway, here are some more comments:
>
> - how does this work with the remapped counts?
> Probably only see the count for the target, i.e., output, event
Ah, right you are, we'd have to allow mmap() of the first page but no
data section.
> - if samples from multiple events end up in the same buffer, how do
> I tell them apart,
> i.e., how do I know sample X came from event A, sample X from event B?
> This may be useful to detect patterns.
PERF_SAMPLE_ID?
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
2009-08-20 10:13 ` stephane eranian
2009-08-20 10:24 ` Peter Zijlstra
@ 2009-08-20 10:28 ` Ingo Molnar
1 sibling, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-08-20 10:28 UTC (permalink / raw)
To: eranian
Cc: Peter Zijlstra, Paul Mackerras, Arnaldo Carvalho de Melo,
Frederic Weisbecker, Mike Galbraith, linux-kernel
* stephane eranian <eranian@googlemail.com> wrote:
> On Wed, Aug 19, 2009 at 2:56 PM, Peter Zijlstra<a.p.zijlstra@chello.nl> wrote:
> > On Wed, 2009-08-19 at 22:36 +1000, Paul Mackerras wrote:
> >> Peter Zijlstra writes:
> >>
> >> > Provide the ability to configure a counter to send
> >> > its output to another (already existing) counter's
> >> > output stream.
> >>
> >> What sort of thing might this be useful for?
> >
> > Some people complained that its tedious to mmap() for
> > every counter and would like to share the mmap()
> > output buffer between counters.
> >
>
> > This saves on address space and mlock budget and I
> > guess fd management logic.
>
> Interesting to see, you seem to have changed your mind
> on this. I recall pointing this out in my early
> comments.
Btw., did we strongly oppose the notion? The mmap output
buffers were always multi-event in essence - just by
virtue of the automatic events such as task creation,
mmap, etc.
> But anyway, here are some more comments:
>
> - how does this work with the remapped counts?
> Probably only see the count for the target, i.e.,
> output, event
if you want to recover the momentary index for RDPC you
need to mmap the counter(s) you are interested in, to get
to the header->index value.
So you can create these counters with 1 pages only (one
is the header) - and then remap the output to the primary
counter's output buffer.
> - if samples from multiple events end up in the same
> buffer, how do I tell them apart, i.e., how do I
> know sample X came from event A, sample X from
> event B? This may be useful to detect patterns.
Via PERF_SAMPLE_ID. That saves the counter ID into the
event so it can be demultiplexed later on.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-19 14:07 ` Peter Zijlstra
2009-08-19 16:04 ` Frederic Weisbecker
@ 2009-08-20 12:00 ` Peter Zijlstra
2009-08-21 14:21 ` Ingo Molnar
1 sibling, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-20 12:00 UTC (permalink / raw)
To: Ingo Molnar
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris,
Linus Torvalds
On Wed, 2009-08-19 at 16:07 +0200, Peter Zijlstra wrote:
> On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
>
> > +static inline bool perf_paranoid_anon(void)
> > +{
> > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> > }
> >
> > static inline bool perf_paranoid_kernel(void)
> > {
> > - return sysctl_perf_counter_paranoid > 1;
> > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> > +}
>
> OK, this is buggy:
>
> - capable() uses current, which is unlikely to be counter->owner,
> - but even security_real_capable(counter->owner, ...) wouldn't
> work, since the ->capable() callback isn't NMI safe
> (selinux takes locks and does allocations in that path).
>
> This puts a severe strain on more complex anonymizers since its
> basically impossible to tell if counter->owner has permissions on
> current from NMI context.
>
> I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
> counter based on creation time state, unless somebody has a better idea.
>
> I could possibly only anonymize IRQ context (SoftIRQ context is
> difficult since in_softirq() means both in-softirq and
> softirq-disabled).
Something like the below maybe.. it's 3 patches folded and I've no clue
how to adapt the ppc code, what do people think?
compile tested on x86-64 only.
---
arch/x86/include/asm/stacktrace.h | 14 +++++--
arch/x86/kernel/cpu/perf_counter.c | 66 +++++++++++++++++++++++++++++++----
arch/x86/kernel/dumpstack.c | 10 +++--
arch/x86/kernel/dumpstack_32.c | 8 +----
arch/x86/kernel/dumpstack_64.c | 13 ++-----
arch/x86/kernel/stacktrace.c | 8 +++--
arch/x86/oprofile/backtrace.c | 5 ++-
include/linux/perf_counter.h | 5 ++-
kernel/perf_counter.c | 30 ++++++++++++----
kernel/trace/trace_sysprof.c | 5 ++-
10 files changed, 117 insertions(+), 47 deletions(-)
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e..7066caa 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,17 +3,23 @@
extern int kstack_depth_to_print;
-int x86_is_stack_id(int id, char *name);
-
/* Generic stack tracer with callbacks */
+enum stack_type {
+ STACK_UNKNOWN = -1,
+ STACK_PROCESS = 0,
+ STACK_INTERRUPT = 1,
+ STACK_EXCEPTION = 2,
+};
+
struct stacktrace_ops {
void (*warning)(void *data, char *msg);
/* msg must contain %s for the symbol */
void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
- void (*address)(void *data, unsigned long address, int reliable);
+ void (*address)(void *data, unsigned long stack,
+ unsigned long address, int reliable);
/* On negative return stop dumping */
- int (*stack)(void *data, char *name);
+ int (*stack)(void *data, int type, char *name);
};
void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 396e35d..a6ddb3b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -2108,7 +2108,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
+static DEFINE_PER_CPU(int, skip_frame);
static void
@@ -2122,21 +2122,49 @@ static void backtrace_warning(void *data, char *msg)
/* Ignore warnings */
}
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
{
- per_cpu(in_nmi_frame, smp_processor_id()) =
- x86_is_stack_id(NMI_STACK, name);
+ struct perf_callchain_entry *entry = data;
+ int skip = 0;
+
+ /*
+ * Always skip exception (NMI) context
+ */
+ if (type == STACK_EXCEPTION)
+ skip = 1;
+
+ /*
+ * If we're restricted, don't show IRQ context
+ */
+ if (entry->restricted && type == STACK_INTERRUPT)
+ skip = 1;
+
+ __get_cpu_var(skip_frame) = skip;
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+ unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- if (per_cpu(in_nmi_frame, smp_processor_id()))
+ if (__get_cpu_var(skip_frame))
return;
+#ifdef CONFIG_4KSTACKS
+ if (entry->restricted) {
+ struct thread_info *tinfo = current_thread_info();
+ unsigned long task_stack = (unsigned long)tinfo;
+
+ /*
+ * If its not on the task stack, don't show it.
+ */
+ if (stack < task_stack || stack >= task_stack + PAGE_SIZE)
+ return;
+ }
+#endif
+
if (reliable)
callchain_store(entry, addr);
}
@@ -2153,8 +2181,28 @@ static const struct stacktrace_ops backtrace_ops = {
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
+ int irq = hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
callchain_store(entry, PERF_CONTEXT_KERNEL);
- callchain_store(entry, regs->ip);
+ if (entry->restricted && !irq)
+ callchain_store(entry, regs->ip);
+
+#ifdef CONFIG_X86_32
+ /*
+ * i386 is a friggin mess, 4KSTACKS makes it somewhat saner but
+ * still don't have a way to identify NMI entries as they are
+ * nested onto whatever stack is there.
+ *
+ * So until i386 starts using per context stacks unconditionally
+ * and fixed up the unwinder (dump_trace behaves differently between
+ * i386 and x86_64), we'll have to unconditionally truncate kernel
+ * stacks that have IRQ context bits in them.
+ */
+#ifndef CONFIG_4KSTACKS
+ if (irq)
+ return;
+#endif
+#endif
dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}
@@ -2255,7 +2303,8 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
perf_callchain_user(regs, entry);
}
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
{
struct perf_callchain_entry *entry;
@@ -2265,6 +2314,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
entry = &__get_cpu_var(irq_entry);
entry->nr = 0;
+ entry->restricted = counter->restricted;
perf_do_callchain(regs, entry);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371..978ae89 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -51,7 +51,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
index -= *graph;
ret_addr = task->ret_stack[index].ret;
- ops->address(data, ret_addr, 1);
+ ops->address(data, 0, ret_addr, 1);
(*graph)++;
}
@@ -96,12 +96,14 @@ print_context_stack(struct thread_info *tinfo,
addr = *stack;
if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
+ unsigned long stk = (unsigned long)stack;
+
+ if (stk == bp + sizeof(long)) {
+ ops->address(data, stk, addr, 1);
frame = frame->next_frame;
bp = (unsigned long) frame;
} else {
- ops->address(data, addr, 0);
+ ops->address(data, stk, addr, 0);
}
print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba..be633ed 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,12 +19,6 @@
#include "dumpstack.h"
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
-{
- return 0;
-}
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
@@ -64,7 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
stack = (unsigned long *)context->previous_esp;
if (!stack)
break;
- if (ops->stack(data, "IRQ") < 0)
+ if (ops->stack(data, STACK_UNKNOWN, "IRQ") < 0)
break;
touch_nmi_watchdog();
}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a32..9309c5c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,11 +32,6 @@ static char x86_stack_ids[][8] = {
#endif
};
-int x86_is_stack_id(int id, char *name)
-{
- return x86_stack_ids[id - 1] == name;
-}
-
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
unsigned *usedp, char **idp)
{
@@ -155,12 +150,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
&used, &id);
if (estack_end) {
- if (ops->stack(data, id) < 0)
+ if (ops->stack(data, STACK_EXCEPTION, id) < 0)
break;
bp = print_context_stack(tinfo, stack, bp, ops,
data, estack_end, &graph);
- ops->stack(data, "<EOE>");
+ ops->stack(data, STACK_PROCESS, "<EOE>");
/*
* We link to the next stack via the
* second-to-last pointer (index -2 to end) in the
@@ -175,7 +170,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
if (stack >= irq_stack && stack < irq_stack_end) {
- if (ops->stack(data, "IRQ") < 0)
+ if (ops->stack(data, STACK_INTERRUPT, "IRQ") < 0)
break;
bp = print_context_stack(tinfo, stack, bp,
ops, data, irq_stack_end, &graph);
@@ -186,7 +181,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
*/
stack = (unsigned long *) (irq_stack_end[-1]);
irq_stack_end = NULL;
- ops->stack(data, "EOI");
+ ops->stack(data, STACK_PROCESS, "EOI");
continue;
}
}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207..a1415f4 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -18,12 +18,13 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
{
}
-static int save_stack_stack(void *data, char *name)
+static int save_stack_stack(void *data, int type, char *name)
{
return 0;
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void save_stack_address(void *data, unsigned long stack,
+ unsigned long addr, int reliable)
{
struct stack_trace *trace = data;
if (!reliable)
@@ -37,7 +38,8 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
}
static void
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+save_stack_address_nosched(void *data, unsigned long stack,
+ unsigned long addr, int reliable)
{
struct stack_trace *trace = (struct stack_trace *)data;
if (!reliable)
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897b..7232bc9 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -26,13 +26,14 @@ static void backtrace_warning(void *data, char *msg)
/* Ignore warnings */
}
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
{
/* Yes, we want all stacks */
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+ unsigned long addr, int reliable)
{
unsigned int *depth = data;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9ba1822..2b0528f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -439,6 +439,7 @@ enum perf_callchain_context {
struct perf_callchain_entry {
__u64 nr;
__u64 ip[PERF_MAX_STACK_DEPTH];
+ int restricted;
};
struct perf_raw_record {
@@ -535,6 +536,7 @@ struct perf_counter {
struct list_head event_entry;
struct list_head sibling_list;
int nr_siblings;
+ int restricted;
struct perf_counter *group_leader;
const struct pmu *pmu;
@@ -754,7 +756,8 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)
extern void perf_counter_comm(struct task_struct *tsk);
extern void perf_counter_fork(struct task_struct *tsk);
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs);
extern int sysctl_perf_counter_paranoid;
extern int sysctl_perf_counter_mlock;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 36f65e2..cc8450e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -50,16 +50,28 @@ static atomic_t nr_task_counters __read_mostly;
* 1 - disallow cpu counters to unpriv
* 2 - disallow kernel profiling to unpriv
*/
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
static inline bool perf_paranoid_cpu(void)
{
- return sysctl_perf_counter_paranoid > 0;
+ return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 0;
}
static inline bool perf_paranoid_kernel(void)
{
- return sysctl_perf_counter_paranoid > 1;
+ return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
+}
+
+/*
+ * When paranoid == 1 we're limiting kernel profiling to not include
+ * other users' information, this includes IRQ stack traces.
+ */
+static bool perf_counter_restricted(struct perf_counter *counter)
+{
+ if (counter->restricted)
+ return hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
+ return 0;
}
int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1571,7 +1583,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
*/
if (cpu != -1) {
/* Must be root to operate on a CPU counter: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_cpu())
return ERR_PTR(-EACCES);
if (cpu < 0 || cpu > num_possible_cpus())
@@ -2458,7 +2470,8 @@ void perf_counter_do_pending(void)
* Callchain support -- arch specific
*/
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
{
return NULL;
}
@@ -2846,6 +2859,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
if (sample_type & PERF_SAMPLE_IP) {
ip = perf_instruction_pointer(data->regs);
+ if (perf_counter_restricted(counter))
+ ip = 0;
header.size += sizeof(ip);
}
@@ -2889,7 +2904,7 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
header.size += perf_counter_read_size(counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- callchain = perf_callchain(data->regs);
+ callchain = perf_callchain(counter, data->regs);
if (callchain) {
callchain_size = (1 + callchain->nr) * sizeof(u64);
@@ -4049,6 +4064,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
counter->pmu = NULL;
counter->ctx = ctx;
counter->oncpu = -1;
+ counter->restricted = perf_paranoid_cpu();
counter->parent = parent_counter;
@@ -4231,7 +4247,7 @@ SYSCALL_DEFINE5(perf_counter_open,
return ret;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ if (perf_paranoid_kernel())
return -EACCES;
}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f669396..d7d2d0d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -71,13 +71,14 @@ static void backtrace_warning(void *data, char *msg)
/* Ignore warnings */
}
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
{
/* Don't bother with IRQ stacks for now */
return -1;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+ unsigned long addr, int reliable)
{
struct backtrace_info *info = data;
^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-20 12:00 ` Peter Zijlstra
@ 2009-08-21 14:21 ` Ingo Molnar
2009-08-24 7:29 ` Peter Zijlstra
0 siblings, 1 reply; 27+ messages in thread
From: Ingo Molnar @ 2009-08-21 14:21 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris,
Linus Torvalds
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Wed, 2009-08-19 at 16:07 +0200, Peter Zijlstra wrote:
> > On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
> >
> > > +static inline bool perf_paranoid_anon(void)
> > > +{
> > > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> > > }
> > >
> > > static inline bool perf_paranoid_kernel(void)
> > > {
> > > - return sysctl_perf_counter_paranoid > 1;
> > > + return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> > > +}
> >
> > OK, this is buggy:
> >
> > - capable() uses current, which is unlikely to be counter->owner,
> > - but even security_real_capable(counter->owner, ...) wouldn't
> > work, since the ->capable() callback isn't NMI safe
> > (selinux takes locks and does allocations in that path).
> >
> > This puts a severe strain on more complex anonymizers since its
> > basically impossible to tell if counter->owner has permissions on
> > current from NMI context.
> >
> > I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
> > counter based on creation time state, unless somebody has a better idea.
> >
> > I could possibly only anonymize IRQ context (SoftIRQ context is
> > difficult since in_softirq() means both in-softirq and
> > softirq-disabled).
>
> Something like the below maybe.. its 3 patches folded and I've no clue
> how to adapt the ppc code, what do people think?
>
> compile tested on x86-64 only.
>
> ---
> arch/x86/include/asm/stacktrace.h | 14 +++++--
> arch/x86/kernel/cpu/perf_counter.c | 66 +++++++++++++++++++++++++++++++----
> arch/x86/kernel/dumpstack.c | 10 +++--
> arch/x86/kernel/dumpstack_32.c | 8 +----
> arch/x86/kernel/dumpstack_64.c | 13 ++-----
> arch/x86/kernel/stacktrace.c | 8 +++--
> arch/x86/oprofile/backtrace.c | 5 ++-
> include/linux/perf_counter.h | 5 ++-
> kernel/perf_counter.c | 30 ++++++++++++----
> kernel/trace/trace_sysprof.c | 5 ++-
> 10 files changed, 117 insertions(+), 47 deletions(-)
>
> diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
> index cf86a5e..7066caa 100644
> --- a/arch/x86/include/asm/stacktrace.h
> +++ b/arch/x86/include/asm/stacktrace.h
> @@ -3,17 +3,23 @@
>
> extern int kstack_depth_to_print;
>
> -int x86_is_stack_id(int id, char *name);
> -
> /* Generic stack tracer with callbacks */
>
> +enum stack_type {
> + STACK_UNKNOWN = -1,
> + STACK_PROCESS = 0,
> + STACK_INTERRUPT = 1,
> + STACK_EXCEPTION = 2,
> +};
> +
> struct stacktrace_ops {
> void (*warning)(void *data, char *msg);
> /* msg must contain %s for the symbol */
> void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
> - void (*address)(void *data, unsigned long address, int reliable);
> + void (*address)(void *data, unsigned long stack,
> + unsigned long address, int reliable);
> /* On negative return stop dumping */
> - int (*stack)(void *data, char *name);
> + int (*stack)(void *data, int type, char *name);
> };
nice generalization ...
> diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> index 9ba1822..2b0528f 100644
> --- a/include/linux/perf_counter.h
> +++ b/include/linux/perf_counter.h
> @@ -439,6 +439,7 @@ enum perf_callchain_context {
> struct perf_callchain_entry {
> __u64 nr;
> __u64 ip[PERF_MAX_STACK_DEPTH];
> + int restricted;
> };
i'd love to have something more specific here - i.e. a context type
ID that identifies these basic types:
- process
- softirq
- hardirq
- NMI
and then let it be up to upper layers to decide what they do with a
restricted entry, and how to further process this information.
And it's not just security: for example it would be interesting to
sample pure, non-irq overhead - as IRQ overhead is often unrelated
to the process being measured.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-21 14:21 ` Ingo Molnar
@ 2009-08-24 7:29 ` Peter Zijlstra
2009-08-24 7:37 ` Ingo Molnar
0 siblings, 1 reply; 27+ messages in thread
From: Peter Zijlstra @ 2009-08-24 7:29 UTC (permalink / raw)
To: Ingo Molnar
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris,
Linus Torvalds
On Fri, 2009-08-21 at 16:21 +0200, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> > diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> > index 9ba1822..2b0528f 100644
> > --- a/include/linux/perf_counter.h
> > +++ b/include/linux/perf_counter.h
> > @@ -439,6 +439,7 @@ enum perf_callchain_context {
> > struct perf_callchain_entry {
> > __u64 nr;
> > __u64 ip[PERF_MAX_STACK_DEPTH];
> > + int restricted;
> > };
>
> i'd love to have something more specific here - i.e. a context type
> ID that identifies these basic types:
>
> - process
> - softirq
> - hardirq
> - NMI
>
> and then let it be up to upper layers to decide what they do with a
> restricted entry, and how to further process this information.
>
> And it's not just security: for example it would be interesting to
> sample pure, non-irq overhead - as IRQ overhead is often unrelated
> to the process being measured.
Yes it is, this is purely about not showing some data. If you don't want
to sample IRQ stuff that's something else, we'd have to grow that
capability in hardware (like the OS/USR bits) or put perf enable/disable
hooks into the irq entry/exit hooks (which doesn't sound all too hot an
idea to me).
Simply not showing the call-trace is something altogether different
from not profiling it.
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
2009-08-24 7:29 ` Peter Zijlstra
@ 2009-08-24 7:37 ` Ingo Molnar
0 siblings, 0 replies; 27+ messages in thread
From: Ingo Molnar @ 2009-08-24 7:37 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul Mackerras, Arnaldo Carvalho de Melo, Frederic Weisbecker,
Mike Galbraith, linux-kernel, Jens Axboe, James Morris,
Linus Torvalds
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, 2009-08-21 at 16:21 +0200, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > > diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
> > > index 9ba1822..2b0528f 100644
> > > --- a/include/linux/perf_counter.h
> > > +++ b/include/linux/perf_counter.h
> > > @@ -439,6 +439,7 @@ enum perf_callchain_context {
> > > struct perf_callchain_entry {
> > > __u64 nr;
> > > __u64 ip[PERF_MAX_STACK_DEPTH];
> > > + int restricted;
> > > };
> >
> > i'd love to have something more specific here - i.e. a context type
> > ID that identifies these basic types:
> >
> > - process
> > - softirq
> > - hardirq
> > - NMI
> >
> > and then let it be up to upper layers to decide what they do with a
> > restricted entry, and how to further process this information.
> >
> > And it's not just security: for example it would be interesting to
> > sample pure, non-irq overhead - as IRQ overhead is often unrelated
> > to the process being measured.
>
> Yes it is, this is purely about not showing some data. If you
> don't want to sample IRQ stuff that's something else, we'd have to
> grow that capability in hardware (like the OS/USR bits) or put
> perf enable/disable hooks into the irq entry/exit hooks (which
> doesn't sound all too hot an idea to me).
>
> Simply not showing the call-trace is something all-together
> different from not profiling it.
Well, it's not about not profiling it - it's about being able to
_separate out_ the samples from the various contexts.
Right now we already have context separators for call-chains:
PERF_CONTEXT_HV = (__u64)-32,
PERF_CONTEXT_KERNEL = (__u64)-128,
PERF_CONTEXT_USER = (__u64)-512,
PERF_CONTEXT_GUEST = (__u64)-2048,
PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176,
PERF_CONTEXT_GUEST_USER = (__u64)-2560,
All i'm suggesting is to also have these context separators:
PERF_CONTEXT_KERNEL_HARDIRQ
PERF_CONTEXT_KERNEL_SOFTIRQ
PERF_CONTEXT_KERNEL /* syscall level */
So that if user-space wants to visualize just a portion of it, it
can do it.
Ingo
^ permalink raw reply [flat|nested] 27+ messages in thread
* [tip:perfcounters/core] perf_counter: Allow sharing of output channels
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
` (2 preceding siblings ...)
2009-08-19 16:19 ` Frederic Weisbecker
@ 2009-08-25 7:39 ` tip-bot for Peter Zijlstra
3 siblings, 0 replies; 27+ messages in thread
From: tip-bot for Peter Zijlstra @ 2009-08-25 7:39 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, paulus, acme, hpa, mingo, eranian, a.p.zijlstra,
efault, fweisbec, tglx, mingo
Commit-ID: a4be7c2778d1fd8e0a8a5607bec91fa221ab2c91
Gitweb: http://git.kernel.org/tip/a4be7c2778d1fd8e0a8a5607bec91fa221ab2c91
Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Wed, 19 Aug 2009 11:18:27 +0200
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Tue, 25 Aug 2009 09:36:13 +0200
perf_counter: Allow sharing of output channels
Provide the ability to configure a counter to send its output
to another (already existing) counter's output stream.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090819092023.980284148@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/perf_counter.h | 5 +++
kernel/perf_counter.c | 83 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 85 insertions(+), 3 deletions(-)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b53f700..e022b84 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2)
#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5)
enum perf_counter_ioc_flags {
PERF_IOC_FLAG_GROUP = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
PERF_CONTEXT_MAX = (__u64)-4095,
};
+#define PERF_FLAG_FD_NO_GROUP (1U << 0)
+#define PERF_FLAG_FD_OUTPUT (1U << 1)
+
#ifdef __KERNEL__
/*
* Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
struct list_head sibling_list;
int nr_siblings;
struct perf_counter *group_leader;
+ struct perf_counter *output;
const struct pmu *pmu;
enum perf_counter_active_state state;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06bf6a4..53abcbe 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,6 +1692,11 @@ static void free_counter(struct perf_counter *counter)
atomic_dec(&nr_task_counters);
}
+ if (counter->output) {
+ fput(counter->output->filp);
+ counter->output = NULL;
+ }
+
if (counter->destroy)
counter->destroy(counter);
@@ -1977,6 +1982,8 @@ unlock:
return ret;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_counter *counter = file->private_data;
@@ -2000,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_COUNTER_IOC_PERIOD:
return perf_counter_period(counter, (u64 __user *)arg);
+ case PERF_COUNTER_IOC_SET_OUTPUT:
+ return perf_counter_set_output(counter, arg);
+
default:
return -ENOTTY;
}
@@ -2270,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->mmap_mutex);
+ if (counter->output) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
if (atomic_inc_not_zero(&counter->mmap_count)) {
if (nr_pages != counter->data->nr_pages)
ret = -EINVAL;
@@ -2655,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
struct perf_counter *counter, unsigned int size,
int nmi, int sample)
{
+ struct perf_counter *output_counter;
struct perf_mmap_data *data;
unsigned int offset, head;
int have_lost;
@@ -2664,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
u64 lost;
} lost_event;
+ rcu_read_lock();
/*
* For inherited counters we send all the output towards the parent.
*/
if (counter->parent)
counter = counter->parent;
- rcu_read_lock();
+ output_counter = rcu_dereference(counter->output);
+ if (output_counter)
+ counter = output_counter;
+
data = rcu_dereference(counter->data);
if (!data)
goto out;
@@ -4218,6 +4238,57 @@ err_size:
goto out;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+ struct perf_counter *output_counter = NULL;
+ struct file *output_file = NULL;
+ struct perf_counter *old_output;
+ int fput_needed = 0;
+ int ret = -EINVAL;
+
+ if (!output_fd)
+ goto set;
+
+ output_file = fget_light(output_fd, &fput_needed);
+ if (!output_file)
+ return -EBADF;
+
+ if (output_file->f_op != &perf_fops)
+ goto out;
+
+ output_counter = output_file->private_data;
+
+ /* Don't chain output fds */
+ if (output_counter->output)
+ goto out;
+
+ /* Don't set an output fd when we already have an output channel */
+ if (counter->data)
+ goto out;
+
+ atomic_long_inc(&output_file->f_count);
+
+set:
+ mutex_lock(&counter->mmap_mutex);
+ old_output = counter->output;
+ rcu_assign_pointer(counter->output, output_counter);
+ mutex_unlock(&counter->mmap_mutex);
+
+ if (old_output) {
+ /*
+ * we need to make sure no existing perf_output_*()
+ * is still referencing this counter.
+ */
+ synchronize_rcu();
+ fput(old_output->filp);
+ }
+
+ ret = 0;
+out:
+ fput_light(output_file, fput_needed);
+ return ret;
+}
+
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
@@ -4240,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open,
int ret;
/* for future expandability... */
- if (flags)
+ if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
return -EINVAL;
ret = perf_copy_attr(attr_uptr, &attr);
@@ -4268,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open,
* Look up the group leader (we will attach this counter to it):
*/
group_leader = NULL;
- if (group_fd != -1) {
+ if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
ret = -EINVAL;
group_file = fget_light(group_fd, &fput_needed);
if (!group_file)
@@ -4310,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open,
if (!counter_file)
goto err_free_put_context;
+ if (flags & PERF_FLAG_FD_OUTPUT) {
+ ret = perf_counter_set_output(counter, group_fd);
+ if (ret)
+ goto err_free_put_context;
+ }
+
counter->filp = counter_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
^ permalink raw reply related [flat|nested] 27+ messages in thread
end of thread, other threads:[~2009-08-25 7:41 UTC | newest]
Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-19 9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
2009-08-19 9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
2009-08-19 14:07 ` Peter Zijlstra
2009-08-19 16:04 ` Frederic Weisbecker
2009-08-20 12:00 ` Peter Zijlstra
2009-08-21 14:21 ` Ingo Molnar
2009-08-24 7:29 ` Peter Zijlstra
2009-08-24 7:37 ` Ingo Molnar
2009-08-19 9:18 ` [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits Peter Zijlstra
2009-08-19 13:30 ` [tip:perfcounters/core] perf_counter: powerpc: Support the anonimized " tip-bot for Peter Zijlstra
2009-08-19 9:18 ` [PATCH 3/4] perf tools: Check perf.data owner Peter Zijlstra
2009-08-19 13:32 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-08-19 9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
2009-08-19 10:58 ` Ingo Molnar
2009-08-19 11:07 ` Peter Zijlstra
2009-08-19 12:41 ` Paul Mackerras
2009-08-19 12:36 ` Paul Mackerras
2009-08-19 12:56 ` Ingo Molnar
2009-08-19 12:56 ` Peter Zijlstra
2009-08-19 13:00 ` Ingo Molnar
2009-08-20 10:13 ` stephane eranian
2009-08-20 10:24 ` Peter Zijlstra
2009-08-20 10:28 ` Ingo Molnar
2009-08-19 16:19 ` Frederic Weisbecker
2009-08-19 16:24 ` Peter Zijlstra
2009-08-19 16:27 ` Frederic Weisbecker
2009-08-25 7:39 ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).