All of lore.kernel.org
 help / color / mirror / Atom feed
From: Peter Zijlstra <peterz@infradead.org>
To: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>,
	Arnaldo Carvalho de Melo <acme@redhat.com>,
	Frederic Weisbecker <fweisbec@gmail.com>,
	Mike Galbraith <efault@gmx.de>,
	linux-kernel@vger.kernel.org, Jens Axboe <jens.axboe@oracle.com>,
	James Morris <jmorris@namei.org>,
	Linus Torvalds <torvalds@linux-foundation.org>
Subject: Re: [PATCH 1/4] perf_counter: Default to higher paranoia level
Date: Thu, 20 Aug 2009 14:00:06 +0200	[thread overview]
Message-ID: <1250769606.8282.181.camel@twins> (raw)
In-Reply-To: <1250690853.8282.59.camel@twins>

On Wed, 2009-08-19 at 16:07 +0200, Peter Zijlstra wrote:
> On Wed, 2009-08-19 at 11:18 +0200, Peter Zijlstra wrote:
> 
> > +static inline bool perf_paranoid_anon(void)
> > +{
> > +	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
> >  }
> >  
> >  static inline bool perf_paranoid_kernel(void)
> >  {
> > -	return sysctl_perf_counter_paranoid > 1;
> > +	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 2;
> > +}
> 
> OK, this is buggy:
> 
>  - capable() uses current, which is unlikely to be counter->owner,
>  - but even security_real_capable(counter->owner, ...) wouldn't
>    work, since the ->capable() callback isn't NMI safe
>    (selinux takes locks and does allocations in that path).
> 
> This puts a severe strain on more complex anonymizers since its
> basically impossible to tell if counter->owner has permissions on
> current from NMI context.
> 
> I'll fix up this patch to pre-compute the perf_paranoid_anon_ip() per
> counter based on creation time state, unless somebody has a better idea.
> 
> I could possibly only anonymize IRQ context (SoftIRQ context is
> difficult since in_softirq() means both in-softirq and
> softirq-disabled).

Something like the below maybe.. its 3 patches folded and I've no clue
how to adapt the ppc code, what do people think?

compile tested on x86-64 only.

---
 arch/x86/include/asm/stacktrace.h  |   14 +++++--
 arch/x86/kernel/cpu/perf_counter.c |   66 +++++++++++++++++++++++++++++++----
 arch/x86/kernel/dumpstack.c        |   10 +++--
 arch/x86/kernel/dumpstack_32.c     |    8 +----
 arch/x86/kernel/dumpstack_64.c     |   13 ++-----
 arch/x86/kernel/stacktrace.c       |    8 +++--
 arch/x86/oprofile/backtrace.c      |    5 ++-
 include/linux/perf_counter.h       |    5 ++-
 kernel/perf_counter.c              |   30 ++++++++++++----
 kernel/trace/trace_sysprof.c       |    5 ++-
 10 files changed, 117 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e..7066caa 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,17 +3,23 @@
 
 extern int kstack_depth_to_print;
 
-int x86_is_stack_id(int id, char *name);
-
 /* Generic stack tracer with callbacks */
 
+enum stack_type {
+	STACK_UNKNOWN	= -1,
+	STACK_PROCESS   = 0,
+	STACK_INTERRUPT = 1,
+	STACK_EXCEPTION = 2,
+};
+
 struct stacktrace_ops {
 	void (*warning)(void *data, char *msg);
 	/* msg must contain %s for the symbol */
 	void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
-	void (*address)(void *data, unsigned long address, int reliable);
+	void (*address)(void *data, unsigned long stack,
+			unsigned long address, int reliable);
 	/* On negative return stop dumping */
-	int (*stack)(void *data, char *name);
+	int (*stack)(void *data, int type, char *name);
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 396e35d..a6ddb3b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -2108,7 +2108,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
 
 static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
 static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
-static DEFINE_PER_CPU(int, in_nmi_frame);
+static DEFINE_PER_CPU(int, skip_frame);
 
 
 static void
@@ -2122,21 +2122,49 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
-	per_cpu(in_nmi_frame, smp_processor_id()) =
-			x86_is_stack_id(NMI_STACK, name);
+	struct perf_callchain_entry *entry = data;
+	int skip = 0;
+
+	/*
+	 * Always skip exception (NMI) context
+	 */
+	if (type == STACK_EXCEPTION)
+		skip = 1;
+
+	/*
+	 * If we're restricted, don't show IRQ context
+	 */
+	if (entry->restricted && type == STACK_INTERRUPT)
+		skip = 1;
+
+	__get_cpu_var(skip_frame) = skip;
 
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	struct perf_callchain_entry *entry = data;
 
-	if (per_cpu(in_nmi_frame, smp_processor_id()))
+	if (__get_cpu_var(skip_frame))
 		return;
 
+#ifdef CONFIG_4KSTACKS
+	if (entry->restricted) {
+		struct thread_info *tinfo = current_thread_info();
+		unsigned long task_stack = (unsigned long)tinfo;
+
+		/*
+		 * If its not on the task stack, don't show it.
+		 */
+		if (stack < task_stack || stack >= task_stack + PAGE_SIZE)
+			return;
+	}
+#endif
+
 	if (reliable)
 		callchain_store(entry, addr);
 }
@@ -2153,8 +2181,28 @@ static const struct stacktrace_ops backtrace_ops = {
 static void
 perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
+	int irq = hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
 	callchain_store(entry, PERF_CONTEXT_KERNEL);
-	callchain_store(entry, regs->ip);
+	if (entry->restricted && !irq)
+		callchain_store(entry, regs->ip);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * i386 is a friggin mess, 4KSTACKS makes it somewhat saner but
+	 * still don't have a way to identify NMI entries as they are
+	 * nested onto whatever stack is there.
+	 *
+	 * So until i386 starts using per context stacks unconditionally
+	 * and fixed up the unwinder (dump_trace behaves differently between
+	 * i386 and x86_64), we'll have to unconditionally truncate kernel
+	 * stacks that have IRQ context bits in them.
+	 */
+#ifndef CONFIG_4KSTACKS
+	if (irq)
+		return;
+#endif
+#endif
 
 	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
 }
@@ -2255,7 +2303,8 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		perf_callchain_user(regs, entry);
 }
 
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
 {
 	struct perf_callchain_entry *entry;
 
@@ -2265,6 +2314,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
+	entry->restricted = counter->restricted;
 
 	perf_do_callchain(regs, entry);
 
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371..978ae89 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -51,7 +51,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
 	index -= *graph;
 	ret_addr = task->ret_stack[index].ret;
 
-	ops->address(data, ret_addr, 1);
+	ops->address(data, 0, ret_addr, 1);
 
 	(*graph)++;
 }
@@ -96,12 +96,14 @@ print_context_stack(struct thread_info *tinfo,
 
 		addr = *stack;
 		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + sizeof(long)) {
-				ops->address(data, addr, 1);
+			unsigned long stk = (unsigned long)stack;
+
+			if (stk == bp + sizeof(long)) {
+				ops->address(data, stk, addr, 1);
 				frame = frame->next_frame;
 				bp = (unsigned long) frame;
 			} else {
-				ops->address(data, addr, 0);
+				ops->address(data, stk, addr, 0);
 			}
 			print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
 		}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba..be633ed 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -19,12 +19,6 @@
 
 #include "dumpstack.h"
 
-/* Just a stub for now */
-int x86_is_stack_id(int id, char *name)
-{
-	return 0;
-}
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp,
 		const struct stacktrace_ops *ops, void *data)
@@ -64,7 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		stack = (unsigned long *)context->previous_esp;
 		if (!stack)
 			break;
-		if (ops->stack(data, "IRQ") < 0)
+		if (ops->stack(data, STACK_UNKNOWN, "IRQ") < 0)
 			break;
 		touch_nmi_watchdog();
 	}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a32..9309c5c 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -32,11 +32,6 @@ static char x86_stack_ids[][8] = {
 #endif
 	};
 
-int x86_is_stack_id(int id, char *name)
-{
-	return x86_stack_ids[id - 1] == name;
-}
-
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
 					unsigned *usedp, char **idp)
 {
@@ -155,12 +150,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 						&used, &id);
 
 		if (estack_end) {
-			if (ops->stack(data, id) < 0)
+			if (ops->stack(data, STACK_EXCEPTION, id) < 0)
 				break;
 
 			bp = print_context_stack(tinfo, stack, bp, ops,
 						 data, estack_end, &graph);
-			ops->stack(data, "<EOE>");
+			ops->stack(data, STACK_PROCESS, "<EOE>");
 			/*
 			 * We link to the next stack via the
 			 * second-to-last pointer (index -2 to end) in the
@@ -175,7 +170,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
 
 			if (stack >= irq_stack && stack < irq_stack_end) {
-				if (ops->stack(data, "IRQ") < 0)
+				if (ops->stack(data, STACK_INTERRUPT, "IRQ") < 0)
 					break;
 				bp = print_context_stack(tinfo, stack, bp,
 					ops, data, irq_stack_end, &graph);
@@ -186,7 +181,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				 */
 				stack = (unsigned long *) (irq_stack_end[-1]);
 				irq_stack_end = NULL;
-				ops->stack(data, "EOI");
+				ops->stack(data, STACK_PROCESS, "EOI");
 				continue;
 			}
 		}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207..a1415f4 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -18,12 +18,13 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 {
 }
 
-static int save_stack_stack(void *data, char *name)
+static int save_stack_stack(void *data, int type, char *name)
 {
 	return 0;
 }
 
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void save_stack_address(void *data, unsigned long stack,
+			       unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = data;
 	if (!reliable)
@@ -37,7 +38,8 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
 }
 
 static void
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+save_stack_address_nosched(void *data, unsigned long stack,
+			   unsigned long addr, int reliable)
 {
 	struct stack_trace *trace = (struct stack_trace *)data;
 	if (!reliable)
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897b..7232bc9 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -26,13 +26,14 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
 	/* Yes, we want all stacks */
 	return 0;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	unsigned int *depth = data;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9ba1822..2b0528f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -439,6 +439,7 @@ enum perf_callchain_context {
 struct perf_callchain_entry {
 	__u64				nr;
 	__u64				ip[PERF_MAX_STACK_DEPTH];
+	int				restricted;
 };
 
 struct perf_raw_record {
@@ -535,6 +536,7 @@ struct perf_counter {
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
 	int				nr_siblings;
+	int				restricted;
 	struct perf_counter		*group_leader;
 	const struct pmu		*pmu;
 
@@ -754,7 +756,8 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 36f65e2..cc8450e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -50,16 +50,28 @@ static atomic_t nr_task_counters __read_mostly;
  *  1 - disallow cpu counters to unpriv
  *  2 - disallow kernel profiling to unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
 
 static inline bool perf_paranoid_cpu(void)
 {
-	return sysctl_perf_counter_paranoid > 0;
+	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 0;
 }
 
 static inline bool perf_paranoid_kernel(void)
 {
-	return sysctl_perf_counter_paranoid > 1;
+	return !capable(CAP_SYS_ADMIN) && sysctl_perf_counter_paranoid > 1;
+}
+
+/*
+ * When paranoid == 1 we're limiting kernel profiling to not include
+ * other users' information, this includes IRQ stack traces.
+ */
+static bool perf_counter_restricted(struct perf_counter *counter)
+{
+	if (counter->restricted)
+		return hardirq_count() - (in_nmi() ? 0 : HARDIRQ_OFFSET);
+
+	return 0;
 }
 
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1571,7 +1583,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu())
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
@@ -2458,7 +2470,8 @@ void perf_counter_do_pending(void)
  * Callchain support -- arch specific
  */
 
-__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *
+perf_callchain(struct perf_counter *counter, struct pt_regs *regs)
 {
 	return NULL;
 }
@@ -2846,6 +2859,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 
 	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(data->regs);
+		if (perf_counter_restricted(counter))
+			ip = 0;
 		header.size += sizeof(ip);
 	}
 
@@ -2889,7 +2904,7 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(data->regs);
+		callchain = perf_callchain(counter, data->regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
@@ -4049,6 +4064,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	counter->pmu		= NULL;
 	counter->ctx		= ctx;
 	counter->oncpu		= -1;
+	counter->restricted	= perf_paranoid_cpu();
 
 	counter->parent		= parent_counter;
 
@@ -4231,7 +4247,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 		return ret;
 
 	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_kernel())
 			return -EACCES;
 	}
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f669396..d7d2d0d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -71,13 +71,14 @@ static void backtrace_warning(void *data, char *msg)
 	/* Ignore warnings */
 }
 
-static int backtrace_stack(void *data, char *name)
+static int backtrace_stack(void *data, int type, char *name)
 {
 	/* Don't bother with IRQ stacks for now */
 	return -1;
 }
 
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static void backtrace_address(void *data, unsigned long stack,
+			      unsigned long addr, int reliable)
 {
 	struct backtrace_info *info = data;
 


  parent reply	other threads:[~2009-08-20 12:00 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-08-19  9:18 [PATCH 0/4] perf counter bits Peter Zijlstra
2009-08-19  9:18 ` [PATCH 1/4] perf_counter: Default to higher paranoia level Peter Zijlstra
2009-08-19 14:07   ` Peter Zijlstra
2009-08-19 16:04     ` Frederic Weisbecker
2009-08-20 12:00     ` Peter Zijlstra [this message]
2009-08-21 14:21       ` Ingo Molnar
2009-08-24  7:29         ` Peter Zijlstra
2009-08-24  7:37           ` Ingo Molnar
2009-08-19  9:18 ` [PATCH 2/4] perf_counter: powerpc: Support the anonymized kernel callchain bits Peter Zijlstra
2009-08-19 13:30   ` [tip:perfcounters/core] perf_counter: powerpc: Support the anonimized " tip-bot for Peter Zijlstra
2009-08-19  9:18 ` [PATCH 3/4] perf tools: Check perf.data owner Peter Zijlstra
2009-08-19 13:32   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra
2009-08-19  9:18 ` [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels Peter Zijlstra
2009-08-19 10:58   ` Ingo Molnar
2009-08-19 11:07     ` Peter Zijlstra
2009-08-19 12:41     ` Paul Mackerras
2009-08-19 12:36   ` Paul Mackerras
2009-08-19 12:56     ` Ingo Molnar
2009-08-19 12:56     ` Peter Zijlstra
2009-08-19 13:00       ` Ingo Molnar
2009-08-20 10:13       ` stephane eranian
2009-08-20 10:24         ` Peter Zijlstra
2009-08-20 10:28         ` Ingo Molnar
2009-08-19 16:19   ` Frederic Weisbecker
2009-08-19 16:24     ` Peter Zijlstra
2009-08-19 16:27       ` Frederic Weisbecker
2009-08-25  7:39   ` [tip:perfcounters/core] " tip-bot for Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1250769606.8282.181.camel@twins \
    --to=peterz@infradead.org \
    --cc=acme@redhat.com \
    --cc=efault@gmx.de \
    --cc=fweisbec@gmail.com \
    --cc=jens.axboe@oracle.com \
    --cc=jmorris@namei.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=paulus@samba.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.