public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Jiri Olsa <jolsa@kernel.org>
To: Peter Zijlstra <a.p.zijlstra@chello.nl>, Ingo Molnar <mingo@kernel.org>
Cc: lkml <linux-kernel@vger.kernel.org>,
	Namhyung Kim <namhyung@kernel.org>,
	David Ahern <dsahern@gmail.com>, Andi Kleen <ak@linux.intel.com>,
	Alexander Shishkin <alexander.shishkin@linux.intel.com>,
	Andy Lutomirski <luto@amacapital.net>,
	Arnaldo Carvalho de Melo <acme@kernel.org>
Subject: [PATCH 06/21] perf: Add PERF_RECORD_USER_DATA event processing
Date: Wed, 24 Jan 2018 12:51:28 +0100	[thread overview]
Message-ID: <20180124115143.14322-7-jolsa@kernel.org> (raw)
In-Reply-To: <20180124115143.14322-1-jolsa@kernel.org>

Add support to skip user data retrieval from the event's
NMI processing and delay it to the time when the task is
jumping back to user space.

Use a task work to retrieve the needed user data and
store a user data event with it, linked by ID to the
original sample.

We can trigger the delayed task work only if the task work
gets executed before the process runs again after the NMI,
because we need its stack as it was at NMI time.

That leaves us with a window during the slow syscall path,
delimited by task_struct::perf_user_data_allowed.

Note that this change only adds the skeleton of this code;
the data retrieval is coming in a following patch.

This patch also adds the PERF_RECORD_USER_DATA event, which
is designed in a similar way to the sample event. Having the
data area governed by a 'sample' type will allow us to add
multiple kinds of user data, such as callchains or user stacks.

Link: http://lkml.kernel.org/n/tip-j8azyyhfgipc6mn8amfpy8hm@git.kernel.org
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
---
 arch/x86/events/core.c          |  18 +++++
 arch/x86/events/intel/ds.c      |   4 +-
 include/linux/init_task.h       |   4 +-
 include/linux/perf_event.h      |   1 +
 include/linux/sched.h           |  15 +++++
 include/uapi/linux/perf_event.h |  17 ++++-
 kernel/events/core.c            | 145 +++++++++++++++++++++++++++++++++++++++-
 7 files changed, 198 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 140d33288e78..8e7fe39a33b8 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2567,3 +2567,21 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
 	cap->events_mask_len	= x86_pmu.events_mask_len;
 }
 EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
+
+int arch_perf_set_user_data(struct task_struct *task, bool set)
+{
+	struct perf_user_data *ud = &task->perf_user_data;
+
+	mutex_lock(&ud->enabled_mutex);
+
+	ud->enabled_count += set ? 1 : -1;
+	WARN_ON_ONCE(ud->enabled_count < 0);
+
+	if (ud->enabled_count == 1)
+		set_tsk_thread_flag(task, TIF_PERF_USER_DATA);
+	else if (ud->enabled_count == 0)
+		clear_tsk_thread_flag(task, TIF_PERF_USER_DATA);
+
+	mutex_unlock(&ud->enabled_mutex);
+	return 0;
+}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 8156e47da7ba..a4329c59b195 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -637,8 +637,10 @@ int intel_pmu_drain_bts_buffer(void)
 	perf_prepare_sample(&header, &data, event, &regs);
 
 	if (perf_output_begin(&handle, event, header.size *
-			      (top - base - skip)))
+			      (top - base - skip))) {
+		perf_prepare_sample_fallback(event);
 		goto unlock;
+	}
 
 	for (at = base; at < top; at++) {
 		/* Filter out any records that contain kernel addresses. */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6a532629c983..55fa53ab9d91 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,9 @@ extern struct cred init_cred;
 # define INIT_PERF_EVENTS(tsk)						\
 	.perf_event_mutex = 						\
 		 __MUTEX_INITIALIZER(tsk.perf_event_mutex),		\
-	.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),
+	.perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),		\
+	.perf_user_data.enabled_mutex =					\
+		 __MUTEX_INITIALIZER(tsk.perf_user_data.enabled_mutex),
 #else
 # define INIT_PERF_EVENTS(tsk)
 #endif
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7546822a1d74..b716bbca6f87 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -948,6 +948,7 @@ extern void perf_prepare_sample(struct perf_event_header *header,
 				struct perf_sample_data *data,
 				struct perf_event *event,
 				struct pt_regs *regs);
+void perf_prepare_sample_fallback(struct perf_event *event);
 
 extern int perf_event_overflow(struct perf_event *event,
 				 struct perf_sample_data *data,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e8079524010..101c49cdde09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -506,6 +506,20 @@ union rcu_special {
 	u32 s; /* Set of bits. */
 };
 
+enum perf_user_data_state {
+	PERF_USER_DATA_STATE_OFF	= 0,
+	PERF_USER_DATA_STATE_ENABLE	= 1,
+	PERF_USER_DATA_STATE_ON		= 2,
+};
+
+struct perf_user_data {
+	struct callback_head		 work;
+	enum perf_user_data_state	 state;
+	u64				 type;
+	int				 enabled_count;
+	struct mutex			 enabled_mutex;
+};
+
 enum perf_event_task_context {
 	perf_invalid_context = -1,
 	perf_hw_context = 0,
@@ -917,6 +931,7 @@ struct task_struct {
 	struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex			perf_event_mutex;
 	struct list_head		perf_event_list;
+	struct perf_user_data		perf_user_data;
 #endif
 #ifdef CONFIG_DEBUG_PREEMPT
 	unsigned long			preempt_disable_ip;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c77c9a2ebbbb..f7b152a2f004 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -370,7 +370,8 @@ struct perf_event_attr {
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
 				namespaces     :  1, /* include namespaces data */
-				__reserved_1   : 35;
+				user_data      :  1, /* generate user data */
+				__reserved_1   : 34;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -618,10 +619,12 @@ struct perf_event_mmap_page {
  *   PERF_RECORD_MISC_MMAP_DATA  - PERF_RECORD_MMAP* events
  *   PERF_RECORD_MISC_COMM_EXEC  - PERF_RECORD_COMM event
  *   PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events
+ *   PERF_RECORD_MISC_USER_DATA  - PERF_RECORD_SAMPLE event
  */
 #define PERF_RECORD_MISC_MMAP_DATA		(1 << 13)
 #define PERF_RECORD_MISC_COMM_EXEC		(1 << 13)
 #define PERF_RECORD_MISC_SWITCH_OUT		(1 << 13)
+#define PERF_RECORD_MISC_USER_DATA		(1 << 13)
 /*
  * Indicates that the content of PERF_SAMPLE_IP points to
  * the actual instruction that triggered the event. See also
@@ -922,6 +925,18 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_NAMESPACES			= 16,
 
+	/*
+	 * Records the user space data for previous
+	 * kernel samples.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				sample_type;
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_USER_DATA			= 17,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e1a1bf8d867..8162cadb6736 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -50,6 +50,7 @@
 #include <linux/sched/mm.h>
 #include <linux/proc_ns.h>
 #include <linux/mount.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -4850,6 +4851,11 @@ void __weak arch_perf_update_userpage(
 {
 }
 
+int __weak arch_perf_set_user_data(struct task_struct *task, bool set)
+{
+	return -EINVAL;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -5938,6 +5944,23 @@ void perf_output_sample(struct perf_output_handle *handle,
 			}
 		}
 	}
+
+	if (event->attr.user_data) {
+		struct perf_user_data *user_data = &current->perf_user_data;
+
+		if (user_data->state == PERF_USER_DATA_STATE_ENABLE) {
+			user_data->state = PERF_USER_DATA_STATE_ON;
+
+			/*
+			 * We cannot do set_notify_resume() from NMI context,
+			 * also, knowing we are already in an interrupted
+			 * context and will pass return to userspace, we can
+			 * simply set TIF_NOTIFY_RESUME.
+			 */
+			task_work_add(current, &user_data->work, false);
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+		}
+	}
 }
 
 static u64 perf_virt_to_phys(u64 virt)
@@ -5972,6 +5995,20 @@ static u64 perf_virt_to_phys(u64 virt)
 	return phys_addr;
 }
 
+struct user_data {
+	u64	type;
+	bool	allow;
+};
+
+static void user_data(struct user_data *ud, struct perf_event *event)
+{
+	ud->allow = event->attr.user_data &&		/* is user data event	*/
+		    current->perf_user_data_allowed &&	/* is in allowed area	*/
+		    current->mm &&			/* is normal task	*/
+		    !(current->flags & PF_EXITING);	/* is not exiting task	*/
+	ud->type  = 0;
+}
+
 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
 
 static struct perf_callchain_entry *
@@ -5998,6 +6035,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 			 struct pt_regs *regs)
 {
 	u64 sample_type = event->attr.sample_type;
+	struct user_data ud;
+
+	user_data(&ud, event);
 
 	header->type = PERF_RECORD_SAMPLE;
 	header->size = sizeof(*header) + event->header_size;
@@ -6111,6 +6151,27 @@ void perf_prepare_sample(struct perf_event_header *header,
 
 	if (sample_type & PERF_SAMPLE_PHYS_ADDR)
 		data->phys_addr = perf_virt_to_phys(data->addr);
+
+	if (ud.allow && ud.type) {
+		struct perf_user_data *user_data = &current->perf_user_data;
+
+		header->misc |= PERF_RECORD_MISC_USER_DATA;
+		user_data->type |= ud.type;
+
+		if (!user_data->state)
+			user_data->state = PERF_USER_DATA_STATE_ENABLE;
+	}
+}
+
+void perf_prepare_sample_fallback(struct perf_event *event)
+{
+	struct perf_user_data *user_data = &current->perf_user_data;
+
+	if (!event->attr.user_data)
+		return;
+
+	if (user_data->state == PERF_USER_DATA_STATE_ENABLE)
+		user_data->state = PERF_USER_DATA_STATE_OFF;
 }
 
 static void __always_inline
@@ -6129,8 +6190,10 @@ __perf_event_output(struct perf_event *event,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (output_begin(&handle, event, header.size))
+	if (output_begin(&handle, event, header.size)) {
+		perf_prepare_sample_fallback(event);
 		goto exit;
+	}
 
 	perf_output_sample(&handle, &header, data, event);
 
@@ -6285,6 +6348,67 @@ perf_iterate_sb(perf_iterate_f output, void *data,
 	rcu_read_unlock();
 }
 
+struct perf_user_data_event {
+	struct {
+		struct perf_event_header	header;
+		u64				type;
+	} event_id;
+};
+
+static void perf_user_data_output(struct perf_event *event, void *data)
+{
+	struct perf_user_data *user_data = &current->perf_user_data;
+	struct perf_user_data_event *user = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	u16 header_size = user->event_id.header.size;
+
+	if (!event->attr.user_data)
+		return;
+
+	user->event_id.type  = event->attr.sample_type & user_data->type;
+
+	perf_event_header__init_id(&user->event_id.header, &sample, event);
+
+	if (perf_output_begin(&handle, event, user->event_id.header.size))
+		goto out;
+
+	perf_output_put(&handle, user->event_id);
+	perf_event__output_id_sample(event, &handle, &sample);
+	perf_output_end(&handle);
+out:
+	user->event_id.header.size = header_size;
+}
+
+static void perf_user_data_event(struct perf_user_data *user_data)
+{
+	struct perf_user_data_event event;
+
+	event = (struct perf_user_data_event) {
+		.event_id = {
+			.header	= {
+				.type = PERF_RECORD_USER_DATA,
+				.misc = 0,
+				.size = sizeof(event.event_id),
+			},
+		},
+	};
+
+	perf_iterate_sb(perf_user_data_output, &event, NULL);
+
+	/*
+	 * User data events are disabled (perf_user_data_allowed),
+	 * so there's no race and we can set new id and zero type.
+	 */
+	user_data->type  = 0;
+	user_data->state = PERF_USER_DATA_STATE_OFF;
+}
+
+static void perf_user_data_work(struct callback_head *work)
+{
+	perf_user_data_event(&current->perf_user_data);
+}
+
 /*
  * Clear all file-based filters at exec, they'll have to be
  * re-instated when/if these objects are mmapped again.
@@ -9919,16 +10043,26 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
+	if (attr.user_data) {
+		if (!task) {
+			err = -EINVAL;
+			goto err_group_fd;
+		}
+		err = arch_perf_set_user_data(task, true);
+		if (err)
+			goto err_task;
+	}
+
 	if (task && group_leader &&
 	    group_leader->attr.inherit != attr.inherit) {
 		err = -EINVAL;
-		goto err_task;
+		goto err_user_data;
 	}
 
 	if (task) {
 		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
 		if (err)
-			goto err_task;
+			goto err_user_data;
 
 		/*
 		 * Reuse ptrace permission checks for now.
@@ -10252,6 +10386,9 @@ SYSCALL_DEFINE5(perf_event_open,
 err_cred:
 	if (task)
 		mutex_unlock(&task->signal->cred_guard_mutex);
+err_user_data:
+	if (attr.user_data && task)
+		arch_perf_set_user_data(task, false);
 err_task:
 	if (task)
 		put_task_struct(task);
@@ -10985,6 +11122,8 @@ int perf_event_init_task(struct task_struct *child)
 	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
 	mutex_init(&child->perf_event_mutex);
 	INIT_LIST_HEAD(&child->perf_event_list);
+	init_task_work(&child->perf_user_data.work, perf_user_data_work);
+	mutex_init(&child->perf_user_data.enabled_mutex);
 
 	for_each_task_context_nr(ctxn) {
 		ret = perf_event_init_context(child, ctxn);
-- 
2.13.6

  parent reply	other threads:[~2018-01-24 11:52 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-24 11:51 [RFC 00/21] perf tools: Add perf_evsel__is_sample_bit function Jiri Olsa
2018-01-24 11:51 ` [PATCH 01/21] " Jiri Olsa
2018-01-24 11:51 ` [PATCH 02/21] perf tools: Add perf_sample__process function Jiri Olsa
2018-01-24 11:51 ` [PATCH 03/21] perf tools: Add callchain__printf for pure callchain dump Jiri Olsa
2018-01-24 11:51 ` [PATCH 04/21] perf tools: Add perf_sample__copy|free functions Jiri Olsa
2018-01-24 11:51 ` [PATCH 05/21] perf: Add TIF_PERF_USER_DATA bit Jiri Olsa
2018-01-24 11:51 ` Jiri Olsa [this message]
2018-01-24 11:51 ` [PATCH 07/21] perf: Add PERF_SAMPLE_USER_DATA_ID sample type Jiri Olsa
2018-01-24 11:51 ` [PATCH 08/21] perf: Add PERF_SAMPLE_CALLCHAIN to user data event Jiri Olsa
2018-01-24 11:51 ` [PATCH 09/21] perf: Export running sample length values through debugfs Jiri Olsa
2018-01-24 11:51 ` [PATCH 10/21] perf tools: Sync perf_event.h uapi header Jiri Olsa
2018-01-24 11:51 ` [PATCH 11/21] perf tools: Add perf_sample__parse function Jiri Olsa
2018-01-24 11:51 ` [PATCH 12/21] perf tools: Add struct parse_args arg to perf_sample__parse Jiri Olsa
2018-01-24 11:51 ` [PATCH 13/21] perf tools: Add support to parse user data event Jiri Olsa
2018-01-24 11:51 ` [PATCH 14/21] perf tools: Add support to dump user data event info Jiri Olsa
2018-01-24 11:51 ` [PATCH 15/21] perf report: Add delayed user data event processing Jiri Olsa
2018-01-24 11:51 ` [PATCH 16/21] perf record: Enable delayed user data events Jiri Olsa
2018-01-24 11:51 ` [PATCH 17/21] perf script: Add support to display " Jiri Olsa
2018-01-24 11:51 ` [PATCH 18/21] perf script: Add support to display user data ID Jiri Olsa
2018-01-24 11:51 ` [PATCH 19/21] perf script: Display USER_DATA misc char for sample Jiri Olsa
2018-01-24 11:51 ` [PATCH 20/21] perf report: Add user data processing stats Jiri Olsa
2018-01-24 11:51 ` [PATCH 21/21] perf report: Add --stats=ud option to display user data debug info Jiri Olsa
2018-01-24 12:11 ` [RFC 00/21] perf tools: Add user data delayed processing Jiri Olsa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180124115143.14322-7-jolsa@kernel.org \
    --to=jolsa@kernel.org \
    --cc=a.p.zijlstra@chello.nl \
    --cc=acme@kernel.org \
    --cc=ak@linux.intel.com \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=dsahern@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@kernel.org \
    --cc=namhyung@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox