* [PATCH v2 1/6] perf: Introduce extended syscall error reporting
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
@ 2015-08-24 14:32 ` Alexander Shishkin
2015-08-31 18:47 ` Andy Shevchenko
2015-08-24 14:32 ` [PATCH v2 2/6] perf: Add file name and line number to perf extended error reports Alexander Shishkin
` (6 subsequent siblings)
7 siblings, 1 reply; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
It has been pointed several times out that perf syscall error reporting
leaves a lot to be desired [1].
This patch introduces a fairly simple extension that allows call sites
to annotate their error codes with arbitrary strings, which will then
be copied to userspace (if they asked for it) along with the module
name that produced the error message in JSON format. This way, we can
provide both human-readable and machine-parsable information to user and
leave room for extensions in the future, such as file name and line
number if kernel debugging is enabled.
Each error "site" is referred to by its index, which is folded into an
integer error value within the range of [-PERF_ERRNO, -MAX_ERRNO], where
PERF_ERRNO is chosen to be below any known error codes, but still leaving
enough room to enumerate error sites. This way, all the traditional macros
will still handle these as error codes and we'd only have to convert them
to their original values right before returning to userspace. This way we
also don't have to worry about keeping a separate pointer to the error
message inside a perf_event.
[1] http://marc.info/?l=linux-kernel&m=141470811013082
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
include/linux/perf_event.h | 71 ++++++++++++++++++++++++++++++++++
include/uapi/linux/perf_event.h | 8 +++-
kernel/events/core.c | 86 ++++++++++++++++++++++++++++++++++++++---
3 files changed, 159 insertions(+), 6 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2027809433..8e37939d21 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -56,6 +56,77 @@ struct perf_guest_info_callbacks {
#include <linux/cgroup.h>
#include <asm/local.h>
+#ifndef PERF_MODNAME
+#define PERF_MODNAME "perf"
+#endif
+
+/*
+ * Extended error reporting: annotate an error code with a string
+ * and a module name to help users diagnase problems with their
+ * attributes and whatnot.
+ */
+struct perf_err_site {
+ const char *message;
+ const char *owner;
+ const int code;
+};
+
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Place all occurrences of struct perf_err_site into a special section,
+ * so that we can find out their offsets, which we'll use to refer back
+ * to the error sites.
+ */
+extern const struct perf_err_site __start___perf_err[], __stop___perf_err[];
+
+#define __perf_err(__c, __m) ({ \
+ static struct perf_err_site \
+ __attribute__ ((__section__("__perf_err"))) \
+ __err_site = { \
+ .message = (__m), \
+ .owner = PERF_MODNAME, \
+ .code = __builtin_constant_p((__c)) ? \
+ (__c) : -EINVAL, \
+ }; \
+ &__err_site; \
+})
+
+/*
+ * Use part of the [-1, -MAX_ERRNO] errno range for perf's extended error
+ * reporting. Anything within [-PERF_ERRNO, -MAX_ERRNO] is an index of a
+ * perf_err_site structure within __perf_err section. 3k should be enough
+ * for everybody, but let's add a boot-time warning just in case it overflows
+ * one day.
+ */
+#define PERF_ERRNO 1024
+
+static inline int perf_errno(const struct perf_err_site *site)
+{
+ unsigned long err = site - __start___perf_err;
+
+ return -(int)err - PERF_ERRNO;
+}
+
+static inline const struct perf_err_site *perf_errno_to_site(int err)
+{
+ return __start___perf_err - err - PERF_ERRNO;
+}
+
+#ifdef MODULE
+/*
+ * Module support is a tad trickier, but far from rocket surgery. Let's
+ * bypass it for now.
+ */
+#define perf_err(__c, __m) (__c)
+#else
+#define perf_err(__c, __m) (perf_errno(__perf_err((__c), (__m))))
+#endif
+
+#define PERF_ERR_PTR(__e, __m) (ERR_PTR(perf_err(__e, __m)))
+
+#endif
+
struct perf_callchain_entry {
__u64 nr;
__u64 ip[PERF_MAX_STACK_DEPTH];
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145cda..73b6919954 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -264,6 +264,7 @@ enum perf_event_read_format {
/* add: sample_stack_user */
#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */
#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */
+#define PERF_ATTR_SIZE_VER6 120 /* add: perf_err */
/*
* Hardware event_id to monitor via a performance monitoring event:
@@ -375,7 +376,12 @@ struct perf_event_attr {
* Wakeup watermark for AUX area
*/
__u32 aux_watermark;
- __u32 __reserved_2; /* align to __u64 */
+
+ /*
+ * Extended error reporting buffer
+ */
+ __u32 perf_err_size;
+ __u64 perf_err;
};
#define perf_flags(attr) (*(&(attr)->read_format + 1))
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ae16867670..c6f43d1d5f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,6 +49,79 @@
#include <asm/irq_regs.h>
+static bool extended_reporting_enabled(struct perf_event_attr *attr)
+{
+ if (attr->size >= PERF_ATTR_SIZE_VER6 &&
+ attr->perf_err_size > 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * Provide a JSON formatted error report to the user if they asked for it.
+ */
+static void perf_error_report_site(struct perf_event_attr *attr,
+ const struct perf_err_site *site)
+{
+ unsigned long len;
+ char *buffer;
+
+ if (!site || !extended_reporting_enabled(attr))
+ return;
+
+ /* in case of nested perf_err()s, which you shouldn't really do */
+ while (site->code <= -PERF_ERRNO)
+ site = perf_errno_to_site(site->code);
+
+ buffer = kasprintf(GFP_KERNEL,
+ "{\n"
+ "\t\"code\": %d,\n"
+ "\t\"module\": \"%s\",\n"
+ "\t\"message\": \"%s\"\n"
+ "}\n",
+ site->code, site->owner, site->message
+ );
+ if (!buffer)
+ return;
+
+ /* trim the buffer to the supplied boundary */
+ len = strlen(buffer);
+ if (len >= attr->perf_err_size) {
+ len = attr->perf_err_size - 1;
+ buffer[len] = 0;
+ }
+
+ if (copy_to_user((void __user *)attr->perf_err, buffer, len + 1)) {
+ /* if we failed to copy once, don't bother later */
+ attr->perf_err_size = 0;
+ }
+
+ kfree(buffer);
+}
+
+/*
+ * Synchronous version of perf_err(), for the paths where we return immediately
+ * back to userspace.
+ */
+#define perf_err_sync(__attr, __c, __m) ({ \
+ perf_error_report_site(__attr, __perf_err((__c), (__m))); \
+ (__c); \
+})
+
+static int perf_error_report(struct perf_event_attr *attr, int err)
+{
+ const struct perf_err_site *site;
+
+ if (err > -PERF_ERRNO)
+ return err;
+
+ site = perf_errno_to_site(err);
+ perf_error_report_site(attr, site);
+
+ return site->code;
+}
+
static struct workqueue_struct *perf_wq;
typedef int (*remote_function_f)(void *);
@@ -3904,7 +3977,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
ret = perf_read_hw(event, buf, count);
perf_event_ctx_unlock(event, ctx);
- return ret;
+ return perf_error_report(&event->attr, ret);
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
@@ -4152,7 +4225,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
- return ret;
+ return perf_error_report(&event->attr, ret);
}
#ifdef CONFIG_COMPAT
@@ -4752,7 +4825,7 @@ aux_unlock:
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
- return ret;
+ return perf_error_report(&event->attr, ret);
}
static int perf_fasync(int fd, struct file *filp, int on)
@@ -4766,7 +4839,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
mutex_unlock(&inode->i_mutex);
if (retval < 0)
- return retval;
+ return perf_error_report(&event->attr, retval);
return 0;
}
@@ -8352,7 +8425,7 @@ err_group_fd:
fdput(group);
err_fd:
put_unused_fd(event_fd);
- return err;
+ return perf_error_report(&attr, err);
}
/**
@@ -9130,6 +9203,9 @@ void __init perf_event_init(void)
*/
BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
!= 1024);
+
+ /* Too many error sites; see the comment at PERF_ERRNO definition */
+ WARN_ON(__stop___perf_err - __start___perf_err > MAX_ERRNO - PERF_ERRNO);
}
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* Re: [PATCH v2 1/6] perf: Introduce extended syscall error reporting
2015-08-24 14:32 ` [PATCH v2 1/6] " Alexander Shishkin
@ 2015-08-31 18:47 ` Andy Shevchenko
2015-09-01 6:38 ` Alexander Shishkin
0 siblings, 1 reply; 34+ messages in thread
From: Andy Shevchenko @ 2015-08-31 18:47 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Peter Zijlstra, Ingo Molnar, linux-kernel@vger.kernel.org,
Adrian Hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Johannes Berg
On Mon, Aug 24, 2015 at 5:32 PM, Alexander Shishkin
<alexander.shishkin@linux.intel.com> wrote:
One small comment below.
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -49,6 +49,79 @@
>
> #include <asm/irq_regs.h>
>
> +static bool extended_reporting_enabled(struct perf_event_attr *attr)
> +{
> + if (attr->size >= PERF_ATTR_SIZE_VER6 &&
> + attr->perf_err_size > 0)
> + return true;
> +
> + return false;
> +}
> +
> +/*
> + * Provide a JSON formatted error report to the user if they asked for it.
> + */
> +static void perf_error_report_site(struct perf_event_attr *attr,
> + const struct perf_err_site *site)
> +{
> + unsigned long len;
> + char *buffer;
> +
> + if (!site || !extended_reporting_enabled(attr))
> + return;
> +
> + /* in case of nested perf_err()s, which you shouldn't really do */
> + while (site->code <= -PERF_ERRNO)
> + site = perf_errno_to_site(site->code);
> +
> + buffer = kasprintf(GFP_KERNEL,
> + "{\n"
> + "\t\"code\": %d,\n"
> + "\t\"module\": \"%s\",\n"
> + "\t\"message\": \"%s\"\n"
> + "}\n",
> + site->code, site->owner, site->message
> + );
> + if (!buffer)
> + return;
> +
> + /* trim the buffer to the supplied boundary */
> + len = strlen(buffer);
> + if (len >= attr->perf_err_size) {
> + len = attr->perf_err_size - 1;
> + buffer[len] = 0;
> + }
len = strnlen(buffer, attr->perf_err_size);
buffer[len] = 0;
And perhaps perf_err_size has to be length (perf_err_len) ?
> +
> + if (copy_to_user((void __user *)attr->perf_err, buffer, len + 1)) {
> + /* if we failed to copy once, don't bother later */
> + attr->perf_err_size = 0;
Kaboom next time on buffer[-1] = 0; since len >= 0?
> + }
> +
> + kfree(buffer);
> +}
--
With Best Regards,
Andy Shevchenko
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 1/6] perf: Introduce extended syscall error reporting
2015-08-31 18:47 ` Andy Shevchenko
@ 2015-09-01 6:38 ` Alexander Shishkin
0 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-09-01 6:38 UTC (permalink / raw)
To: Andy Shevchenko
Cc: Peter Zijlstra, Ingo Molnar, linux-kernel@vger.kernel.org,
Adrian Hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Johannes Berg
Andy Shevchenko <andy.shevchenko@gmail.com> writes:
> On Mon, Aug 24, 2015 at 5:32 PM, Alexander Shishkin
> <alexander.shishkin@linux.intel.com> wrote:
>> + /* trim the buffer to the supplied boundary */
>> + len = strlen(buffer);
>> + if (len >= attr->perf_err_size) {
>> + len = attr->perf_err_size - 1;
>> + buffer[len] = 0;
>> + }
>
> len = strnlen(buffer, attr->perf_err_size);
> buffer[len] = 0;
>
> And perhaps perf_err_size has to be length (perf_err_len) ?
>
>> +
>> + if (copy_to_user((void __user *)attr->perf_err, buffer, len + 1)) {
>> + /* if we failed to copy once, don't bother later */
>> + attr->perf_err_size = 0;
>
> Kaboom next time on buffer[-1] = 0; since len >= 0?
Of course, we never get here if attr::perf_err_size is 0, there's an
explicit check for that, but nice try.
Regards,
--
Alex
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 2/6] perf: Add file name and line number to perf extended error reports
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 1/6] " Alexander Shishkin
@ 2015-08-24 14:32 ` Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 3/6] perf: Annotate some of the error codes with perf_err() Alexander Shishkin
` (5 subsequent siblings)
7 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
If kernel debugging is enabled, include additional information to extended
syscall error reports: file name and line number, to make error hunting even
easier.
And in really desperate times, this allows for such things (a variation on
Hugh Dickins' trick [1]):
#undef EINVAL
#define EINVAL perf_err(-22, "")
However, CONFIG_DEBUG_KERNEL might not be the right option for this as I
notice that, for example, debian enables it in its kernels.
[1] http://marc.info/?l=linux-kernel&m=141215166009542
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
include/linux/perf_event.h | 11 +++++++++++
kernel/events/core.c | 7 +++++++
2 files changed, 18 insertions(+)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e37939d21..f6dd3d6071 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -68,11 +68,21 @@ struct perf_guest_info_callbacks {
struct perf_err_site {
const char *message;
const char *owner;
+#ifdef CONFIG_DEBUG_KERNEL
+ const char *file;
+ const int line;
+#endif
const int code;
};
#ifdef CONFIG_PERF_EVENTS
+#ifdef CONFIG_DEBUG_KERNEL
+#define DEBUG_PERF_ERR .file = __FILE__, .line = __LINE__,
+#else
+#define DEBUG_PERF_ERR
+#endif
+
/*
* Place all occurrences of struct perf_err_site into a special section,
* so that we can find out their offsets, which we'll use to refer back
@@ -88,6 +98,7 @@ extern const struct perf_err_site __start___perf_err[], __stop___perf_err[];
.owner = PERF_MODNAME, \
.code = __builtin_constant_p((__c)) ? \
(__c) : -EINVAL, \
+ DEBUG_PERF_ERR \
}; \
&__err_site; \
})
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c6f43d1d5f..3ff28fc8bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -76,10 +76,17 @@ static void perf_error_report_site(struct perf_event_attr *attr,
buffer = kasprintf(GFP_KERNEL,
"{\n"
+#ifdef CONFIG_DEBUG_KERNEL
+ "\t\"file\": \"%s\",\n"
+ "\t\"line\": %d,\n"
+#endif
"\t\"code\": %d,\n"
"\t\"module\": \"%s\",\n"
"\t\"message\": \"%s\"\n"
"}\n",
+#ifdef CONFIG_DEBUG_KERNEL
+ site->file, site->line,
+#endif
site->code, site->owner, site->message
);
if (!buffer)
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* [PATCH v2 3/6] perf: Annotate some of the error codes with perf_err()
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 1/6] " Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 2/6] perf: Add file name and line number to perf extended error reports Alexander Shishkin
@ 2015-08-24 14:32 ` Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 4/6] perf/x86: " Alexander Shishkin
` (4 subsequent siblings)
7 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
This patch annotates a few semi-random error paths in perf core to
illustrate the extended error reporting facility. Most of them can
be triggered from perf tools.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
kernel/events/core.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3ff28fc8bd..7beab37ea6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3382,10 +3382,10 @@ find_lively_task_by_vpid(pid_t vpid)
rcu_read_unlock();
if (!task)
- return ERR_PTR(-ESRCH);
+ return PERF_ERR_PTR(-ESRCH, "task not found");
/* Reuse ptrace permission checks for now. */
- err = -EACCES;
+ err = perf_err(-EACCES, "insufficient permissions for tracing this task");
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto errout;
@@ -3413,7 +3413,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ return PERF_ERR_PTR(-EACCES,
+ "must be root to operate on a CPU event");
/*
* We could be clever and allow to attach a event to an
@@ -3421,7 +3422,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
* that's for later.
*/
if (!cpu_online(cpu))
- return ERR_PTR(-ENODEV);
+ return PERF_ERR_PTR(-ENODEV, "cpu is offline");
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -8134,15 +8135,16 @@ SYSCALL_DEFINE5(perf_event_open,
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ return perf_err_sync(&attr, -EACCES,
+ "kernel tracing forbidden for the unprivileged");
}
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
- return -EINVAL;
+ return perf_err_sync(&attr, -EINVAL, "sample_freq too high");
} else {
if (attr.sample_period & (1ULL << 63))
- return -EINVAL;
+ return perf_err_sync(&attr, -EINVAL, "sample_period too high");
}
/*
@@ -8152,14 +8154,14 @@ SYSCALL_DEFINE5(perf_event_open,
* cgroup.
*/
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
- return -EINVAL;
+ return perf_err_sync(&attr, -EINVAL, "pid and cpu need to be set in cgroup mode");
if (flags & PERF_FLAG_FD_CLOEXEC)
f_flags |= O_CLOEXEC;
event_fd = get_unused_fd_flags(f_flags);
if (event_fd < 0)
- return event_fd;
+ return perf_err_sync(&attr, event_fd, "can't obtain a file descriptor");
if (group_fd != -1) {
err = perf_fget_light(group_fd, &group);
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* [PATCH v2 4/6] perf/x86: Annotate some of the error codes with perf_err()
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
` (2 preceding siblings ...)
2015-08-24 14:32 ` [PATCH v2 3/6] perf: Annotate some of the error codes with perf_err() Alexander Shishkin
@ 2015-08-24 14:32 ` Alexander Shishkin
2015-08-24 14:32 ` [PATCH v2 5/6] perf/x86/intel/pt: Use extended error reporting in event initialization Alexander Shishkin
` (3 subsequent siblings)
7 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
This patch annotates a few x86-specific error paths with perf's extended
error reporting facility.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event.c | 8 ++++++--
arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 +-
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index f56cf074d0..b3b531beee 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -12,6 +12,8 @@
* For licencing details see kernel-base/COPYING
*/
+#define PERF_MODNAME "perf/x86"
+
#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
@@ -426,11 +428,13 @@ int x86_setup_perfctr(struct perf_event *event)
/* BTS is currently only allowed for user-mode. */
if (!attr->exclude_kernel)
- return -EOPNOTSUPP;
+ return perf_err(-EOPNOTSUPP,
+ "BTS sampling not allowed for kernel space");
/* disallow bts if conflicting events are present */
if (x86_add_exclusive(x86_lbr_exclusive_lbr))
- return -EBUSY;
+ return perf_err(-EBUSY,
+ "LBR conflicts with active events");
event->destroy = hw_perf_lbr_event_destroy;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index b2c9475b7f..222b259c5e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -607,7 +607,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
* no LBR on this PMU
*/
if (!x86_pmu.lbr_nr)
- return -EOPNOTSUPP;
+ return perf_err(-EOPNOTSUPP, "LBR is not supported by this cpu");
/*
* setup SW LBR filter
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* [PATCH v2 5/6] perf/x86/intel/pt: Use extended error reporting in event initialization
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
` (3 preceding siblings ...)
2015-08-24 14:32 ` [PATCH v2 4/6] perf/x86: " Alexander Shishkin
@ 2015-08-24 14:32 ` Alexander Shishkin
2015-08-24 14:33 ` [PATCH v2 6/6] perf/x86/intel/bts: " Alexander Shishkin
` (2 subsequent siblings)
7 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:32 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
Event attribute validation is one of the biggest sources of EINVALs around
perf_event_open() syscall. This patch annotates error checks with extended
error reporting macros to provide the user with more details on what they
are doing wrong.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel_pt.c | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 4216928344..9a09c89928 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -18,6 +18,8 @@
#undef DEBUG
+#define PERF_MODNAME "perf/x86/intel/pt"
+
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
@@ -198,29 +200,29 @@ fail:
RTIT_CTL_CYC_PSB | \
RTIT_CTL_MTC)
-static bool pt_event_valid(struct perf_event *event)
+static int pt_event_valid(struct perf_event *event)
{
u64 config = event->attr.config;
u64 allowed, requested;
if ((config & PT_CONFIG_MASK) != config)
- return false;
+ return perf_err(-EINVAL, "PT config: disallowed bits");
if (config & RTIT_CTL_CYC_PSB) {
if (!pt_cap_get(PT_CAP_psb_cyc))
- return false;
+ return perf_err(-EINVAL, "PT config: PSB/CYC not supported");
allowed = pt_cap_get(PT_CAP_psb_periods);
requested = (config & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET;
if (requested && (!(allowed & BIT(requested))))
- return false;
+ return perf_err(-EINVAL, "PT config: unsupported PSB period");
allowed = pt_cap_get(PT_CAP_cycle_thresholds);
requested = (config & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET;
if (requested && (!(allowed & BIT(requested))))
- return false;
+ return perf_err(-EINVAL, "PT config: unsupported CYC threshold");
}
if (config & RTIT_CTL_MTC) {
@@ -232,20 +234,20 @@ static bool pt_event_valid(struct perf_event *event)
* CPUID is 0 will #GP, so better safe than sorry.
*/
if (!pt_cap_get(PT_CAP_mtc))
- return false;
+ return perf_err(-EINVAL, "PT config: MTC not supported");
allowed = pt_cap_get(PT_CAP_mtc_periods);
if (!allowed)
- return false;
+ return perf_err(-EINVAL, "PT config: MTC periods not supported");
requested = (config & RTIT_CTL_MTC_RANGE) >>
RTIT_CTL_MTC_RANGE_OFFSET;
if (!(allowed & BIT(requested)))
- return false;
+ return perf_err(-EINVAL, "PT config: unsupported MTC period");
}
- return true;
+ return 0;
}
/*
@@ -1111,14 +1113,17 @@ static void pt_event_destroy(struct perf_event *event)
static int pt_event_init(struct perf_event *event)
{
+ int err;
+
if (event->attr.type != pt_pmu.pmu.type)
return -ENOENT;
- if (!pt_event_valid(event))
- return -EINVAL;
+ err = pt_event_valid(event);
+ if (err)
+ return err;
if (x86_add_exclusive(x86_lbr_exclusive_pt))
- return -EBUSY;
+ return perf_err(-EBUSY, "PT conflicts with active events");
event->destroy = pt_event_destroy;
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* [PATCH v2 6/6] perf/x86/intel/bts: Use extended error reporting in event initialization
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
` (4 preceding siblings ...)
2015-08-24 14:32 ` [PATCH v2 5/6] perf/x86/intel/pt: Use extended error reporting in event initialization Alexander Shishkin
@ 2015-08-24 14:33 ` Alexander Shishkin
2015-08-25 8:22 ` [PATCH v2 0/6] perf: Introduce extended syscall error reporting Ingo Molnar
2015-08-25 8:52 ` Johannes Berg
7 siblings, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-24 14:33 UTC (permalink / raw)
To: Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian, Johannes Berg, Alexander Shishkin
If intel_bts events would conflict with other events that are active in
the system, provide an extended error message to the user along with the
usual EBUSY.
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
---
arch/x86/kernel/cpu/perf_event_intel_bts.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 54690e8857..c6a8da8f5a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -16,6 +16,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define PERF_MODNAME "perf/x86/intel/bts"
+
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -492,7 +494,7 @@ static int bts_event_init(struct perf_event *event)
return -ENOENT;
if (x86_add_exclusive(x86_lbr_exclusive_bts))
- return -EBUSY;
+ return perf_err(-EBUSY, "BTS conflicts with active events");
ret = x86_reserve_hardware();
if (ret) {
--
2.5.0
^ permalink raw reply related [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
` (5 preceding siblings ...)
2015-08-24 14:33 ` [PATCH v2 6/6] perf/x86/intel/bts: " Alexander Shishkin
@ 2015-08-25 8:22 ` Ingo Molnar
2015-08-25 8:52 ` Johannes Berg
7 siblings, 0 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-08-25 8:22 UTC (permalink / raw)
To: Alexander Shishkin
Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, adrian.hunter,
Arnaldo Carvalho de Melo, Vince Weaver, Stephane Eranian,
Johannes Berg
* Alexander Shishkin <alexander.shishkin@linux.intel.com> wrote:
> Hi Peter and Ingo and everybody,
>
> Here's an update of my extended error reporting patchset, addressing
> review comments and adding a few more error messages to PT and BTS
> drivers.
>
> Changes since v1:
> * addressed Peter's comments,
> * added perf_err() annotation to intel_pt and intel_bts drivers;
> especially the former is rich in EINVALs in event validation path,
> so it should be immediately useful;
> * changed the error code to EINVAL for the case when perf_err() is
> supplied a non-constart error code;
> * dropped RFC, added sign-offs.
>
> This time around, I employed a linker trick to convert the structures
> containing extended error information into integers, which are then
> made to look just like normal error codes so that IS_ERR_VALUE() and
> friends would still work correctly on them. So no extra pointers in
> the struct perf_event or anywhere else; the extended error codes are
> passed around like normal error codes. They only need to be converted
> in syscalls' topmost return statements. This is done in 1/6.
Nice trick!
> Then, 2/6 illustrates how error sites can be extended to include more
> information such as file names and line numbers [1]. Side note, it can
> be dropped without impact on other patches.
>
> The other patches add perf_err() annotation to a few semi-randomly
> picked places in perf core (3/6), x86 bits (4/6), intel_pt (5/6) and
> intel_bts (6/6).
>
> [1] http://marc.info/?l=linux-kernel&m=141471816115946
>
> Alexander Shishkin (6):
> perf: Introduce extended syscall error reporting
> perf: Add file name and line number to perf extended error reports
> perf: Annotate some of the error codes with perf_err()
> perf/x86: Annotate some of the error codes with perf_err()
> perf/x86/intel/pt: Use extended error reporting in event
> initialization
> perf/x86/intel/bts: Use extended error reporting in event
> initialization
>
> arch/x86/kernel/cpu/perf_event.c | 8 +-
> arch/x86/kernel/cpu/perf_event_intel_bts.c | 4 +-
> arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2 +-
> arch/x86/kernel/cpu/perf_event_intel_pt.c | 29 +++++---
> include/linux/perf_event.h | 82 +++++++++++++++++++++
> include/uapi/linux/perf_event.h | 8 +-
> kernel/events/core.c | 113 +++++++++++++++++++++++++----
> 7 files changed, 215 insertions(+), 31 deletions(-)
Ok, this looks pretty good to me.
Mind adding support on the perf tooling side as well, so that it can be tested and
applied in a single set of patches?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-24 14:32 [PATCH v2 0/6] perf: Introduce extended syscall error reporting Alexander Shishkin
` (6 preceding siblings ...)
2015-08-25 8:22 ` [PATCH v2 0/6] perf: Introduce extended syscall error reporting Ingo Molnar
@ 2015-08-25 8:52 ` Johannes Berg
2015-08-25 9:02 ` Ingo Molnar
7 siblings, 1 reply; 34+ messages in thread
From: Johannes Berg @ 2015-08-25 8:52 UTC (permalink / raw)
To: Alexander Shishkin, Peter Zijlstra, Ingo Molnar
Cc: linux-kernel, adrian.hunter, Arnaldo Carvalho de Melo,
Vince Weaver, Stephane Eranian
On Mon, 2015-08-24 at 17:32 +0300, Alexander Shishkin wrote:
> This time around, I employed a linker trick to convert the structures
> containing extended error information into integers, which are then
> made to look just like normal error codes so that IS_ERR_VALUE() and
> friends would still work correctly on them. So no extra pointers in
> the struct perf_event or anywhere else; the extended error codes are
> passed around like normal error codes. They only need to be converted
> in syscalls' topmost return statements. This is done in 1/6.
>
For the record, as we discussed separately, I'd love to see this move
to more general infrastructure. In wireless (nl80211), for example, we
have a few hundred (!) callsites returning -EINVAL, mostly based on
malformed netlink attributes, and it can be very difficult to figure
out what went wrong; debugging mostly employs a variation of Hugh's
trick.
It would be absolutely necessary to support modules, however that could
get tricky: unless we pass a THIS_MODULE through in some way to
netlink_ack() (in the nl80211 case) we'd have to store the actual index
inside the err_site and assign it on module load somehow. Passing the
module pointer seems better, but has even more wrinkles like what to do
when the error came from a nested call (e.g. nla_parse()) that's in a
different modules ...
Unrelated to all that - maybe perf_errno_to_site() should have some
range checking?
johannes
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 8:52 ` Johannes Berg
@ 2015-08-25 9:02 ` Ingo Molnar
2015-08-25 9:17 ` Ingo Molnar
0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-08-25 9:02 UTC (permalink / raw)
To: Johannes Berg
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian
* Johannes Berg <johannes@sipsolutions.net> wrote:
> On Mon, 2015-08-24 at 17:32 +0300, Alexander Shishkin wrote:
>
> > This time around, I employed a linker trick to convert the structures
> > containing extended error information into integers, which are then
> > made to look just like normal error codes so that IS_ERR_VALUE() and
> > friends would still work correctly on them. So no extra pointers in
> > the struct perf_event or anywhere else; the extended error codes are
> > passed around like normal error codes. They only need to be converted
> > in syscalls' topmost return statements. This is done in 1/6.
>
> For the record, as we discussed separately, I'd love to see this move to more
> general infrastructure. In wireless (nl80211), for example, we have a few
> hundred (!) callsites returning -EINVAL, mostly based on malformed netlink
> attributes, and it can be very difficult to figure out what went wrong;
> debugging mostly employs a variation of Hugh's trick.
Absolutely, I suggested this as well earlier today, as the scheduler would like to
make use of it in syscalls with extensible ABIs, such as sched_setattr().
If people really like this then we could go farther as well and add a standalone
'extended errors system call' as well (SyS_errno_extended_get()), which would
allow the recovery of error strings even for system calls that are not easily
extensible. We could cache the last error description in the task struct.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 9:02 ` Ingo Molnar
@ 2015-08-25 9:17 ` Ingo Molnar
2015-08-25 9:34 ` Johannes Berg
2015-08-26 11:37 ` Alexander Shishkin
0 siblings, 2 replies; 34+ messages in thread
From: Ingo Molnar @ 2015-08-25 9:17 UTC (permalink / raw)
To: Johannes Berg
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Linus Torvalds, Andrew Morton, Thomas Gleixner,
Borislav Petkov, H. Peter Anvin
* Ingo Molnar <mingo@kernel.org> wrote:
>
> * Johannes Berg <johannes@sipsolutions.net> wrote:
>
> > On Mon, 2015-08-24 at 17:32 +0300, Alexander Shishkin wrote:
> >
> > > This time around, I employed a linker trick to convert the structures
> > > containing extended error information into integers, which are then made to
> > > look just like normal error codes so that IS_ERR_VALUE() and friends would
> > > still work correctly on them. So no extra pointers in the struct perf_event
> > > or anywhere else; the extended error codes are passed around like normal
> > > error codes. They only need to be converted in syscalls' topmost return
> > > statements. This is done in 1/6.
> >
> > For the record, as we discussed separately, I'd love to see this move to more
> > general infrastructure. In wireless (nl80211), for example, we have a few
> > hundred (!) callsites returning -EINVAL, mostly based on malformed netlink
> > attributes, and it can be very difficult to figure out what went wrong;
> > debugging mostly employs a variation of Hugh's trick.
>
> Absolutely, I suggested this as well earlier today, as the scheduler would like
> to make use of it in syscalls with extensible ABIs, such as sched_setattr().
>
> If people really like this then we could go farther as well and add a standalone
> 'extended errors system call' as well (SyS_errno_extended_get()), which would
> allow the recovery of error strings even for system calls that are not easily
> extensible. We could cache the last error description in the task struct.
If we do that then we don't even have to introduce per system call error code
conversion, but could unconditionally save the last extended error info in the
task struct and continue - this could be done very cheaply with the linker trick
driven integer ID.
I.e. system calls could opt in to do:
return err_str(-EBUSY, "perf/x86: BTS conflicts with active events");
and the overhead of this would be minimal, we'd essentially do something like this
to save the error:
current->err_code = code;
where 'code' is a build time constant in essence.
We could use this even in system calls where the error path is performance
critical, as all the string recovery and copying overhead would be triggered by
applications that opt in via the new system call:
struct err_desc {
const char *message;
const char *owner;
const int code;
};
SyS_err_get_desc(struct err_desc *err_desc __user);
[ Which could perhaps be a prctl() extension as well (PR_GET_ERR_DESC): finally
some truly matching functionality for prctl(). ]
Hm?
Thanks,
Ingo
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 9:17 ` Ingo Molnar
@ 2015-08-25 9:34 ` Johannes Berg
2015-08-25 10:07 ` Ingo Molnar
2015-08-26 11:37 ` Alexander Shishkin
1 sibling, 1 reply; 34+ messages in thread
From: Johannes Berg @ 2015-08-25 9:34 UTC (permalink / raw)
To: Ingo Molnar
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Linus Torvalds, Andrew Morton, Thomas Gleixner,
Borislav Petkov, H. Peter Anvin
On Tue, 2015-08-25 at 11:17 +0200, Ingo Molnar wrote:
>
> If we do that then we don't even have to introduce per system call error code
> conversion, but could unconditionally save the last extended error info in the
> task struct and continue - this could be done very cheaply with the linker trick
> driven integer ID.
>
> I.e. system calls could opt in to do:
>
> > return err_str(-EBUSY, "perf/x86: BTS conflicts with active events");
>
> and the overhead of this would be minimal, we'd essentially do something like this
> to save the error:
>
> > current->err_code = code;
>
> where 'code' is a build time constant in essence.
>
> We could use this even in system calls where the error path is performance
> critical, as all the string recovery and copying overhead would be triggered by
> applications that opt in via the new system call:
>
> > struct err_desc {
> > const char *message;
> > const char *owner;
> > const int code;
> > };
>
> > SyS_err_get_desc(struct err_desc *err_desc __user);
>
> [ Which could perhaps be a prctl() extension as well (PR_GET_ERR_DESC): finally
> some truly matching functionality for prctl(). ]
>
> Hm?
That's neat in a way, but doesn't work in general I think.
Considering the wifi case, or more generally any netlink based
protocol, the syscall (sendmsg) won't return an error, but a subsequent
recvmsg() (which also won't return an error) returns an error message
[in the sense of a protocol message, not a human readable message] to a
buffer provided by the application.
However, this message can be extended relatively easily to include the
string information, but the syscall/prctl wouldn't work since the
syscalls didn't actually fail.
However, it could possibly help with the namespace/module issue if you
also store THIS_MODULE (or perhaps instead a pointer to the module's
error table) in the task. Again not in the netlink case though, I
think, that will always require special handling [although there it
could be stored away in the socket or so, similar to the task]
johannes
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 9:34 ` Johannes Berg
@ 2015-08-25 10:07 ` Ingo Molnar
2015-08-25 10:19 ` Johannes Berg
0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-08-25 10:07 UTC (permalink / raw)
To: Johannes Berg
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Linus Torvalds, Andrew Morton, Thomas Gleixner,
Borislav Petkov, H. Peter Anvin
* Johannes Berg <johannes@sipsolutions.net> wrote:
> On Tue, 2015-08-25 at 11:17 +0200, Ingo Molnar wrote:
> >
> > If we do that then we don't even have to introduce per system call error code
> > conversion, but could unconditionally save the last extended error info in the
> > task struct and continue - this could be done very cheaply with the linker trick
> > driven integer ID.
> >
> > I.e. system calls could opt in to do:
> >
> > > return err_str(-EBUSY, "perf/x86: BTS conflicts with active events");
> >
> > and the overhead of this would be minimal, we'd essentially do something like this
> > to save the error:
> >
> > > current->err_code = code;
> >
> > where 'code' is a build time constant in essence.
> >
> > We could use this even in system calls where the error path is performance
> > critical, as all the string recovery and copying overhead would be triggered by
> > applications that opt in via the new system call:
> >
> > > struct err_desc {
> > > const char *message;
> > > const char *owner;
> > > const int code;
> > > };
> >
> > > SyS_err_get_desc(struct err_desc *err_desc __user);
> >
> > [ Which could perhaps be a prctl() extension as well (PR_GET_ERR_DESC): finally
> > some truly matching functionality for prctl(). ]
> >
> > Hm?
>
> That's neat in a way, but doesn't work in general I think.
Ok, I see the netlink problem - but it would work in the perf and scheduler cases,
except for the small wart that it's not signal safe by default. (Apps could either
save/restore it themselves in their signal handlers, via PR_SET_ERR_DESC, or we
could extend the signal frame with the code.)
Having a separate syscall has two (big!) appeals:
- we wouldn't have to touch existing system calls at all.
- extended error reporting would be available for any system call that opts to
use it. (The current scheme as submitted is only available to system calls
using the perf-style flexible attribute ABI.)
Regarding netlink:
> Considering the wifi case, or more generally any netlink based
> protocol, the syscall (sendmsg) won't return an error, but a subsequent
> recvmsg() (which also won't return an error) returns an error message
> [in the sense of a protocol message, not a human readable message] to a
> buffer provided by the application.
> However, this message can be extended relatively easily to include the
> string information, but the syscall/prctl wouldn't work since the
> syscalls didn't actually fail.
Ok. So assuming we can make a 1:1 mapping between the 'extended error code'
integer space and the message:owner strings, it would be enough for netlink to
pass along the integer code itself, not the full strings?
That would simplify things and make the scheme more robust from a security POV I
suspect.
> However, it could possibly help with the namespace/module issue if you
> also store THIS_MODULE (or perhaps instead a pointer to the module's
> error table) in the task. Again not in the netlink case though, I
> think, that will always require special handling [although there it
> could be stored away in the socket or so, similar to the task]
So my hope would be that we can represent this all with a single 'large' error
code integer space. That integer would be constant and translateable (as long as
the module is loaded).
That way the error passing mechanism wouldn't have to be specifically module-aware
- during build we generate the integer space, with all possible modules
considered.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 10:07 ` Ingo Molnar
@ 2015-08-25 10:19 ` Johannes Berg
2015-08-26 4:49 ` Ingo Molnar
0 siblings, 1 reply; 34+ messages in thread
From: Johannes Berg @ 2015-08-25 10:19 UTC (permalink / raw)
To: Ingo Molnar
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Linus Torvalds, Andrew Morton, Thomas Gleixner,
Borislav Petkov, H. Peter Anvin
On Tue, 2015-08-25 at 12:07 +0200, Ingo Molnar wrote:
> Having a separate syscall has two (big!) appeals:
>
> - we wouldn't have to touch existing system calls at all.
>
> - extended error reporting would be available for any system call that opts to
> use it. (The current scheme as submitted is only available to system calls
> using the perf-style flexible attribute ABI.)
Yeah, I agree this is nice. However, more generally, I think we need to
actually think more about the module problem then since while syscalls
can't be implemented in modules (I think) they can still end up calling
into modules.
Of course a first iteration could be exactly like what Alexander
posted.
The other issue with this is namespacing - can all syscalls, and
everything that eventually gets called, really use a single error code
namespace with its 3k limit? On the one hand I'm thinking "3k strings
are so big ... we don't want more", but on the other hand all kinds of
drivers etc. might start getting annotations?
> > Considering the wifi case, or more generally any netlink based
> > protocol, the syscall (sendmsg) won't return an error, but a subsequent
> > recvmsg() (which also won't return an error) returns an error message
> > [in the sense of a protocol message, not a human readable message] to a
> > buffer provided by the application.
> > However, this message can be extended relatively easily to include the
> > string information, but the syscall/prctl wouldn't work since the
> > syscalls didn't actually fail.
>
> Ok. So assuming we can make a 1:1 mapping between the 'extended error code'
> integer space and the message:owner strings, it would be enough for netlink to
> pass along the integer code itself, not the full strings?
Considering that this would likely have to be opt-in at the netlink
level (e.g. through a flag in the request message), perhaps. I'd say
it'd still be easier for the message to carry the intended error code
(e.g. -EINVAL) and the actual message in the ACK message [where
requested]. That way, applications that actually behave depending on
the error code can far more easily be extended.
> That would simplify things and make the scheme more robust from a security POV I
> suspect.
You could also argue the other way around, in that being able to look
up (from userspace) arbitrary extended error IDs, even those that
haven't ever been used, could be an information leak of sorts.
> So my hope would be that we can represent this all with a single 'large' error
> code integer space. That integer would be constant and translateable (as long as
> the module is loaded).
Ok, I wasn't really what I was assuming. As I said above, on the one
hand I agree, but on the other I'm looking at the reality of a few
hundred (!) -EINVAL callsites in net/wireless/nl80211.c alone, so
having an overall 3k limit seems somewhat low.
> That way the error passing mechanism wouldn't have to be specifically module-aware
> - during build we generate the integer space, with all possible modules
> considered.
>
That would be no improvement for me as I work heavily with (upstream)
modules that are compiled out-of-tree, so I'm not all inclined to spend
much time on it if that ends up being the solution ;)
There are, however, are other ways it seems: for example assigning each
module an offset and taking that into account if the code was built
modular. That needs a small allocator to put the module range somewhere
into the global range, of course:
#ifdef MODULE
#define mod_ext_err(errno, string)
struct ext_err { ... } _my_err __section(errors);
return mod_offset < 0 ?
errno :
mod_ext_err_offset + &_my_err - __start_errors;
#else
...
#endif
or something like that - that leaves the possibility of allocation
failures (e.g. due to error space fragmentation) which sets the offset
to -1. The only additional wrinkle would be that a lookup would have to
walk some kind of extents tree that says which module (table) to look
at.
johannes
^ permalink raw reply [flat|nested] 34+ messages in thread* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 10:19 ` Johannes Berg
@ 2015-08-26 4:49 ` Ingo Molnar
[not found] ` <CA+55aFw--OFczoY=v17+e2-Q3O0GXnMKRuwzpYpB2qKBpZo=fw@mail.gmail.com>
0 siblings, 1 reply; 34+ messages in thread
From: Ingo Molnar @ 2015-08-26 4:49 UTC (permalink / raw)
To: Johannes Berg
Cc: Alexander Shishkin, Peter Zijlstra, Ingo Molnar, linux-kernel,
adrian.hunter, Arnaldo Carvalho de Melo, Vince Weaver,
Stephane Eranian, Linus Torvalds, Andrew Morton, Thomas Gleixner,
Borislav Petkov, H. Peter Anvin
* Johannes Berg <johannes@sipsolutions.net> wrote:
> On Tue, 2015-08-25 at 12:07 +0200, Ingo Molnar wrote:
> > Having a separate syscall has two (big!) appeals:
> >
> > - we wouldn't have to touch existing system calls at all.
> >
> > - extended error reporting would be available for any system call that opts to
> > use it. (The current scheme as submitted is only available to system calls
> > using the perf-style flexible attribute ABI.)
>
> Yeah, I agree this is nice. However, more generally, I think we need to
> actually think more about the module problem then since while syscalls
> can't be implemented in modules (I think) they can still end up calling
> into modules.
>
> Of course a first iteration could be exactly like what Alexander
> posted.
>
> The other issue with this is namespacing - can all syscalls, and
> everything that eventually gets called, really use a single error code
> namespace with its 3k limit? [...]
No, the current MAX_ERRNO is probably not big enough if this scheme is successful,
and I don't see any reason why it wouldn't be successful: I think this feature
would be the biggest usability feature added to Linux system calls and to Linux
system tooling in the last 10 years or so.
> [...] On the one hand I'm thinking "3k strings are so big ... we don't want
> more", but on the other hand all kinds of drivers etc. might start getting
> annotations?
We could extend it with some arch work. The per arch work involves making sure
there's no valid kernel address at [-MAX_ERRNO...-1].
So I wouldn't worry about it too much, let's agree on a good ABI and let's just
start using it, and if we grow out of -4K we can extend things step by step.
> > Ok. So assuming we can make a 1:1 mapping between the 'extended error code'
> > integer space and the message:owner strings, it would be enough for netlink to
> > pass along the integer code itself, not the full strings?
>
> Considering that this would likely have to be opt-in at the netlink level (e.g.
> through a flag in the request message), perhaps. I'd say it'd still be easier
> for the message to carry the intended error code (e.g. -EINVAL) and the actual
> message in the ACK message [where requested]. That way, applications that
> actually behave depending on the error code can far more easily be extended.
Ok. I think we should include the extended error code as well, in case an app
wants to pass it to some more generic library.
> > That would simplify things and make the scheme more robust from a security POV
> > I suspect.
>
> You could also argue the other way around, in that being able to look up (from
> userspace) arbitrary extended error IDs, even those that haven't ever been used,
> could be an information leak of sorts.
The fact is that kernel<->tooling error reporting sucks big time here and today,
in large part due to errno limitations, and arguing that it's somehow helping
security is the Stockholm Syndrome at its best.
> > So my hope would be that we can represent this all with a single 'large' error
> > code integer space. That integer would be constant and translateable (as long
> > as the module is loaded).
>
> Ok, I wasn't really what I was assuming. As I said above, on the one
> hand I agree, but on the other I'm looking at the reality of a few
> hundred (!) -EINVAL callsites in net/wireless/nl80211.c alone, so
> having an overall 3k limit seems somewhat low.
Agreed - but it's not a hard limit really.
> > That way the error passing mechanism wouldn't have to be specifically
> > module-aware - during build we generate the integer space, with all possible
> > modules considered.
>
> That would be no improvement for me as I work heavily with (upstream) modules
> that are compiled out-of-tree, so I'm not all inclined to spend much time on it
> if that ends up being the solution ;)
Perhaps, as long as the number allocation is dynamic and non-ABI there's no reason
why this couldn't be added later on.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 34+ messages in thread
* Re: [PATCH v2 0/6] perf: Introduce extended syscall error reporting
2015-08-25 9:17 ` Ingo Molnar
2015-08-25 9:34 ` Johannes Berg
@ 2015-08-26 11:37 ` Alexander Shishkin
1 sibling, 0 replies; 34+ messages in thread
From: Alexander Shishkin @ 2015-08-26 11:37 UTC (permalink / raw)
To: Ingo Molnar, Johannes Berg
Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, adrian.hunter,
Arnaldo Carvalho de Melo, Vince Weaver, Stephane Eranian,
Linus Torvalds, Andrew Morton, Thomas Gleixner, Borislav Petkov,
H. Peter Anvin
Ingo Molnar <mingo@kernel.org> writes:
> * Ingo Molnar <mingo@kernel.org> wrote:
>
>>
>> * Johannes Berg <johannes@sipsolutions.net> wrote:
>>
>> > On Mon, 2015-08-24 at 17:32 +0300, Alexander Shishkin wrote:
>> >
>> > > This time around, I employed a linker trick to convert the structures
>> > > containing extended error information into integers, which are then made to
>> > > look just like normal error codes so that IS_ERR_VALUE() and friends would
>> > > still work correctly on them. So no extra pointers in the struct perf_event
>> > > or anywhere else; the extended error codes are passed around like normal
>> > > error codes. They only need to be converted in syscalls' topmost return
>> > > statements. This is done in 1/6.
>> >
>> > For the record, as we discussed separately, I'd love to see this move to more
>> > general infrastructure. In wireless (nl80211), for example, we have a few
>> > hundred (!) callsites returning -EINVAL, mostly based on malformed netlink
>> > attributes, and it can be very difficult to figure out what went wrong;
>> > debugging mostly employs a variation of Hugh's trick.
>>
>> Absolutely, I suggested this as well earlier today, as the scheduler would like
>> to make use of it in syscalls with extensible ABIs, such as sched_setattr().
>>
>> If people really like this then we could go farther as well and add a standalone
>> 'extended errors system call' as well (SyS_errno_extended_get()), which would
>> allow the recovery of error strings even for system calls that are not easily
>> extensible. We could cache the last error description in the task struct.
>
> If we do that then we don't even have to introduce per system call error code
> conversion, but could unconditionally save the last extended error info in the
> task struct and continue - this could be done very cheaply with the linker trick
> driven integer ID.
>
> I.e. system calls could opt in to do:
>
> return err_str(-EBUSY, "perf/x86: BTS conflicts with active events");
>
> and the overhead of this would be minimal, we'd essentially do something like this
> to save the error:
>
> current->err_code = code;
>
> where 'code' is a build time constant in essence.
I'd propose a mixed approach here: err_str() would still return an
integer in the [-EXT_ERRNO, -MAX_ERRNO] range which would index the
err_site struct and upon returning to userspace we'd do
current->err_code = code;
return ext_errno(code); /* the traditional errno */
Reason: the lifetime of this extended error code would be exactly the
same as that of the traditional error value so that we'd always return
the most recent error and wouldn't be prone to something overwriting the
error code under us.
The problem with code checking for different types of errors has two
sides to it:
* most of those error codes that are check for shouldn't really be
annotated at all and should rather remain like they are;
* with the ones that actually do need to be checked for, the checks
would change from "if (err == EINTR)" to "if (ext_errno(err) ==
EINTR)", which doesn't seem like a big deal (with ext_errno() being a
O(1) lookup).
Side note: we should also make sure that only the userspace-visible
errors ever get annotated like that to prevent the error message creep
(which would be even a bigger problem if we go ahead to store the
extended error code in task_struct right at the topmost return
statement). Perf example: pretty much all errors that happen around
event scheduling, including stuff that pmu callbacks return, needn't and
shouldn't be annotated at all.
> We could use this even in system calls where the error path is performance
> critical, as all the string recovery and copying overhead would be triggered by
> applications that opt in via the new system call:
>
> struct err_desc {
> const char *message;
> const char *owner;
> const int code;
> };
>
> SyS_err_get_desc(struct err_desc *err_desc __user);
>
> [ Which could perhaps be a prctl() extension as well (PR_GET_ERR_DESC): finally
> some truly matching functionality for prctl(). ]
>
> Hm?
I like this.
Regards,
--
Alex
^ permalink raw reply [flat|nested] 34+ messages in thread