Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH 01/11] uapi: General notification ring definitions [ver #7]
From: David Howells @ 2019-08-30 13:57 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156717343223.2204.15875738850129174524.stgit@warthog.procyon.org.uk>

Add UAPI definitions for the general notification ring, including the
following pieces:

 (1) struct watch_notification.

     This is the metadata header for each entry in the ring.  It includes a
     type and subtype that indicate the source of the message
     (eg. WATCH_TYPE_MOUNT_NOTIFY) and the kind of the message
     (eg. NOTIFY_MOUNT_NEW_MOUNT).

     The header also contains an information field that conveys the
     following information:

	- WATCH_INFO_LENGTH.  The size of the entry (entries are variable
          length).

	- WATCH_INFO_ID.  The watch ID specified when the watchpoint was
          set.

	- WATCH_INFO_TYPE_INFO.  (Sub)type-specific information.

	- WATCH_INFO_FLAG_*.  Flag bits overlain on the type-specific
          information.  For use by the type.

     All the information in the header can be used in filtering messages at
     the point of writing into the buffer.

 (2) struct watch_queue_buffer.

     This describes the layout of the ring.  Note that the first slots in
     the ring contain a special metadata entry that contains the ring
     pointers.  The producer in the kernel knows to skip this and it has a
     proper header (WATCH_TYPE_META, WATCH_META_SKIP_NOTIFICATION) that
     indicates the size so that the ring consumer can handle it the same as
     any other record and just skip it.

     Note that this means that ring entries can never be split over the end
     of the ring, so if an entry would need to be split, a skip record is
     inserted to wrap the ring first; this is also WATCH_TYPE_META,
     WATCH_META_SKIP_NOTIFICATION.

 (3) WATCH_INFO_NOTIFICATIONS_LOST.

     This is a flag that can be set in the metadata header by the kernel to
     indicate that at least one message was lost since it was last cleared
     by userspace.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---

 include/uapi/linux/watch_queue.h |   67 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 include/uapi/linux/watch_queue.h

diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
new file mode 100644
index 000000000000..70f575099968
--- /dev/null
+++ b/include/uapi/linux/watch_queue.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_WATCH_QUEUE_H
+#define _UAPI_LINUX_WATCH_QUEUE_H
+
+#include <linux/types.h>
+
+enum watch_notification_type {
+	WATCH_TYPE_META		= 0,	/* Special record */
+	WATCH_TYPE___NR		= 1
+};
+
+enum watch_meta_notification_subtype {
+	WATCH_META_SKIP_NOTIFICATION	= 0,	/* Just skip this record */
+	WATCH_META_REMOVAL_NOTIFICATION	= 1,	/* Watched object was removed */
+};
+
+#define WATCH_LENGTH_GRANULARITY sizeof(__u64)
+
+/*
+ * Notification record header.  This is aligned to 64-bits so that subclasses
+ * can contain __u64 fields.
+ */
+struct watch_notification {
+	__u32			type:24;	/* enum watch_notification_type */
+	__u32			subtype:8;	/* Type-specific subtype (filterable) */
+	__u32			info;
+#define WATCH_INFO_LENGTH	0x0000003f	/* Length of record / sizeof(watch_notification) */
+#define WATCH_INFO_LENGTH__SHIFT 0
+#define WATCH_INFO_ID		0x0000ff00	/* ID of watchpoint, if type-appropriate */
+#define WATCH_INFO_ID__SHIFT	8
+#define WATCH_INFO_TYPE_INFO	0xffff0000	/* Type-specific info */
+#define WATCH_INFO_TYPE_INFO__SHIFT 16
+#define WATCH_INFO_FLAG_0	0x00010000	/* Type-specific info, flag bit 0 */
+#define WATCH_INFO_FLAG_1	0x00020000	/* ... */
+#define WATCH_INFO_FLAG_2	0x00040000
+#define WATCH_INFO_FLAG_3	0x00080000
+#define WATCH_INFO_FLAG_4	0x00100000
+#define WATCH_INFO_FLAG_5	0x00200000
+#define WATCH_INFO_FLAG_6	0x00400000
+#define WATCH_INFO_FLAG_7	0x00800000
+} __attribute__((aligned(WATCH_LENGTH_GRANULARITY)));
+
+struct watch_queue_buffer {
+	union {
+		/* The first few entries are special, containing the
+		 * ring management variables.
+		 */
+		struct {
+			struct watch_notification watch; /* WATCH_TYPE_META */
+			__u32		head;		/* Ring head index */
+			__u32		tail;		/* Ring tail index */
+			__u32		mask;		/* Ring index mask */
+			__u32		__reserved;
+		} meta;
+		struct watch_notification slots[0];
+	};
+};
+
+/*
+ * The Metadata pseudo-notification message uses a flag bits in the information
+ * field to convey the fact that messages have been lost.  We can only use a
+ * single bit in this manner per word as some arches that support SMP
+ * (eg. parisc) have no kernel<->user atomic bit ops.
+ */
+#define WATCH_INFO_NOTIFICATIONS_LOST WATCH_INFO_FLAG_0
+
+#endif /* _UAPI_LINUX_WATCH_QUEUE_H */

^ permalink raw reply related

* [PATCH 00/11] Keyrings, Block and USB notifications [ver #7]
From: David Howells @ 2019-08-30 13:57 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner


Here's a set of patches to add a general notification queue concept and to
add sources of events for:

 (1) Key/keyring events, such as creating, linking and removal of keys.

 (2) General device events (single common queue) including:

     - Block layer events, such as device errors

     - USB subsystem events, such as device/bus attach/remove, device
       reset, device errors.

Tests for the key/keyring events can be found on the keyutils next branch:

	https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/keyutils.git/log/?h=next

Notifications are done automatically inside of the testing infrastructure
on every change to that every test makes to a key or keyring.

Manual pages can be found there also, including pages for watch_queue(7)
and the watch_devices(2) system call (these should be transferred to the
manpages package if taken upstream).

LSM hooks are included:

 (1) A set of hooks are provided that allow an LSM to rule on whether or
     not a watch may be set.  Each of these hooks takes a different
     "watched object" parameter, so they're not really shareable.  The LSM
     should use current's credentials.  [Wanted by SELinux & Smack]

 (2) A hook is provided to allow an LSM to rule on whether or not a
     particular message may be posted to a particular queue.  This is given
     the credentials from the event generator (which may be the system) and
     the watch setter.  [Wanted by Smack]

I've provided a preliminary attempt to provide SELinux and Smack with
implementations of some of these hooks.


Design decisions:

 (1) A misc chardev is used to create and open a ring buffer:

	fd = open("/dev/watch_queue", O_RDWR);

     which is then configured and mmap'd into userspace:

	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE);
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	buf = mmap(NULL, BUF_SIZE * page_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);

     The fd cannot be read or written (though there is a facility to use
     write to inject records for debugging) and userspace just pulls data
     directly out of the buffer.

 (2) The ring index pointers are stored inside the ring and are thus
     accessible to userspace.  Userspace should only update the tail
     pointer and never the head pointer or risk breaking the buffer.  The
     kernel checks that the pointers appear valid before trying to use
     them.  A 'skip' record is maintained around the pointers.

 (3) poll() can be used to wait for data to appear in the buffer.

 (4) Records in the buffer are binary, typed and have a length so that they
     can be of varying size.

     This means that multiple heterogeneous sources can share a common
     buffer.  Tags may be specified when a watchpoint is created to help
     distinguish the sources.

 (5) The queue is reusable as there are 16 million types available, of
     which I've used just a few, so there is scope for others to be used.

 (6) Records are filterable as types have up to 256 subtypes that can be
     individually filtered.  Other filtration is also available.

 (7) Each time the buffer is opened, a new buffer is created - this means
     that there's no interference between watchers.

 (8) When recording a notification, the kernel will not sleep, but will
     rather mark a queue as overrun if there's insufficient space, thereby
     avoiding userspace causing the kernel to hang.

 (9) The 'watchpoint' should be specific where possible, meaning that you
     specify the object that you want to watch.

(10) The buffer is created and then watchpoints are attached to it, using
     one of:

	keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01);
	watch_devices(fd, 0x02, 0);

     where in both cases, fd indicates the queue and the number after is a
     tag between 0 and 255.

(11) The watch must be removed if either the watch buffer is destroyed or
     the watched object is destroyed.


Things I want to avoid:

 (1) Introducing features that make the core VFS dependent on the network
     stack or networking namespaces (ie. usage of netlink).

 (2) Dumping all this stuff into dmesg and having a daemon that sits there
     parsing the output and distributing it as this then puts the
     responsibility for security into userspace and makes handling
     namespaces tricky.  Further, dmesg might not exist or might be
     inaccessible inside a container.

 (3) Letting users see events they shouldn't be able to see.


The patches can be found here also:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=notifications-core

Changes:

 ver #7:

 (*) Removed the 'watch' argument from the security_watch_key() and
     security_watch_devices() hooks as current_cred() can be used instead
     of watch->cred.

 ver #6:

 (*) Fix mmap bug in watch_queue driver.

 (*) Add an extended removal notification that can transmit an identifier
     to userspace (such as a key ID).

 (*) Don't produce a instantiation notification in mark_key_instantiated()
     but rather do it in the caller to prevent key updates from producing
     an instantiate notification as well as an update notification.

 (*) Set the right number of filters in the sample program.

 (*) Provide preliminary hook implementations for SELinux and Smack.

 ver #5:

 (*) Split the superblock watch and mount watch parts out into their own
     branch (notifications-mount) as they really need certain fsinfo()
     attributes.

 (*) Rearrange the watch notification UAPI header to push the length down
     to bits 0-5 and remove the lost-message bits.  The userspace's watch
     ID tag is moved to bits 8-15 and then the message type is allocated
     all of bits 16-31 for its own purposes.

     The lost-message bit is moved over to the header, rather than being
     placed in the next message to be generated and given its own word so
     it can be cleared with xchg(,0) for parisc.

 (*) The security_post_notification() hook is no longer called with the
     spinlock held and softirqs disabled - though the RCU readlock is still
     held.

 (*) Buffer pages are now accounted towards RLIMIT_MEMLOCK and CAP_IPC_LOCK
     will skip the overuse check.

 (*) The buffer is marked VM_DONTEXPAND.

 (*) Save the watch-setter's creds in struct watch and give that to the LSM
     hook for posting a message.

 ver #4:

 (*) Split the basic UAPI bits out into their own patch and then split the
     LSM hooks out into an intermediate patch.  Add LSM hooks for setting
     watches.

     Rename the *_notify() system calls to watch_*() for consistency.

 ver #3:

 (*) I've added a USB notification source and reformulated the block
     notification source so that there's now a common watch list, for which
     the system call is now device_notify().

     I've assigned a pair of unused ioctl numbers in the 'W' series to the
     ioctls added by this series.

     I've also added a description of the kernel API to the documentation.

 ver #2:

 (*) I've fixed various issues raised by Jann Horn and GregKH and moved to
     krefs for refcounting.  I've added some security features to try and
     give Casey Schaufler the LSM control he wants.

David
---
David Howells (11):
      uapi: General notification ring definitions
      security: Add hooks to rule on setting a watch
      security: Add a hook for the point of notification insertion
      General notification queue with user mmap()'able ring buffer
      keys: Add a notification facility
      Add a general, global device notification watch list
      block: Add block layer notifications
      usb: Add USB subsystem notifications
      Add sample notification program
      selinux: Implement the watch_key security hook
      smack: Implement the watch_key and post_notification hooks [untested]


 Documentation/ioctl/ioctl-number.rst        |    1 
 Documentation/security/keys/core.rst        |   58 ++
 Documentation/watch_queue.rst               |  460 ++++++++++++++
 arch/alpha/kernel/syscalls/syscall.tbl      |    1 
 arch/arm/tools/syscall.tbl                  |    1 
 arch/ia64/kernel/syscalls/syscall.tbl       |    1 
 arch/m68k/kernel/syscalls/syscall.tbl       |    1 
 arch/microblaze/kernel/syscalls/syscall.tbl |    1 
 arch/mips/kernel/syscalls/syscall_n32.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_n64.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_o32.tbl   |    1 
 arch/parisc/kernel/syscalls/syscall.tbl     |    1 
 arch/powerpc/kernel/syscalls/syscall.tbl    |    1 
 arch/s390/kernel/syscalls/syscall.tbl       |    1 
 arch/sh/kernel/syscalls/syscall.tbl         |    1 
 arch/sparc/kernel/syscalls/syscall.tbl      |    1 
 arch/x86/entry/syscalls/syscall_32.tbl      |    1 
 arch/x86/entry/syscalls/syscall_64.tbl      |    1 
 arch/xtensa/kernel/syscalls/syscall.tbl     |    1 
 block/Kconfig                               |    9 
 block/blk-core.c                            |   29 +
 drivers/base/Kconfig                        |    9 
 drivers/base/Makefile                       |    1 
 drivers/base/watch.c                        |   90 +++
 drivers/misc/Kconfig                        |   13 
 drivers/misc/Makefile                       |    1 
 drivers/misc/watch_queue.c                  |  893 +++++++++++++++++++++++++++
 drivers/usb/core/Kconfig                    |    9 
 drivers/usb/core/devio.c                    |   56 ++
 drivers/usb/core/hub.c                      |    4 
 include/linux/blkdev.h                      |   15 
 include/linux/device.h                      |    7 
 include/linux/key.h                         |    3 
 include/linux/lsm_audit.h                   |    1 
 include/linux/lsm_hooks.h                   |   38 +
 include/linux/security.h                    |   32 +
 include/linux/syscalls.h                    |    1 
 include/linux/usb.h                         |   18 +
 include/linux/watch_queue.h                 |   94 +++
 include/uapi/asm-generic/unistd.h           |    4 
 include/uapi/linux/keyctl.h                 |    2 
 include/uapi/linux/watch_queue.h            |  183 ++++++
 kernel/sys_ni.c                             |    1 
 samples/Kconfig                             |    6 
 samples/Makefile                            |    1 
 samples/watch_queue/Makefile                |    8 
 samples/watch_queue/watch_test.c            |  233 +++++++
 security/keys/Kconfig                       |    9 
 security/keys/compat.c                      |    3 
 security/keys/gc.c                          |    5 
 security/keys/internal.h                    |   30 +
 security/keys/key.c                         |   38 +
 security/keys/keyctl.c                      |   99 +++
 security/keys/keyring.c                     |   20 -
 security/keys/request_key.c                 |    4 
 security/security.c                         |   23 +
 security/selinux/hooks.c                    |   14 
 security/smack/smack_lsm.c                  |   82 ++
 58 files changed, 2593 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/watch_queue.rst
 create mode 100644 drivers/base/watch.c
 create mode 100644 drivers/misc/watch_queue.c
 create mode 100644 include/linux/watch_queue.h
 create mode 100644 include/uapi/linux/watch_queue.h
 create mode 100644 samples/watch_queue/Makefile
 create mode 100644 samples/watch_queue/watch_test.c

^ permalink raw reply

* Re: [PATCH v14 5/6] sched/core: uclamp: Update CPU's refcount on TG's clamp changes
From: Peter Zijlstra @ 2019-08-30  9:48 UTC (permalink / raw)
  To: Patrick Bellasi
  Cc: linux-kernel, linux-pm, linux-api, cgroups, Ingo Molnar,
	Tejun Heo, Rafael J . Wysocki, Vincent Guittot, Viresh Kumar,
	Paul Turner, Michal Koutny, Quentin Perret, Dietmar Eggemann,
	Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
	Steve Muckle, Suren Baghdasaryan, Alessio Balsini
In-Reply-To: <20190822132811.31294-6-patrick.bellasi@arm.com>

On Thu, Aug 22, 2019 at 02:28:10PM +0100, Patrick Bellasi wrote:

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 04fc161e4dbe..fc2dc86a2abe 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1043,6 +1043,57 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>  		uclamp_rq_dec_id(rq, p, clamp_id);
>  }
>  
> +static inline void
> +uclamp_update_active(struct task_struct *p, unsigned int clamp_id)
> +{
> +	struct rq_flags rf;
> +	struct rq *rq;
> +
> +	/*
> +	 * Lock the task and the rq where the task is (or was) queued.
> +	 *
> +	 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
> +	 * price to pay to safely serialize util_{min,max} updates with
> +	 * enqueues, dequeues and migration operations.
> +	 * This is the same locking schema used by __set_cpus_allowed_ptr().
> +	 */
> +	rq = task_rq_lock(p, &rf);

Since modifying cgroup parameters is priv only, this should be OK I
suppose. Priv can already DoS the system anyway.

> +	/*
> +	 * Setting the clamp bucket is serialized by task_rq_lock().
> +	 * If the task is not yet RUNNABLE and its task_struct is not
> +	 * affecting a valid clamp bucket, the next time it's enqueued,
> +	 * it will already see the updated clamp bucket value.
> +	 */
> +	if (!p->uclamp[clamp_id].active)
> +		goto done;
> +
> +	uclamp_rq_dec_id(rq, p, clamp_id);
> +	uclamp_rq_inc_id(rq, p, clamp_id);
> +
> +done:

I'm thinking that:

	if (p->uclamp[clamp_id].active) {
		uclamp_rq_dec_id(rq, p, clamp_id);
		uclamp_rq_inc_id(rq, p, clamp_id);
	}

was too obvious? ;-)

> +
> +	task_rq_unlock(rq, p, &rf);
> +}

^ permalink raw reply

* Re: [PATCH v14 1/6] sched/core: uclamp: Extend CPU's cgroup controller
From: Peter Zijlstra @ 2019-08-30  9:45 UTC (permalink / raw)
  To: Patrick Bellasi
  Cc: linux-kernel, linux-pm, linux-api, cgroups, Ingo Molnar,
	Tejun Heo, Rafael J . Wysocki, Vincent Guittot, Viresh Kumar,
	Paul Turner, Michal Koutny, Quentin Perret, Dietmar Eggemann,
	Morten Rasmussen, Juri Lelli, Todd Kjos, Joel Fernandes,
	Steve Muckle, Suren Baghdasaryan, Alessio Balsini
In-Reply-To: <20190822132811.31294-2-patrick.bellasi@arm.com>

On Thu, Aug 22, 2019 at 02:28:06PM +0100, Patrick Bellasi wrote:
> +#define _POW10(exp) ((unsigned int)1e##exp)
> +#define POW10(exp) _POW10(exp)

What is this magic? You're forcing a float literal into an integer.
Surely that deserves a comment!

> +struct uclamp_request {
> +#define UCLAMP_PERCENT_SHIFT	2
> +#define UCLAMP_PERCENT_SCALE	(100 * POW10(UCLAMP_PERCENT_SHIFT))
> +	s64 percent;
> +	u64 util;
> +	int ret;
> +};
> +
> +static inline struct uclamp_request
> +capacity_from_percent(char *buf)
> +{
> +	struct uclamp_request req = {
> +		.percent = UCLAMP_PERCENT_SCALE,
> +		.util = SCHED_CAPACITY_SCALE,
> +		.ret = 0,
> +	};
> +
> +	buf = strim(buf);
> +	if (strncmp("max", buf, 4)) {

That is either a bug, and you meant to write: strncmp(buf, "max", 3), or
it is not, and then you could've written: strcmp(buf, "max")

But as written it doesn't make sense.

> +		req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
> +					     &req.percent);
> +		if (req.ret)
> +			return req;
> +		if (req.percent > UCLAMP_PERCENT_SCALE) {
> +			req.ret = -ERANGE;
> +			return req;
> +		}
> +
> +		req.util = req.percent << SCHED_CAPACITY_SHIFT;
> +		req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
> +	}
> +
> +	return req;
> +}

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/3] capability: introduce CAP_BPF and CAP_TRACING
From: Alexei Starovoitov @ 2019-08-30  4:16 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Alexei Starovoitov, Andy Lutomirski, David S. Miller,
	Peter Zijlstra, Steven Rostedt, Network Development, bpf,
	Kernel Team, Linux API
In-Reply-To: <536636ad-0baf-31e9-85fe-2591b65068df@iogearbox.net>

On Thu, Aug 29, 2019 at 8:47 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 8/29/19 7:12 AM, Alexei Starovoitov wrote:
> [...]
> >
> > +/*
> > + * CAP_BPF allows the following BPF operations:
> > + * - Loading all types of BPF programs
> > + * - Creating all types of BPF maps except:
> > + *    - stackmap that needs CAP_TRACING
> > + *    - devmap that needs CAP_NET_ADMIN
> > + *    - cpumap that needs CAP_SYS_ADMIN
> > + * - Advanced verifier features
> > + *   - Indirect variable access
> > + *   - Bounded loops
> > + *   - BPF to BPF function calls
> > + *   - Scalar precision tracking
> > + *   - Larger complexity limits
> > + *   - Dead code elimination
> > + *   - And potentially other features
> > + * - Use of pointer-to-integer conversions in BPF programs
> > + * - Bypassing of speculation attack hardening measures
> > + * - Loading BPF Type Format (BTF) data
> > + * - Iterate system wide loaded programs, maps, BTF objects
> > + * - Retrieve xlated and JITed code of BPF programs
> > + * - Access maps and programs via id
> > + * - Use bpf_spin_lock() helper
>
> This is still very wide.

'still very wide' ? you make it sound like it's a bad thing.
Covering all of bpf with single CAP_BPF is #1 goal of this set.

> Consider following example: app has CAP_BPF +> CAP_NET_ADMIN. Why can't we in this case *only* allow loading networking
> related [plus generic] maps and programs? If it doesn't have CAP_TRACING,
> what would be a reason to allow loading it? Same vice versa. There are
> some misc program types like the infraread stuff, but they could continue
> to live under [CAP_BPF +] CAP_SYS_ADMIN as fallback. I think categorizing
> a specific list of prog and map types might be more clear than disallowing
> some helpers like below (e.g. why choice of bpf_probe_read() but not
> bpf_probe_write_user() etc).

It kinda makes sense:
cap_bpf+cap_net_admin allows networking progs.
cap_bpf+cap_tracing allows tracing progs.
But what to do with cg_sysctl, cg_device, lirc ?
They are clearly neither.
Invent yet another cap_foo for them?
or let them under cap_bpf alone?
If cap_bpf alone is enough then why bother with bpf+net_admin for networking?
It's not making anything cleaner. Only confuses users.

Also bpf_trace_printk() is using ftrace and can print arbitrary memory.
In that sense it's no different than bpf_probe_read.
Both should be under CAP_TRACING.
But bpf_trace_printk() is available to all progs.
Even to socket filters under cap_sys_admin today.
With this patch set I'm allowing bpf_trace_printk() under CAP_TRACING.
Same goes to bpf_probe_read.

High level description:
cap_bpf alone allows loading of all progs except when
later cap_net_admin or cap_tracing will _not_ be able to
filter out the helper at attach time that shouldn't be there.

Example of how this patch set works:
- to load and attach networking progs
both cap_bpf and cap_net_admin are necessary.
- to load and attach tracing progs
both cap_bpf and cap_tracing are necessary.

when networking prog is using bpf_trace_printk
then cap_tracing is needed too.
And it's checked at load time.
If we do what you're proposing:
"lets allow load of all networking with bpf+net_admin"
then this won't work for bpf_trace_printk.
Per helper function capability check is still needed.
And since it's needed I see no reason to limit
networking progs to bpf+net_admin at load time.
Load time is still cap_bpf only.
And helpers will be filtered out at attach by net_admin.

^ permalink raw reply

* Re: [PATCH 0/7] Rework random blocking
From: Andy Lutomirski @ 2019-08-30  2:01 UTC (permalink / raw)
  To: Theodore Y. Ts'o
  Cc: Andy Lutomirski, Theodore Tso, LKML, Linux API, Kees Cook,
	Jason A. Donenfeld
In-Reply-To: <20190830014906.GD10779@mit.edu>



> On Aug 29, 2019, at 6:49 PM, Theodore Y. Ts'o <tytso@mit.edu> wrote:
> 
>> On Thu, Aug 29, 2019 at 06:11:35PM -0700, Andy Lutomirski wrote:
>> This series also removes the blocking pool and makes /dev/random
>> work just like getentropy(..., 0) and makes GRND_RANDOM a no-op.  I
>> believe that Linux's blocking pool has outlived its usefulness.
>> Linux's CRNG generates output that is good enough to use even for
>> key generation.  The blocking pool is not stronger in any material
>> way, and keeping it around requires a lot of infrastructure of
>> dubious value.
> 
> It's too late for the 5.4 cycle for a change of this magnitude, and
> I'd just as soon let this wait until *after* the LTS kernel gets cut.
> The reason for this is because at the moment, there are some PCI
> compliance labs who believe that the "true randomness" of /dev/random
> is necessary for PCI compliance and so they mandate the use of
> /dev/random over /dev/urandom's "cryptographic randomness" for that
> reason.  A lot of things which are thought to be needed for PCI
> compliance that are about as useful as eye of newt and toe of frog,
> but nothing says that PCI compliance (and enterprise customer
> requirements :-) have to make sense.
> 
> It may be that what we might need to really support people (or stupid
> compliance labs) who have a fetish for "true randomness" to get a
> better interface for hardware random number generators than
> /dev/hwrng.  Specifically, one which allows for a more sane way of
> selecting which hardware random number generator to use if there are
> multiple available, and also one where we mix in some CRNG as a
> whitening step just case the hardware number generator is busted in
> some way.  (And to fix the issue that at the moment, if someone evil
> fakes up a USB device with the USB manufacturer and minor device
> number for a ChosKey device that generates a insecure sequence, it
> will still get blindly trusted by the kernel without any kind of
> authentication of said hardware device.)
> 
> That probably means we need to come up with a new interface than
> /dev/hwrng, or have some way of configuring /dev/random to use a
> hardware RNG device for those people who really care about "true
> randomness".  The current /dev/hwrng interface and how it is
> configured via sysfs is pretty baroque IMO.
> 
>                     

Hmm. Does this really need to be in the kernel?  ISTM it should be straightforward to write a little CUSE program that grabs bytes from RDSEED or RDRAND, TPM, ChaosKey (if enabled, with a usb slot selected!), and whatever other sources are requested and, configurable to satisfy whoever actually cares, mixes some or all with a FIPS-compliant, provably-indististinguishable-from-random, definitely not Dual-EC mixer, and spits out the result.  And filters it and checks all the sources for credibility, and generally does whatever the user actually needs.

And the really over-the-top auditors can symlink it to /dev/random.

Do the PCI folks actually care that it’s in the kernel?


As an aside, the first two patches could plausibly land before the rest of the series if that seems appropriate.

^ permalink raw reply

* Re: [PATCH 0/7] Rework random blocking
From: Theodore Y. Ts'o @ 2019-08-30  1:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Theodore Tso, LKML, Linux API, Kees Cook, Jason A. Donenfeld
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

On Thu, Aug 29, 2019 at 06:11:35PM -0700, Andy Lutomirski wrote:
> This series also removes the blocking pool and makes /dev/random
> work just like getentropy(..., 0) and makes GRND_RANDOM a no-op.  I
> believe that Linux's blocking pool has outlived its usefulness.
> Linux's CRNG generates output that is good enough to use even for
> key generation.  The blocking pool is not stronger in any material
> way, and keeping it around requires a lot of infrastructure of
> dubious value.

It's too late for the 5.4 cycle for a change of this magnitude, and
I'd just as soon let this wait until *after* the LTS kernel gets cut.
The reason for this is because at the moment, there are some PCI
compliance labs who believe that the "true randomness" of /dev/random
is necessary for PCI compliance and so they mandate the use of
/dev/random over /dev/urandom's "cryptographic randomness" for that
reason.  A lot of things which are thought to be needed for PCI
compliance that are about as useful as eye of newt and toe of frog,
but nothing says that PCI compliance (and enterprise customer
requirements :-) have to make sense.

It may be that what we might need to really support people (or stupid
compliance labs) who have a fetish for "true randomness" to get a
better interface for hardware random number generators than
/dev/hwrng.  Specifically, one which allows for a more sane way of
selecting which hardware random number generator to use if there are
multiple available, and also one where we mix in some CRNG as a
whitening step just case the hardware number generator is busted in
some way.  (And to fix the issue that at the moment, if someone evil
fakes up a USB device with the USB manufacturer and minor device
number for a ChosKey device that generates a insecure sequence, it
will still get blindly trusted by the kernel without any kind of
authentication of said hardware device.)

That probably means we need to come up with a new interface than
/dev/hwrng, or have some way of configuring /dev/random to use a
hardware RNG device for those people who really care about "true
randomness".  The current /dev/hwrng interface and how it is
configured via sysfs is pretty baroque IMO.

	      	  	     	       	  	 - Ted

^ permalink raw reply

* [PATCH 7/7] random: Remove kernel.random.read_wakeup_threshold
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

It has no effect any more, so remove it.  We can revert this if
there is some user code that expects to be able to set this sysctl.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 99fea5cc29a8..2a284f30cac4 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -369,12 +369,6 @@
 #define ENTROPY_SHIFT 3
 #define ENTROPY_BITS(r) ((r)->entropy_count >> ENTROPY_SHIFT)
 
-/*
- * The minimum number of bits of entropy before we wake up a read on
- * /dev/random.  Should be enough to do a significant reseed.
- */
-static int random_read_wakeup_bits = 64;
-
 /*
  * If the entropy count falls under this number of bits, then we
  * should wake up processes which are selecting or polling on write
@@ -1982,8 +1976,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 
 #include <linux/sysctl.h>
 
-static int min_read_thresh = 8, min_write_thresh;
-static int max_read_thresh = OUTPUT_POOL_WORDS * 32;
+static int min_write_thresh;
 static int max_write_thresh = INPUT_POOL_WORDS * 32;
 static int random_min_urandom_seed = 60;
 static char sysctl_bootid[16];
@@ -2058,15 +2051,6 @@ struct ctl_table random_table[] = {
 		.proc_handler	= proc_do_entropy,
 		.data		= &input_pool.entropy_count,
 	},
-	{
-		.procname	= "read_wakeup_threshold",
-		.data		= &random_read_wakeup_bits,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &min_read_thresh,
-		.extra2		= &max_read_thresh,
-	},
 	{
 		.procname	= "write_wakeup_threshold",
 		.data		= &random_write_wakeup_bits,
-- 
2.21.0

^ permalink raw reply related

* [PATCH 6/7] random: Delete code to pull data into pools
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

There is no pool that pulls, so it was just dead code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 4521138231ed..99fea5cc29a8 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -528,10 +528,8 @@ struct entropy_store {
 	const struct poolinfo *poolinfo;
 	__u32 *pool;
 	const char *name;
-	struct entropy_store *pull;
 
 	/* read-write data: */
-	unsigned long last_pulled;
 	spinlock_t lock;
 	unsigned short add_ptr;
 	unsigned short input_rotate;
@@ -1347,41 +1345,6 @@ EXPORT_SYMBOL_GPL(add_disk_randomness);
  *
  *********************************************************************/
 
-/*
- * This utility inline function is responsible for transferring entropy
- * from the primary pool to the secondary extraction pool. We make
- * sure we pull enough for a 'catastrophic reseed'.
- */
-static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes);
-static void xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
-{
-	if (!r->pull ||
-	    r->entropy_count >= (nbytes << (ENTROPY_SHIFT + 3)) ||
-	    r->entropy_count > r->poolinfo->poolfracbits)
-		return;
-
-	_xfer_secondary_pool(r, nbytes);
-}
-
-static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
-{
-	__u32	tmp[OUTPUT_POOL_WORDS];
-
-	int bytes = nbytes;
-
-	/* pull at least as much as a wakeup */
-	bytes = max_t(int, bytes, random_read_wakeup_bits / 8);
-	/* but never more than the buffer size */
-	bytes = min_t(int, bytes, sizeof(tmp));
-
-	trace_xfer_secondary_pool(r->name, bytes * 8, nbytes * 8,
-				  ENTROPY_BITS(r), ENTROPY_BITS(r->pull));
-	bytes = extract_entropy(r->pull, tmp, bytes,
-				random_read_wakeup_bits / 8, 0);
-	mix_pool_bytes(r, tmp, bytes);
-	credit_entropy_bits(r, bytes*8);
-}
-
 /*
  * This function decides how many bytes to actually take from the
  * given pool, and also debits the entropy count accordingly.
@@ -1545,7 +1508,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 			spin_unlock_irqrestore(&r->lock, flags);
 			trace_extract_entropy(r->name, EXTRACT_SIZE,
 					      ENTROPY_BITS(r), _RET_IP_);
-			xfer_secondary_pool(r, EXTRACT_SIZE);
 			extract_buf(r, tmp);
 			spin_lock_irqsave(&r->lock, flags);
 			memcpy(r->last_data, tmp, EXTRACT_SIZE);
@@ -1554,7 +1516,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 	}
 
 	trace_extract_entropy(r->name, nbytes, ENTROPY_BITS(r), _RET_IP_);
-	xfer_secondary_pool(r, nbytes);
 	nbytes = account(r, nbytes, min, reserved);
 
 	return _extract_entropy(r, buf, nbytes, fips_enabled);
@@ -1765,7 +1726,6 @@ static void __init init_std_data(struct entropy_store *r)
 	ktime_t now = ktime_get_real();
 	unsigned long rv;
 
-	r->last_pulled = jiffies;
 	mix_pool_bytes(r, &now, sizeof(now));
 	for (i = r->poolinfo->poolbytes; i > 0; i -= sizeof(rv)) {
 		if (!arch_get_random_seed_long(&rv) &&
-- 
2.21.0

^ permalink raw reply related

* [PATCH 5/7] random: Remove the blocking pool
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

There is no longer any interface to read data from the blocking
pool, so remove it.

This enables quite a bit of code deletion, much of which will be
done in subsequent patches.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 106 ------------------------------------------
 1 file changed, 106 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 29a158d9353c..4521138231ed 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -470,7 +470,6 @@ static const struct poolinfo {
 /*
  * Static global variables
  */
-static DECLARE_WAIT_QUEUE_HEAD(random_read_wait);
 static DECLARE_WAIT_QUEUE_HEAD(random_write_wait);
 static struct fasync_struct *fasync;
 
@@ -530,7 +529,6 @@ struct entropy_store {
 	__u32 *pool;
 	const char *name;
 	struct entropy_store *pull;
-	struct work_struct push_work;
 
 	/* read-write data: */
 	unsigned long last_pulled;
@@ -549,9 +547,7 @@ static ssize_t _extract_entropy(struct entropy_store *r, void *buf,
 				size_t nbytes, int fips);
 
 static void crng_reseed(struct crng_state *crng, struct entropy_store *r);
-static void push_to_pool(struct work_struct *work);
 static __u32 input_pool_data[INPUT_POOL_WORDS] __latent_entropy;
-static __u32 blocking_pool_data[OUTPUT_POOL_WORDS] __latent_entropy;
 
 static struct entropy_store input_pool = {
 	.poolinfo = &poolinfo_table[0],
@@ -560,16 +556,6 @@ static struct entropy_store input_pool = {
 	.pool = input_pool_data
 };
 
-static struct entropy_store blocking_pool = {
-	.poolinfo = &poolinfo_table[1],
-	.name = "blocking",
-	.pull = &input_pool,
-	.lock = __SPIN_LOCK_UNLOCKED(blocking_pool.lock),
-	.pool = blocking_pool_data,
-	.push_work = __WORK_INITIALIZER(blocking_pool.push_work,
-					push_to_pool),
-};
-
 static __u32 const twist_table[8] = {
 	0x00000000, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 	0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
@@ -765,15 +751,11 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 		entropy_count = 0;
 	} else if (entropy_count > pool_size)
 		entropy_count = pool_size;
-	if ((r == &blocking_pool) && !r->initialized &&
-	    (entropy_count >> ENTROPY_SHIFT) > 128)
-		has_initialized = 1;
 	if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
 		goto retry;
 
 	if (has_initialized) {
 		r->initialized = 1;
-		wake_up_interruptible(&random_read_wait);
 		kill_fasync(&fasync, SIGIO, POLL_IN);
 	}
 
@@ -782,7 +764,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 
 	if (r == &input_pool) {
 		int entropy_bits = entropy_count >> ENTROPY_SHIFT;
-		struct entropy_store *other = &blocking_pool;
 
 		if (crng_init < 2) {
 			if (entropy_bits < 128)
@@ -790,27 +771,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 			crng_reseed(&primary_crng, r);
 			entropy_bits = r->entropy_count >> ENTROPY_SHIFT;
 		}
-
-		/* initialize the blocking pool if necessary */
-		if (entropy_bits >= random_read_wakeup_bits &&
-		    !other->initialized) {
-			schedule_work(&other->push_work);
-			return;
-		}
-
-		/* should we wake readers? */
-		if (entropy_bits >= random_read_wakeup_bits &&
-		    wq_has_sleeper(&random_read_wait)) {
-			wake_up_interruptible(&random_read_wait);
-		}
-		/* If the input pool is getting full, and the blocking
-		 * pool has room, send some entropy to the blocking
-		 * pool.
-		 */
-		if (!work_pending(&other->push_work) &&
-		    (ENTROPY_BITS(r) > 6 * r->poolinfo->poolbytes) &&
-		    (ENTROPY_BITS(other) <= 6 * other->poolinfo->poolbytes))
-			schedule_work(&other->push_work);
 	}
 }
 
@@ -1422,22 +1382,6 @@ static void _xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
 	credit_entropy_bits(r, bytes*8);
 }
 
-/*
- * Used as a workqueue function so that when the input pool is getting
- * full, we can "spill over" some entropy to the output pools.  That
- * way the output pools can store some of the excess entropy instead
- * of letting it go to waste.
- */
-static void push_to_pool(struct work_struct *work)
-{
-	struct entropy_store *r = container_of(work, struct entropy_store,
-					      push_work);
-	BUG_ON(!r);
-	_xfer_secondary_pool(r, random_read_wakeup_bits/8);
-	trace_push_to_pool(r->name, r->entropy_count >> ENTROPY_SHIFT,
-			   r->pull->entropy_count >> ENTROPY_SHIFT);
-}
-
 /*
  * This function decides how many bytes to actually take from the
  * given pool, and also debits the entropy count accordingly.
@@ -1616,54 +1560,6 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf,
 	return _extract_entropy(r, buf, nbytes, fips_enabled);
 }
 
-/*
- * This function extracts randomness from the "entropy pool", and
- * returns it in a userspace buffer.
- */
-static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf,
-				    size_t nbytes)
-{
-	ssize_t ret = 0, i;
-	__u8 tmp[EXTRACT_SIZE];
-	int large_request = (nbytes > 256);
-
-	trace_extract_entropy_user(r->name, nbytes, ENTROPY_BITS(r), _RET_IP_);
-	if (!r->initialized && r->pull) {
-		xfer_secondary_pool(r, ENTROPY_BITS(r->pull)/8);
-		if (!r->initialized)
-			return 0;
-	}
-	xfer_secondary_pool(r, nbytes);
-	nbytes = account(r, nbytes, 0, 0);
-
-	while (nbytes) {
-		if (large_request && need_resched()) {
-			if (signal_pending(current)) {
-				if (ret == 0)
-					ret = -ERESTARTSYS;
-				break;
-			}
-			schedule();
-		}
-
-		extract_buf(r, tmp);
-		i = min_t(int, nbytes, EXTRACT_SIZE);
-		if (copy_to_user(buf, tmp, i)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		nbytes -= i;
-		buf += i;
-		ret += i;
-	}
-
-	/* Wipe data just returned from memory */
-	memzero_explicit(tmp, sizeof(tmp));
-
-	return ret;
-}
-
 #define warn_unseeded_randomness(previous) \
 	_warn_unseeded_randomness(__func__, (void *) _RET_IP_, (previous))
 
@@ -1893,7 +1789,6 @@ static void __init init_std_data(struct entropy_store *r)
 int __init rand_initialize(void)
 {
 	init_std_data(&input_pool);
-	init_std_data(&blocking_pool);
 	crng_initialize(&primary_crng);
 	crng_global_init_time = jiffies;
 	if (ratelimit_disable) {
@@ -2053,7 +1948,6 @@ static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 		input_pool.entropy_count = 0;
-		blocking_pool.entropy_count = 0;
 		return 0;
 	case RNDRESEEDCRNG:
 		if (!capable(CAP_SYS_ADMIN))
-- 
2.21.0

^ permalink raw reply related

* [PATCH 4/7] random: Make /dev/random be almost like /dev/urandom
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

This patch changes the read semantics of /dev/random to be the same
as /dev/urandom except that reads will block until the CRNG is
ready.

None of the cleanups that this enables have been done yet.  As a
result, this gives a warning about an unused function.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 55 +++++++++++--------------------------------
 1 file changed, 14 insertions(+), 41 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 1ad2c7eaf675..29a158d9353c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -354,7 +354,6 @@
 #define INPUT_POOL_WORDS	(1 << (INPUT_POOL_SHIFT-5))
 #define OUTPUT_POOL_SHIFT	10
 #define OUTPUT_POOL_WORDS	(1 << (OUTPUT_POOL_SHIFT-5))
-#define SEC_XFER_SIZE		512
 #define EXTRACT_SIZE		10
 
 
@@ -803,7 +802,6 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 		if (entropy_bits >= random_read_wakeup_bits &&
 		    wq_has_sleeper(&random_read_wait)) {
 			wake_up_interruptible(&random_read_wait);
-			kill_fasync(&fasync, SIGIO, POLL_IN);
 		}
 		/* If the input pool is getting full, and the blocking
 		 * pool has room, send some entropy to the blocking
@@ -1031,6 +1029,7 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r)
 		crng_init = 2;
 		process_random_ready_list();
 		wake_up_interruptible(&crng_init_wait);
+		kill_fasync(&fasync, SIGIO, POLL_IN);
 		pr_notice("random: crng init done\n");
 		if (unseeded_warning.missed) {
 			pr_notice("random: %d get_random_xx warning(s) missed "
@@ -1921,43 +1920,6 @@ void rand_initialize_disk(struct gendisk *disk)
 }
 #endif
 
-static ssize_t
-_random_read(int nonblock, char __user *buf, size_t nbytes)
-{
-	ssize_t n;
-
-	if (nbytes == 0)
-		return 0;
-
-	nbytes = min_t(size_t, nbytes, SEC_XFER_SIZE);
-	while (1) {
-		n = extract_entropy_user(&blocking_pool, buf, nbytes);
-		if (n < 0)
-			return n;
-		trace_random_read(n*8, (nbytes-n)*8,
-				  ENTROPY_BITS(&blocking_pool),
-				  ENTROPY_BITS(&input_pool));
-		if (n > 0)
-			return n;
-
-		/* Pool is (near) empty.  Maybe wait and retry. */
-		if (nonblock)
-			return -EAGAIN;
-
-		wait_event_interruptible(random_read_wait,
-		    blocking_pool.initialized &&
-		    (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits));
-		if (signal_pending(current))
-			return -ERESTARTSYS;
-	}
-}
-
-static ssize_t
-random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
-{
-	return _random_read(file->f_flags & O_NONBLOCK, buf, nbytes);
-}
-
 static ssize_t
 urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
@@ -1981,15 +1943,26 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 	return ret;
 }
 
+static ssize_t
+random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	int ret;
+
+	ret = wait_for_random_bytes();
+	if (ret != 0)
+		return ret;
+	return urandom_read(file, buf, nbytes, ppos);
+}
+
 static __poll_t
 random_poll(struct file *file, poll_table * wait)
 {
 	__poll_t mask;
 
-	poll_wait(file, &random_read_wait, wait);
+	poll_wait(file, &crng_init_wait, wait);
 	poll_wait(file, &random_write_wait, wait);
 	mask = 0;
-	if (ENTROPY_BITS(&input_pool) >= random_read_wakeup_bits)
+	if (crng_ready())
 		mask |= EPOLLIN | EPOLLRDNORM;
 	if (ENTROPY_BITS(&input_pool) < random_write_wakeup_bits)
 		mask |= EPOLLOUT | EPOLLWRNORM;
-- 
2.21.0

^ permalink raw reply related

* [PATCH 3/7] random: Ignore GRND_RANDOM in getentropy(2)
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

The separate blocking pool is going away.  Start by ignoring
GRND_RANDOM in getentropy(2).

This should not materially break any API.  Any code that worked
without this change should work at least as well with this change.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c       | 3 ---
 include/uapi/linux/random.h | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index acabb870f222..1ad2c7eaf675 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2135,9 +2135,6 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	if (count > INT_MAX)
 		count = INT_MAX;
 
-	if (flags & GRND_RANDOM)
-		return _random_read(flags & GRND_NONBLOCK, buf, count);
-
 	if (!(flags & GRND_INSECURE) && !crng_ready()) {
 		if (flags & GRND_NONBLOCK)
 			return -EAGAIN;
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index c092d20088d3..dcc1b3e6106f 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -48,7 +48,7 @@ struct rand_pool_info {
  * Flags for getrandom(2)
  *
  * GRND_NONBLOCK	Don't block and return EAGAIN instead
- * GRND_RANDOM		Use the /dev/random pool instead of /dev/urandom
+ * GRND_RANDOM		No effect
  * GRND_INSECURE	Return non-cryptographic random bytes
  */
 #define GRND_NONBLOCK	0x0001
-- 
2.21.0

^ permalink raw reply related

* [PATCH 2/7] random: Add GRND_INSECURE to return best-effort non-cryptographic bytes
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c       | 11 +++++++++--
 include/uapi/linux/random.h |  2 ++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index d152612e08fc..acabb870f222 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -2122,7 +2122,14 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 {
 	int ret;
 
-	if (flags & ~(GRND_NONBLOCK|GRND_RANDOM))
+	if (flags & ~(GRND_NONBLOCK|GRND_RANDOM|GRND_INSECURE))
+		return -EINVAL;
+
+	/*
+	 * Requesting insecure and blocking randomness at the same time makes
+	 * no sense.
+	 */
+	if ((flags & (GRND_INSECURE|GRND_RANDOM)) == (GRND_INSECURE|GRND_RANDOM))
 		return -EINVAL;
 
 	if (count > INT_MAX)
@@ -2131,7 +2138,7 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
 	if (flags & GRND_RANDOM)
 		return _random_read(flags & GRND_NONBLOCK, buf, count);
 
-	if (!crng_ready()) {
+	if (!(flags & GRND_INSECURE) && !crng_ready()) {
 		if (flags & GRND_NONBLOCK)
 			return -EAGAIN;
 		ret = wait_for_random_bytes();
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index 26ee91300e3e..c092d20088d3 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -49,8 +49,10 @@ struct rand_pool_info {
  *
  * GRND_NONBLOCK	Don't block and return EAGAIN instead
  * GRND_RANDOM		Use the /dev/random pool instead of /dev/urandom
+ * GRND_INSECURE	Return non-cryptographic random bytes
  */
 #define GRND_NONBLOCK	0x0001
 #define GRND_RANDOM	0x0002
+#define GRND_INSECURE	0x0004
 
 #endif /* _UAPI_LINUX_RANDOM_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH 1/7] random: Don't wake crng_init_wait when crng_init == 1
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski
In-Reply-To: <cover.1567126741.git.luto@kernel.org>

crng_init_wait is only used to wayt for crng_init to be set to 2, so
there's no point to waking it when crng_init is set to 1.  Remove the
unnecessary wake_up_interruptible() call.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 drivers/char/random.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 5d5ea4ce1442..d152612e08fc 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -946,7 +946,6 @@ static int crng_fast_load(const char *cp, size_t len)
 	if (crng_init_cnt >= CRNG_INIT_CNT_THRESH) {
 		invalidate_batched_entropy();
 		crng_init = 1;
-		wake_up_interruptible(&crng_init_wait);
 		pr_notice("random: fast init done\n");
 	}
 	return 1;
-- 
2.21.0

^ permalink raw reply related

* [PATCH 0/7] Rework random blocking
From: Andy Lutomirski @ 2019-08-30  1:11 UTC (permalink / raw)
  To: Theodore Tso
  Cc: LKML, Linux API, Kees Cook, Jason A. Donenfeld, Andy Lutomirski

This makes two major semantic changes to Linux's random APIs:

It adds getentropy(..., GRND_INSECURE).  This causes getentropy to
always return *something*.  There is no guarantee whatsoever that
the result will be cryptographically random or even unique, but the
kernel will give the best quality random output it can.  The name is
a big hint: the resulting output is INSECURE.

The purpose of this is to allow programs that genuinely want
best-effort entropy to get it without resorting to /dev/urandom.
Plenty of programs do this because they need to do *something*
during boot and they can't afford to wait.  Calling it "INSECURE" is
probably the best we can do to discourage using this API for things
that need security.

This series also removes the blocking pool and makes /dev/random
work just like getentropy(..., 0) and makes GRND_RANDOM a no-op.  I
believe that Linux's blocking pool has outlived its usefulness.
Linux's CRNG generates output that is good enough to use even for
key generation.  The blocking pool is not stronger in any material
way, and keeping it around requires a lot of infrastructure of
dubious value.

This series should not break any existing programs.  /dev/urandom is
unchanged.  /dev/random will still block just after booting, but it
will block less than it used to.  getentropy() with existing flags
will return output that is, for practical purposes, just as strong
as before.

Andy Lutomirski (7):
  random: Don't wake crng_init_wait when crng_init == 1
  random: Add GRND_INSECURE to return best-effort non-cryptographic
    bytes
  random: Ignore GRND_RANDOM in getentropy(2)
  random: Make /dev/random be almost like /dev/urandom
  random: Remove the blocking pool
  random: Delete code to pull data into pools
  random: Remove kernel.random.read_wakeup_threshold

 drivers/char/random.c       | 234 ++++--------------------------------
 include/uapi/linux/random.h |   4 +-
 2 files changed, 27 insertions(+), 211 deletions(-)

-- 
2.21.0

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/3] capability: introduce CAP_BPF and CAP_TRACING
From: Alexei Starovoitov @ 2019-08-29 21:10 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Toke Høiland-Jørgensen, Alexei Starovoitov, luto, davem,
	peterz, rostedt, netdev, bpf, kernel-team, linux-api
In-Reply-To: <20190829222530.3c6163ac@carbon>

On Thu, Aug 29, 2019 at 10:25:30PM +0200, Jesper Dangaard Brouer wrote:
> On Thu, 29 Aug 2019 20:05:49 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> 
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> > 
> > > On Thu, Aug 29, 2019 at 09:44:18AM +0200, Toke Høiland-Jørgensen wrote:  
> > >> Alexei Starovoitov <ast@kernel.org> writes:
> > >>   
> > >> > CAP_BPF allows the following BPF operations:
> > >> > - Loading all types of BPF programs
> > >> > - Creating all types of BPF maps except:
> > >> >    - stackmap that needs CAP_TRACING
> > >> >    - devmap that needs CAP_NET_ADMIN
> > >> >    - cpumap that needs CAP_SYS_ADMIN  
> > >> 
> > >> Why CAP_SYS_ADMIN instead of CAP_NET_ADMIN for cpumap?  
> > >
> > > Currently it's cap_sys_admin and I think it should stay this way
> > > because it creates kthreads.  
> > 
> > Ah, right. I can sorta see that makes sense because of the kthreads, but
> > it also means that you can use all of XDP *except* cpumap with
> > CAP_NET_ADMIN+CAP_BPF. That is bound to create confusion, isn't it?
>  
> Hmm... I see 'cpumap' primarily as a network stack feature.  It is about
> starting the network stack on a specific CPU, allocating and building
> SKBs on that remote CPU.  It can only be used together with XDP_REDIRECT.
> I would prefer CAP_NET_ADMIN like the devmap, to keep the XDP
> capabilities consistent.

I don't mind relaxing cpumap to cap_net_admin.
Looking at the reaction to the rest of the set. I'd rather discuss it
and do it later after basic cap_bpf is in.

^ permalink raw reply

* Re: [PATCH v2 bpf-next 1/3] capability: introduce CAP_BPF and CAP_TRACING
From: Jesper Dangaard Brouer @ 2019-08-29 20:25 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: Alexei Starovoitov, Alexei Starovoitov, luto, davem, peterz,
	rostedt, netdev, bpf, kernel-team, linux-api, brouer
In-Reply-To: <87imqfhmo2.fsf@toke.dk>

On Thu, 29 Aug 2019 20:05:49 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> 
> > On Thu, Aug 29, 2019 at 09:44:18AM +0200, Toke Høiland-Jørgensen wrote:  
> >> Alexei Starovoitov <ast@kernel.org> writes:
> >>   
> >> > CAP_BPF allows the following BPF operations:
> >> > - Loading all types of BPF programs
> >> > - Creating all types of BPF maps except:
> >> >    - stackmap that needs CAP_TRACING
> >> >    - devmap that needs CAP_NET_ADMIN
> >> >    - cpumap that needs CAP_SYS_ADMIN  
> >> 
> >> Why CAP_SYS_ADMIN instead of CAP_NET_ADMIN for cpumap?  
> >
> > Currently it's cap_sys_admin and I think it should stay this way
> > because it creates kthreads.  
> 
> Ah, right. I can sorta see that makes sense because of the kthreads, but
> it also means that you can use all of XDP *except* cpumap with
> CAP_NET_ADMIN+CAP_BPF. That is bound to create confusion, isn't it?
 
Hmm... I see 'cpumap' primarily as a network stack feature.  It is about
starting the network stack on a specific CPU, allocating and building
SKBs on that remote CPU.  It can only be used together with XDP_REDIRECT.
I would prefer CAP_NET_ADMIN like the devmap, to keep the XDP
capabilities consistent.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH 10/11] selinux: Implement the watch_key security hook [ver #6]
From: David Howells @ 2019-08-29 19:11 UTC (permalink / raw)
  To: Stephen Smalley
  Cc: dhowells, viro, Casey Schaufler, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-security-module, linux-fsdevel, linux-api, linux-block,
	linux-kernel
In-Reply-To: <03eb0974-3996-f356-5fbe-17cf598b0e31@tycho.nsa.gov>

Stephen Smalley <sds@tycho.nsa.gov> wrote:

> Can watch->cred ever differ from current's cred here?  If not, why can't we
> just use current_sid() here

Um.  Not currently.  I'm not sure whether its ever likely to be otherwise.
Probably we could just use that and fix it up later if we do find otherwise.

> and why do we need the watch object at all?

It carries more than just the creds for the caller of keyctl_watch_key(), it
also carries information about the queue to which notifications will be
written, including the creds that were active when that was set up.

Note that there's no requirement that the process that opened /dev/watch_queue
be the one that sets the watch.  In the keyutils testsuite, I 'leak' a file
descriptor from the session wrangler into the program that it runs so that
tests running inside the test script can add watches to it.

David

^ permalink raw reply

* Re: [PATCH 10/11] selinux: Implement the watch_key security hook [ver #6]
From: Stephen Smalley @ 2019-08-29 18:44 UTC (permalink / raw)
  To: David Howells, viro
  Cc: Casey Schaufler, Greg Kroah-Hartman, nicolas.dichtel, raven,
	Christian Brauner, keyrings, linux-usb, linux-security-module,
	linux-fsdevel, linux-api, linux-block, linux-kernel
In-Reply-To: <156710348066.10009.17986469867635955040.stgit@warthog.procyon.org.uk>

On 8/29/19 2:31 PM, David Howells wrote:
> Implement the watch_key security hook to make sure that a key grants the
> caller View permission in order to set a watch on a key.
> 
> For the moment, the watch_devices security hook is left unimplemented as
> it's not obvious what the object should be since the queue is global and
> didn't previously exist.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>   security/selinux/hooks.c |   17 +++++++++++++++++
>   1 file changed, 17 insertions(+)
> 
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 74dd46de01b6..371f2ebc879b 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -6533,6 +6533,20 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
>   	*_buffer = context;
>   	return rc;
>   }
> +
> +#ifdef CONFIG_KEY_NOTIFICATIONS
> +static int selinux_watch_key(struct watch *watch, struct key *key)
> +{
> +	struct key_security_struct *ksec;
> +	u32 sid;
> +
> +	sid = cred_sid(watch->cred);

Can watch->cred ever differ from current's cred here?  If not, why can't 
we just use current_sid() here and why do we need the watch object at all?

> +	ksec = key->security;
> +
> +	return avc_has_perm(&selinux_state,
> +			    sid, ksec->sid, SECCLASS_KEY, KEY_NEED_VIEW, NULL);
> +}
> +#endif
>   #endif
>   
>   #ifdef CONFIG_SECURITY_INFINIBAND
> @@ -6965,6 +6979,9 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>   	LSM_HOOK_INIT(key_free, selinux_key_free),
>   	LSM_HOOK_INIT(key_permission, selinux_key_permission),
>   	LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
> +#ifdef CONFIG_KEY_NOTIFICATIONS
> +	LSM_HOOK_INIT(watch_key, selinux_watch_key),
> +#endif
>   #endif
>   
>   #ifdef CONFIG_AUDIT
> 

^ permalink raw reply

* [PATCH 11/11] smack: Implement the watch_key and post_notification hooks [untested] [ver #6]
From: David Howells @ 2019-08-29 18:31 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

Implement the watch_key security hook in Smack to make sure that a key
grants the caller Read permission in order to set a watch on a key.

Also implement the post_notification security hook to make sure that the
notification source is granted Write permission by the watch queue.

For the moment, the watch_devices security hook is left unimplemented as
it's not obvious what the object should be since the queue is global and
didn't previously exist.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 include/linux/lsm_audit.h  |    1 +
 security/smack/smack_lsm.c |   81 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 915330abf6e5..734d67889826 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -74,6 +74,7 @@ struct common_audit_data {
 #define LSM_AUDIT_DATA_FILE	12
 #define LSM_AUDIT_DATA_IBPKEY	13
 #define LSM_AUDIT_DATA_IBENDPORT 14
+#define LSM_AUDIT_DATA_NOTIFICATION 15
 	union 	{
 		struct path path;
 		struct dentry *dentry;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 4c5e5a438f8b..ef3fb70649f0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4320,8 +4320,82 @@ static int smack_key_getsecurity(struct key *key, char **_buffer)
 	return length;
 }
 
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+/**
+ * smack_watch_key - Smack access to watch a key for notifications.
+ * @watch: The watch to be set
+ * @key: The key to be watched
+ *
+ * Return 0 if the @watch->cred has permission to read from the key object and
+ * an error otherwise.
+ */
+static int smack_watch_key(struct watch *watch, struct key *key)
+{
+	struct smk_audit_info ad;
+	struct smack_known *tkp = smk_of_task(smack_cred(watch->cred));
+	int rc;
+
+	if (key == NULL)
+		return -EINVAL;
+	/*
+	 * If the key hasn't been initialized give it access so that
+	 * it may do so.
+	 */
+	if (key->security == NULL)
+		return 0;
+	/*
+	 * This should not occur
+	 */
+	if (tkp == NULL)
+		return -EACCES;
+
+	if (smack_privileged_cred(CAP_MAC_OVERRIDE, watch->cred))
+		return 0;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+	ad.a.u.key_struct.key = key->serial;
+	ad.a.u.key_struct.key_desc = key->description;
+#endif
+	rc = smk_access(tkp, key->security, MAY_READ, &ad);
+	rc = smk_bu_note("key watch", tkp, key->security, MAY_READ, rc);
+	return rc;
+}
+#endif /* CONFIG_KEY_NOTIFICATIONS */
 #endif /* CONFIG_KEYS */
 
+#ifdef CONFIG_WATCH_QUEUE
+/**
+ * smack_post_notification - Smack access to post a notification to a queue
+ * @w_cred: The credentials of the watcher.
+ * @cred: The credentials of the event source (may be NULL).
+ * @n: The notification message to be posted.
+ */
+static int smack_post_notification(const struct cred *w_cred,
+				   const struct cred *cred,
+				   struct watch_notification *n)
+{
+	struct smk_audit_info ad;
+	struct smack_known *subj, *obj;
+	int rc;
+
+	/* Always let maintenance notifications through. */
+	if (n->type == WATCH_TYPE_META)
+		return 0;
+
+	if (!cred)
+		return 0;
+	subj = smk_of_task(smack_cred(cred));
+	obj = smk_of_task(smack_cred(w_cred));
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION);
+	rc = smk_access(subj, obj, MAY_WRITE, &ad);
+	rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc);
+	return rc;
+}
+#endif /* CONFIG_WATCH_QUEUE */
+
 /*
  * Smack Audit hooks
  *
@@ -4710,8 +4784,15 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(key_free, smack_key_free),
 	LSM_HOOK_INIT(key_permission, smack_key_permission),
 	LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	LSM_HOOK_INIT(watch_key, smack_watch_key),
+#endif
 #endif /* CONFIG_KEYS */
 
+#ifdef CONFIG_WATCH_QUEUE
+	LSM_HOOK_INIT(post_notification, smack_post_notification),
+#endif
+
  /* Audit hooks */
 #ifdef CONFIG_AUDIT
 	LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),

^ permalink raw reply related

* [PATCH 10/11] selinux: Implement the watch_key security hook [ver #6]
From: David Howells @ 2019-08-29 18:31 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

Implement the watch_key security hook to make sure that a key grants the
caller View permission in order to set a watch on a key.

For the moment, the watch_devices security hook is left unimplemented as
it's not obvious what the object should be since the queue is global and
didn't previously exist.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 security/selinux/hooks.c |   17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 74dd46de01b6..371f2ebc879b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6533,6 +6533,20 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 	*_buffer = context;
 	return rc;
 }
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+static int selinux_watch_key(struct watch *watch, struct key *key)
+{
+	struct key_security_struct *ksec;
+	u32 sid;
+
+	sid = cred_sid(watch->cred);
+	ksec = key->security;
+
+	return avc_has_perm(&selinux_state,
+			    sid, ksec->sid, SECCLASS_KEY, KEY_NEED_VIEW, NULL);
+}
+#endif
 #endif
 
 #ifdef CONFIG_SECURITY_INFINIBAND
@@ -6965,6 +6979,9 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(key_free, selinux_key_free),
 	LSM_HOOK_INIT(key_permission, selinux_key_permission),
 	LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	LSM_HOOK_INIT(watch_key, selinux_watch_key),
+#endif
 #endif
 
 #ifdef CONFIG_AUDIT

^ permalink raw reply related

* [PATCH 09/11] Add sample notification program [ver #6]
From: David Howells @ 2019-08-29 18:31 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

This needs to be linked with -lkeyutils.

It is run like:

	./watch_test

and watches "/" for mount changes and the current session keyring for key
changes:

	# keyctl add user a a @s
	1035096409
	# keyctl unlink 1035096409 @s

producing:

	# ./watch_test
	ptrs h=4 t=2 m=20003
	NOTIFY[00000004-00000002] ty=0003 sy=0002 i=01000010
	KEY 2ffc2e5d change=2[linked] aux=1035096409
	ptrs h=6 t=4 m=20003
	NOTIFY[00000006-00000004] ty=0003 sy=0003 i=01000010
	KEY 2ffc2e5d change=3[unlinked] aux=1035096409

Other events may be produced, such as with a failing disk:

	ptrs h=5 t=2 m=6000004
	NOTIFY[00000005-00000002] ty=0004 sy=0006 i=04000018
	BLOCK 00800050 e=6[critical medium] s=5be8

This corresponds to:

	print_req_error: critical medium error, dev sdf, sector 23528 flags 0

in dmesg.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 samples/Kconfig                  |    6 +
 samples/Makefile                 |    1 
 samples/watch_queue/Makefile     |    8 +
 samples/watch_queue/watch_test.c |  233 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 248 insertions(+)
 create mode 100644 samples/watch_queue/Makefile
 create mode 100644 samples/watch_queue/watch_test.c

diff --git a/samples/Kconfig b/samples/Kconfig
index c8dacb4dda80..2c3e07addd38 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -169,4 +169,10 @@ config SAMPLE_VFS
 	  as mount API and statx().  Note that this is restricted to the x86
 	  arch whilst it accesses system calls that aren't yet in all arches.
 
+config SAMPLE_WATCH_QUEUE
+	bool "Build example /dev/watch_queue notification consumer"
+	help
+	  Build example userspace program to use the new mount_notify(),
+	  sb_notify() syscalls and the KEYCTL_WATCH_KEY keyctl() function.
+
 endif # SAMPLES
diff --git a/samples/Makefile b/samples/Makefile
index 7d6e4ca28d69..a61a39047d02 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_SAMPLE_TRACE_PRINTK)	+= trace_printk/
 obj-$(CONFIG_VIDEO_PCI_SKELETON)	+= v4l/
 obj-y					+= vfio-mdev/
 subdir-$(CONFIG_SAMPLE_VFS)		+= vfs
+subdir-$(CONFIG_SAMPLE_WATCH_QUEUE)	+= watch_queue
diff --git a/samples/watch_queue/Makefile b/samples/watch_queue/Makefile
new file mode 100644
index 000000000000..6ee61e3ca8d2
--- /dev/null
+++ b/samples/watch_queue/Makefile
@@ -0,0 +1,8 @@
+# List of programs to build
+hostprogs-y := watch_test
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_watch_test.o += -I$(objtree)/usr/include
+HOSTLDLIBS_watch_test += -lkeyutils
diff --git a/samples/watch_queue/watch_test.c b/samples/watch_queue/watch_test.c
new file mode 100644
index 000000000000..6cd7101cb28c
--- /dev/null
+++ b/samples/watch_queue/watch_test.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Use /dev/watch_queue to watch for notifications.
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <limits.h>
+#include <linux/watch_queue.h>
+#include <linux/unistd.h>
+#include <linux/keyctl.h>
+
+#ifndef KEYCTL_WATCH_KEY
+#define KEYCTL_WATCH_KEY -1
+#endif
+#ifndef __NR_watch_devices
+#define __NR_watch_devices -1
+#endif
+
+#define BUF_SIZE 4
+
+static long keyctl_watch_key(int key, int watch_fd, int watch_id)
+{
+	return syscall(__NR_keyctl, KEYCTL_WATCH_KEY, key, watch_fd, watch_id);
+}
+
+static const char *key_subtypes[256] = {
+	[NOTIFY_KEY_INSTANTIATED]	= "instantiated",
+	[NOTIFY_KEY_UPDATED]		= "updated",
+	[NOTIFY_KEY_LINKED]		= "linked",
+	[NOTIFY_KEY_UNLINKED]		= "unlinked",
+	[NOTIFY_KEY_CLEARED]		= "cleared",
+	[NOTIFY_KEY_REVOKED]		= "revoked",
+	[NOTIFY_KEY_INVALIDATED]	= "invalidated",
+	[NOTIFY_KEY_SETATTR]		= "setattr",
+};
+
+static void saw_key_change(struct watch_notification *n)
+{
+	struct key_notification *k = (struct key_notification *)n;
+	unsigned int len = (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
+
+	if (len != sizeof(struct key_notification) / WATCH_LENGTH_GRANULARITY)
+		return;
+
+	printf("KEY %08x change=%u[%s] aux=%u\n",
+	       k->key_id, n->subtype, key_subtypes[n->subtype], k->aux);
+}
+
+static const char *block_subtypes[256] = {
+	[NOTIFY_BLOCK_ERROR_TIMEOUT]			= "timeout",
+	[NOTIFY_BLOCK_ERROR_NO_SPACE]			= "critical space allocation",
+	[NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT]	= "recoverable transport",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_TARGET]		= "critical target",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS]		= "critical nexus",
+	[NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM]		= "critical medium",
+	[NOTIFY_BLOCK_ERROR_PROTECTION]			= "protection",
+	[NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE]		= "kernel resource",
+	[NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE]		= "device resource",
+	[NOTIFY_BLOCK_ERROR_IO]				= "I/O",
+};
+
+static void saw_block_change(struct watch_notification *n)
+{
+	struct block_notification *b = (struct block_notification *)n;
+	unsigned int len = (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
+
+	if (len < sizeof(struct block_notification) / WATCH_LENGTH_GRANULARITY)
+		return;
+
+	printf("BLOCK %08llx e=%u[%s] s=%llx\n",
+	       (unsigned long long)b->dev,
+	       n->subtype, block_subtypes[n->subtype],
+	       (unsigned long long)b->sector);
+}
+
+static const char *usb_subtypes[256] = {
+	[NOTIFY_USB_DEVICE_ADD]		= "dev-add",
+	[NOTIFY_USB_DEVICE_REMOVE]	= "dev-remove",
+	[NOTIFY_USB_BUS_ADD]		= "bus-add",
+	[NOTIFY_USB_BUS_REMOVE]		= "bus-remove",
+	[NOTIFY_USB_DEVICE_RESET]	= "dev-reset",
+	[NOTIFY_USB_DEVICE_ERROR]	= "dev-error",
+};
+
+static void saw_usb_event(struct watch_notification *n)
+{
+	struct usb_notification *u = (struct usb_notification *)n;
+	unsigned int len = (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
+
+	if (len < sizeof(struct usb_notification) / WATCH_LENGTH_GRANULARITY)
+		return;
+
+	printf("USB %*.*s %s e=%x r=%x\n",
+	       u->name_len, u->name_len, u->name,
+	       usb_subtypes[n->subtype],
+	       u->error, u->reserved);
+}
+
+/*
+ * Consume and display events.
+ */
+static int consumer(int fd, struct watch_queue_buffer *buf)
+{
+	struct watch_notification *n;
+	struct pollfd p[1];
+	unsigned int head, tail, mask = buf->meta.mask;
+
+	for (;;) {
+		p[0].fd = fd;
+		p[0].events = POLLIN | POLLERR;
+		p[0].revents = 0;
+
+		if (poll(p, 1, -1) == -1) {
+			perror("poll");
+			break;
+		}
+
+		printf("ptrs h=%x t=%x m=%x\n",
+		       buf->meta.head, buf->meta.tail, buf->meta.mask);
+
+		while (head = __atomic_load_n(&buf->meta.head, __ATOMIC_ACQUIRE),
+		       tail = buf->meta.tail,
+		       tail != head
+		       ) {
+			n = &buf->slots[tail & mask];
+			printf("NOTIFY[%08x-%08x] ty=%04x sy=%04x i=%08x\n",
+			       head, tail, n->type, n->subtype, n->info);
+			if ((n->info & WATCH_INFO_LENGTH) == 0)
+				goto out;
+
+			switch (n->type) {
+			case WATCH_TYPE_META:
+				if (n->subtype == WATCH_META_REMOVAL_NOTIFICATION)
+					printf("REMOVAL of watchpoint %08x\n",
+					       (n->info & WATCH_INFO_ID) >>
+					       WATCH_INFO_ID__SHIFT);
+				break;
+			case WATCH_TYPE_KEY_NOTIFY:
+				saw_key_change(n);
+				break;
+			case WATCH_TYPE_BLOCK_NOTIFY:
+				saw_block_change(n);
+				break;
+			case WATCH_TYPE_USB_NOTIFY:
+				saw_usb_event(n);
+				break;
+			}
+
+			tail += (n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT;
+			__atomic_store_n(&buf->meta.tail, tail, __ATOMIC_RELEASE);
+		}
+	}
+
+out:
+	return 0;
+}
+
+static struct watch_notification_filter filter = {
+	.nr_filters	= 3,
+	.__reserved	= 0,
+	.filters = {
+		[0]	= {
+			.type			= WATCH_TYPE_KEY_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+		[1]	= {
+			.type			= WATCH_TYPE_BLOCK_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+		[2]	= {
+			.type			= WATCH_TYPE_USB_NOTIFY,
+			.subtype_filter[0]	= UINT_MAX,
+		},
+	},
+};
+
+int main(int argc, char **argv)
+{
+	struct watch_queue_buffer *buf;
+	size_t page_size;
+	int fd;
+
+	fd = open("/dev/watch_queue", O_RDWR);
+	if (fd == -1) {
+		perror("/dev/watch_queue");
+		exit(1);
+	}
+
+	if (ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, BUF_SIZE) == -1) {
+		perror("/dev/watch_queue(size)");
+		exit(1);
+	}
+
+	if (ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter) == -1) {
+		perror("/dev/watch_queue(filter)");
+		exit(1);
+	}
+
+	page_size = sysconf(_SC_PAGESIZE);
+	buf = mmap(NULL, BUF_SIZE * page_size, PROT_READ | PROT_WRITE,
+		   MAP_SHARED, fd, 0);
+	if (buf == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	if (keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fd, 0x01) == -1) {
+		perror("keyctl");
+		exit(1);
+	}
+
+	if (syscall(__NR_watch_devices, fd, 0x04, 0) == -1) {
+		perror("watch_devices");
+		exit(1);
+	}
+
+	return consumer(fd, buf);
+}

^ permalink raw reply related

* [PATCH 08/11] usb: Add USB subsystem notifications [ver #6]
From: David Howells @ 2019-08-29 18:31 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

Add a USB subsystem notification mechanism whereby notifications about
hardware events such as device connection, disconnection, reset and I/O
errors, can be reported to a monitoring process asynchronously.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a notification can be set up to report USB notifications via that
queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_USB_NOTIFY,
				.subtype_filter[0] = UINT_MAX;
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	notify_devices(fd, 12);

After that, records will be placed into the queue when events occur on a
USB device or bus.  Records are of the following format:

	struct usb_notification {
		struct watch_notification watch;
		__u32	error;
		__u32	reserved;
		__u8	name_len;
		__u8	name[0];
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_USB_NOTIFY

	n->watch.subtype will be the type of notification, such as
	NOTIFY_USB_DEVICE_ADD.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the second argument to
	device_notify(), shifted.

	n->error and n->reserved are intended to convey information such as
	error codes, but are currently not used

	n->name_len and n->name convey the USB device name as an
	unterminated string.  This may be truncated - it is currently
	limited to a maximum 63 chars.

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: linux-usb@vger.kernel.org
---

 Documentation/watch_queue.rst    |    9 ++++++
 drivers/usb/core/Kconfig         |    9 ++++++
 drivers/usb/core/devio.c         |   56 ++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/hub.c           |    4 +++
 include/linux/usb.h              |   18 ++++++++++++
 include/uapi/linux/watch_queue.h |   30 ++++++++++++++++++++
 6 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 5cc9c6924727..4087a8e670a8 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -11,6 +11,8 @@ receive notifications from the kernel.  This can be used in conjunction with::
 
     * Block layer event notifications
 
+    * USB subsystem event notifications
+
 
 The notifications buffers can be enabled by:
 
@@ -315,6 +317,13 @@ Any particular buffer can be fed from multiple sources.  Sources include:
     or temporary link loss.  Watches of this type are set on the global device
     watch list.
 
+  * WATCH_TYPE_USB_NOTIFY
+
+    Notifications of this type indicate USB subsystem events, such as
+    attachment, removal, reset and I/O errors.  Separate events are generated
+    for buses and devices.  Watchpoints of this type are set on the global
+    device watch list.
+
 
 Event Filtering
 ===============
diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig
index ecaacc8ed311..57e7b649e48b 100644
--- a/drivers/usb/core/Kconfig
+++ b/drivers/usb/core/Kconfig
@@ -102,3 +102,12 @@ config USB_AUTOSUSPEND_DELAY
 	  The default value Linux has always had is 2 seconds.  Change
 	  this value if you want a different delay and cannot modify
 	  the command line or module parameter.
+
+config USB_NOTIFICATIONS
+	bool "Provide USB hardware event notifications"
+	depends on USB && DEVICE_NOTIFICATIONS
+	help
+	  This option provides support for getting hardware event notifications
+	  on USB devices and interfaces.  This makes use of the
+	  /dev/watch_queue misc device to handle the notification buffer.
+	  device_notify(2) is used to set/remove watches.
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 9063ede411ae..b8572e4d6a1b 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -41,6 +41,7 @@
 #include <linux/dma-mapping.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
+#include <linux/watch_queue.h>
 
 #include "usb.h"
 
@@ -2660,13 +2661,68 @@ static void usbdev_remove(struct usb_device *udev)
 	}
 }
 
+#ifdef CONFIG_USB_NOTIFICATIONS
+static noinline void post_usb_notification(const char *devname,
+					   enum usb_notification_type subtype,
+					   u32 error)
+{
+	unsigned int gran = WATCH_LENGTH_GRANULARITY;
+	unsigned int name_len, n_len;
+	u64 id = 0; /* Might want to put a dev# here. */
+
+	struct {
+		struct usb_notification n;
+		char more_name[USB_NOTIFICATION_MAX_NAME_LEN -
+			       (sizeof(struct usb_notification) -
+				offsetof(struct usb_notification, name))];
+	} n;
+
+	name_len = strlen(devname);
+	name_len = min_t(size_t, name_len, USB_NOTIFICATION_MAX_NAME_LEN);
+	n_len = round_up(offsetof(struct usb_notification, name) + name_len,
+			 gran) / gran;
+
+	memset(&n, 0, sizeof(n));
+	memcpy(n.n.name, devname, n_len);
+
+	n.n.watch.type		= WATCH_TYPE_USB_NOTIFY;
+	n.n.watch.subtype	= subtype;
+	n.n.watch.info		= n_len;
+	n.n.error		= error;
+	n.n.name_len		= name_len;
+
+	post_device_notification(&n.n.watch, id);
+}
+
+void post_usb_device_notification(const struct usb_device *udev,
+				  enum usb_notification_type subtype, u32 error)
+{
+	post_usb_notification(dev_name(&udev->dev), subtype, error);
+}
+
+void post_usb_bus_notification(const struct usb_bus *ubus,
+			       enum usb_notification_type subtype, u32 error)
+{
+	post_usb_notification(ubus->bus_name, subtype, error);
+}
+#endif
+
 static int usbdev_notify(struct notifier_block *self,
 			       unsigned long action, void *dev)
 {
 	switch (action) {
 	case USB_DEVICE_ADD:
+		post_usb_device_notification(dev, NOTIFY_USB_DEVICE_ADD, 0);
 		break;
 	case USB_DEVICE_REMOVE:
+		post_usb_device_notification(dev, NOTIFY_USB_DEVICE_REMOVE, 0);
+		usbdev_remove(dev);
+		break;
+	case USB_BUS_ADD:
+		post_usb_bus_notification(dev, NOTIFY_USB_BUS_ADD, 0);
+		break;
+	case USB_BUS_REMOVE:
+		post_usb_bus_notification(dev, NOTIFY_USB_BUS_REMOVE, 0);
 		usbdev_remove(dev);
 		break;
 	}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 236313f41f4a..e8ebacc15a32 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -29,6 +29,7 @@
 #include <linux/random.h>
 #include <linux/pm_qos.h>
 #include <linux/kobject.h>
+#include <linux/watch_queue.h>
 
 #include <linux/uaccess.h>
 #include <asm/byteorder.h>
@@ -4605,6 +4606,9 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
 				(udev->config) ? "reset" : "new", speed,
 				devnum, driver_name);
 
+	if (udev->config)
+		post_usb_device_notification(udev, NOTIFY_USB_DEVICE_RESET, 0);
+
 	/* Set up TT records, if needed  */
 	if (hdev->tt) {
 		udev->tt = hdev->tt;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index e87826e23d59..ddfb9dc2473e 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -26,6 +26,7 @@
 struct usb_device;
 struct usb_driver;
 struct wusb_dev;
+enum usb_notification_type;
 
 /*-------------------------------------------------------------------------*/
 
@@ -2010,6 +2011,23 @@ extern void usb_led_activity(enum usb_led_event ev);
 static inline void usb_led_activity(enum usb_led_event ev) {}
 #endif
 
+/*
+ * Notification functions.
+ */
+#ifdef CONFIG_USB_NOTIFICATIONS
+extern void post_usb_device_notification(const struct usb_device *udev,
+					 enum usb_notification_type subtype,
+					 u32 error);
+extern void post_usb_bus_notification(const struct usb_bus *ubus,
+				      enum usb_notification_type subtype,
+				      u32 error);
+#else
+static inline void post_usb_device_notification(const struct usb_device *udev,
+						unsigned int subtype, u32 error) {}
+static inline void post_usb_bus_notification(const struct usb_bus *ubus,
+					     unsigned int subtype, u32 error) {}
+#endif
+
 #endif  /* __KERNEL__ */
 
 #endif
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 9a6c059af09d..bc5183e10d8c 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -12,7 +12,8 @@ enum watch_notification_type {
 	WATCH_TYPE_META		= 0,	/* Special record */
 	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
 	WATCH_TYPE_BLOCK_NOTIFY	= 2,	/* Block layer event notification */
-	WATCH_TYPE___NR		= 3
+	WATCH_TYPE_USB_NOTIFY	= 3,	/* USB subsystem event notification */
+	WATCH_TYPE___NR		= 4
 };
 
 enum watch_meta_notification_subtype {
@@ -152,4 +153,31 @@ struct block_notification {
 	__u64	sector;			/* Affected sector */
 };
 
+/*
+ * Type of USB layer notification.
+ */
+enum usb_notification_type {
+	NOTIFY_USB_DEVICE_ADD		= 0, /* USB device added */
+	NOTIFY_USB_DEVICE_REMOVE	= 1, /* USB device removed */
+	NOTIFY_USB_BUS_ADD		= 2, /* USB bus added */
+	NOTIFY_USB_BUS_REMOVE		= 3, /* USB bus removed */
+	NOTIFY_USB_DEVICE_RESET		= 4, /* USB device reset */
+	NOTIFY_USB_DEVICE_ERROR		= 5, /* USB device error */
+};
+
+/*
+ * USB subsystem notification record.
+ * - watch.type = WATCH_TYPE_USB_NOTIFY
+ * - watch.subtype = enum usb_notification_type
+ */
+struct usb_notification {
+	struct watch_notification watch; /* WATCH_TYPE_USB_NOTIFY */
+	__u32	error;
+	__u32	reserved;
+	__u8	name_len;		/* Length of device name */
+	__u8	name[0];		/* Device name (padded to __u64, truncated at 63 chars) */
+};
+
+#define USB_NOTIFICATION_MAX_NAME_LEN 63
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */

^ permalink raw reply related

* [PATCH 07/11] block: Add block layer notifications [ver #6]
From: David Howells @ 2019-08-29 18:30 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

Add a block layer notification mechanism whereby notifications about
block-layer events such as I/O errors, can be reported to a monitoring
process asynchronously.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a notification can be set up to report block notifications via that
queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_BLOCK_NOTIFY,
				.subtype_filter[0] = UINT_MAX;
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	watch_devices(fd, 12);

After that, records will be placed into the queue when, for example, errors
occur on a block device.  Records are of the following format:

	struct block_notification {
		struct watch_notification watch;
		__u64	dev;
		__u64	sector;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_BLOCK_NOTIFY

	n->watch.subtype will be the type of notification, such as
	NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the second argument to
	watch_devices(), shifted.

	n->dev will be the device numbers munged together.

	n->sector will indicate the affected sector (if appropriate for the
	event).

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/watch_queue.rst    |    4 +++-
 block/Kconfig                    |    9 +++++++++
 block/blk-core.c                 |   29 +++++++++++++++++++++++++++++
 include/linux/blkdev.h           |   15 +++++++++++++++
 include/uapi/linux/watch_queue.h |   30 +++++++++++++++++++++++++++++-
 5 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 393905b904c8..5cc9c6924727 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -7,7 +7,9 @@ receive notifications from the kernel.  This can be used in conjunction with::
 
   * Key/keyring notifications
 
-  * General device event notifications
+  * General device event notifications, including::
+
+    * Block layer event notifications
 
 
 The notifications buffers can be enabled by:
diff --git a/block/Kconfig b/block/Kconfig
index 8b5f8e560eb4..cc93e4ca29a7 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -164,6 +164,15 @@ config BLK_SED_OPAL
 	Enabling this option enables users to setup/unlock/lock
 	Locking ranges for SED devices using the Opal protocol.
 
+config BLK_NOTIFICATIONS
+	bool "Block layer event notifications"
+	depends on DEVICE_NOTIFICATIONS
+	help
+	  This option provides support for getting block layer event
+	  notifications.  This makes use of the /dev/watch_queue misc device to
+	  handle the notification buffer and provides the device_notify() system
+	  call to enable/disable watches.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
diff --git a/block/blk-core.c b/block/blk-core.c
index d0cc6e14d2f0..8ab1e07aa311 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -181,6 +181,22 @@ static const struct {
 	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
 };
 
+#ifdef CONFIG_BLK_NOTIFICATIONS
+static const
+enum block_notification_type blk_notifications[ARRAY_SIZE(blk_errors)] = {
+	[BLK_STS_TIMEOUT]	= NOTIFY_BLOCK_ERROR_TIMEOUT,
+	[BLK_STS_NOSPC]		= NOTIFY_BLOCK_ERROR_NO_SPACE,
+	[BLK_STS_TRANSPORT]	= NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT,
+	[BLK_STS_TARGET]	= NOTIFY_BLOCK_ERROR_CRITICAL_TARGET,
+	[BLK_STS_NEXUS]		= NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS,
+	[BLK_STS_MEDIUM]	= NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM,
+	[BLK_STS_PROTECTION]	= NOTIFY_BLOCK_ERROR_PROTECTION,
+	[BLK_STS_RESOURCE]	= NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE,
+	[BLK_STS_DEV_RESOURCE]	= NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE,
+	[BLK_STS_IOERR]		= NOTIFY_BLOCK_ERROR_IO,
+};
+#endif
+
 blk_status_t errno_to_blk_status(int errno)
 {
 	int i;
@@ -221,6 +237,19 @@ static void print_req_error(struct request *req, blk_status_t status,
 		req->cmd_flags & ~REQ_OP_MASK,
 		req->nr_phys_segments,
 		IOPRIO_PRIO_CLASS(req->ioprio));
+
+#ifdef CONFIG_BLK_NOTIFICATIONS
+	if (blk_notifications[idx]) {
+		struct block_notification n = {
+			.watch.type	= WATCH_TYPE_BLOCK_NOTIFY,
+			.watch.subtype	= blk_notifications[idx],
+			.watch.info	= watch_sizeof(n),
+			.dev		= req->rq_disk ? disk_devt(req->rq_disk) : 0,
+			.sector		= blk_rq_pos(req),
+		};
+		post_block_notification(&n);
+	}
+#endif
 }
 
 static void req_bio_endio(struct request *rq, struct bio *bio,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ef375dafb1c..5d856f670a8f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -27,6 +27,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/blkzoned.h>
+#include <linux/watch_queue.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -1742,6 +1743,20 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
 }
 #endif /* CONFIG_BLK_DEV_ZONED */
 
+#ifdef CONFIG_BLK_NOTIFICATIONS
+static inline void post_block_notification(struct block_notification *n)
+{
+	u64 id = 0; /* Might want to allow dev# here. */
+
+	post_device_notification(&n->watch, id);
+}
+#else
+static inline void post_block_notification(struct block_notification *n)
+{
+}
+#endif
+
+
 #else /* CONFIG_BLOCK */
 
 struct block_device;
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index 654d4ba8b909..9a6c059af09d 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -11,7 +11,8 @@
 enum watch_notification_type {
 	WATCH_TYPE_META		= 0,	/* Special record */
 	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
-	WATCH_TYPE___NR		= 2
+	WATCH_TYPE_BLOCK_NOTIFY	= 2,	/* Block layer event notification */
+	WATCH_TYPE___NR		= 3
 };
 
 enum watch_meta_notification_subtype {
@@ -124,4 +125,31 @@ struct key_notification {
 	__u32	aux;		/* Per-type auxiliary data */
 };
 
+/*
+ * Type of block layer notification.
+ */
+enum block_notification_type {
+	NOTIFY_BLOCK_ERROR_TIMEOUT		= 1, /* Timeout error */
+	NOTIFY_BLOCK_ERROR_NO_SPACE		= 2, /* Critical space allocation error */
+	NOTIFY_BLOCK_ERROR_RECOVERABLE_TRANSPORT = 3, /* Recoverable transport error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_TARGET	= 4, /* Critical target error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_NEXUS	= 5, /* Critical nexus error */
+	NOTIFY_BLOCK_ERROR_CRITICAL_MEDIUM	= 6, /* Critical medium error */
+	NOTIFY_BLOCK_ERROR_PROTECTION		= 7, /* Protection error */
+	NOTIFY_BLOCK_ERROR_KERNEL_RESOURCE	= 8, /* Kernel resource error */
+	NOTIFY_BLOCK_ERROR_DEVICE_RESOURCE	= 9, /* Device resource error */
+	NOTIFY_BLOCK_ERROR_IO			= 10, /* Other I/O error */
+};
+
+/*
+ * Block layer notification record.
+ * - watch.type = WATCH_TYPE_BLOCK_NOTIFY
+ * - watch.subtype = enum block_notification_type
+ */
+struct block_notification {
+	struct watch_notification watch; /* WATCH_TYPE_BLOCK_NOTIFY */
+	__u64	dev;			/* Device number */
+	__u64	sector;			/* Affected sector */
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */

^ permalink raw reply related

* [PATCH 06/11] Add a general, global device notification watch list [ver #6]
From: David Howells @ 2019-08-29 18:30 UTC (permalink / raw)
  To: viro
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <156710338860.10009.12524626894838499011.stgit@warthog.procyon.org.uk>

Create a general, global watch list that can be used for the posting of
device notification events, for such things as device attachment,
detachment and errors on sources such as block devices and USB devices.
This can be enabled with:

	CONFIG_DEVICE_NOTIFICATIONS

To add a watch on this list, an event queue must be created and configured:

        fd = open("/dev/event_queue", O_RDWR);
        ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

and then a watch can be placed upon it using a system call:

        watch_devices(fd, 12, 0);

Unless the application wants to receive all events, it should employ
appropriate filters.  For example, to receive just USB notifications, it
could do:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_USB_NOTIFY,
				.subtype_filter[0] = UINT_MAX;
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);

Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/watch_queue.rst               |   22 ++++++
 arch/alpha/kernel/syscalls/syscall.tbl      |    1 
 arch/arm/tools/syscall.tbl                  |    1 
 arch/ia64/kernel/syscalls/syscall.tbl       |    1 
 arch/m68k/kernel/syscalls/syscall.tbl       |    1 
 arch/microblaze/kernel/syscalls/syscall.tbl |    1 
 arch/mips/kernel/syscalls/syscall_n32.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_n64.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_o32.tbl   |    1 
 arch/parisc/kernel/syscalls/syscall.tbl     |    1 
 arch/powerpc/kernel/syscalls/syscall.tbl    |    1 
 arch/s390/kernel/syscalls/syscall.tbl       |    1 
 arch/sh/kernel/syscalls/syscall.tbl         |    1 
 arch/sparc/kernel/syscalls/syscall.tbl      |    1 
 arch/x86/entry/syscalls/syscall_32.tbl      |    1 
 arch/x86/entry/syscalls/syscall_64.tbl      |    1 
 arch/xtensa/kernel/syscalls/syscall.tbl     |    1 
 drivers/base/Kconfig                        |    9 +++
 drivers/base/Makefile                       |    1 
 drivers/base/watch.c                        |   94 +++++++++++++++++++++++++++
 include/linux/device.h                      |    7 ++
 include/linux/syscalls.h                    |    1 
 include/uapi/asm-generic/unistd.h           |    4 +
 kernel/sys_ni.c                             |    1 
 24 files changed, 153 insertions(+), 2 deletions(-)
 create mode 100644 drivers/base/watch.c

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 6fb3aa3356d3..393905b904c8 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -276,6 +276,25 @@ The ``id`` is the ID of the source object (such as the serial number on a key).
 Only watches that have the same ID set in them will see this notification.
 
 
+Global Device Watch List
+========================
+
+There is a global watch list that hardware generated events, such as device
+connection, disconnection, failure and error can be posted upon.  It must be
+enabled using::
+
+	CONFIG_DEVICE_NOTIFICATIONS
+
+Watchpoints are set in userspace using the device_notify(2) system call.
+Within the kernel events are posted upon it using::
+
+	void post_device_notification(struct watch_notification *n, u64 id);
+
+where ``n`` is the formatted notification record to post.  ``id`` is an
+identifier that can be used to direct to specific watches, but it should be 0
+for general use on this queue.
+
+
 Watch Sources
 =============
 
@@ -291,7 +310,8 @@ Any particular buffer can be fed from multiple sources.  Sources include:
   * WATCH_TYPE_BLOCK_NOTIFY
 
     Notifications of this type indicate block layer events, such as I/O errors
-    or temporary link loss.  Watches of this type are set on a global queue.
+    or temporary link loss.  Watches of this type are set on the global device
+    watch list.
 
 
 Event Filtering
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 728fe028c02c..8e841d8e4c22 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -475,3 +475,4 @@
 543	common	fspick				sys_fspick
 544	common	pidfd_open			sys_pidfd_open
 # 545 reserved for clone3
+546	common	watch_devices			sys_watch_devices
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 6da7dc4d79cc..0f080cf44cc9 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -449,3 +449,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 36d5faf4c86c..2f33f5db2fed 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -356,3 +356,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index a88a285a0e5f..83e4e8784b88 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -435,3 +435,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 09b0cd7dab0a..9a70a3be3b7b 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -441,3 +441,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index c9c879ec9b6d..2ba5b649f0ab 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -374,3 +374,4 @@
 433	n32	fspick				sys_fspick
 434	n32	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	n32	watch_devices			sys_watch_devices
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index bbce9159caa1..ff350988584d 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -350,3 +350,4 @@
 433	n64	fspick				sys_fspick
 434	n64	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	n64	watch_devices			sys_watch_devices
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 9653591428ec..7b26bd39900e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -423,3 +423,4 @@
 433	o32	fspick				sys_fspick
 434	o32	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	o32	watch_devices			sys_watch_devices
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 670d1371aca1..d846365a4f7c 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -432,3 +432,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3_wrapper
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 43f736ed47f2..0a503239ab5c 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -517,3 +517,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	nospu	clone3				ppc_clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 3054e9c035a3..19b43c0d928a 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@
 433  common	fspick			sys_fspick			sys_fspick
 434  common	pidfd_open		sys_pidfd_open			sys_pidfd_open
 435  common	clone3			sys_clone3			sys_clone3
+436  common	watch_devices		sys_watch_devices		sys_watch_devices
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index b5ed26c4c005..b454e07c9372 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -438,3 +438,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index 8c8cc7537fb2..8ef43c27457e 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -481,3 +481,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 # 435 reserved for clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c00019abd076..0e34ddeb97a1 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -440,3 +440,4 @@
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
 434	i386	pidfd_open		sys_pidfd_open			__ia32_sys_pidfd_open
 435	i386	clone3			sys_clone3			__ia32_sys_clone3
+436	i386	watch_devices		sys_watch_devices		__ia32_sys_watch_devices
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c29976eca4a8..29293d103829 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -357,6 +357,7 @@
 433	common	fspick			__x64_sys_fspick
 434	common	pidfd_open		__x64_sys_pidfd_open
 435	common	clone3			__x64_sys_clone3/ptregs
+436	common	watch_devices		__x64_sys_watch_devices
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 25f4de729a6d..243fa18b8d1e 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -406,3 +406,4 @@
 433	common	fspick				sys_fspick
 434	common	pidfd_open			sys_pidfd_open
 435	common	clone3				sys_clone3
+436	common	watch_devices			sys_watch_devices
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index dc404492381d..7f899cae41a0 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -1,6 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0
 menu "Generic Driver Options"
 
+config DEVICE_NOTIFICATIONS
+	bool "Provide device event notifications"
+	depends on WATCH_QUEUE
+	help
+	  This option provides support for getting hardware event notifications
+	  on devices, buses and interfaces.  This makes use of the
+	  /dev/watch_queue misc device to handle the notification buffer.
+	  device_notify(2) is used to set/remove watches.
+
 config UEVENT_HELPER
 	bool "Support for uevent helper"
 	help
diff --git a/drivers/base/Makefile b/drivers/base/Makefile
index 157452080f3d..4db2e8f1a1f4 100644
--- a/drivers/base/Makefile
+++ b/drivers/base/Makefile
@@ -7,6 +7,7 @@ obj-y			:= component.o core.o bus.o dd.o syscore.o \
 			   attribute_container.o transport_class.o \
 			   topology.o container.o property.o cacheinfo.o \
 			   devcon.o swnode.o
+obj-$(CONFIG_DEVICE_NOTIFICATIONS) += watch.o
 obj-$(CONFIG_DEVTMPFS)	+= devtmpfs.o
 obj-y			+= power/
 obj-$(CONFIG_ISA_BUS_API)	+= isa.o
diff --git a/drivers/base/watch.c b/drivers/base/watch.c
new file mode 100644
index 000000000000..879f82225979
--- /dev/null
+++ b/drivers/base/watch.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Event notifications.
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/device.h>
+#include <linux/watch_queue.h>
+#include <linux/syscalls.h>
+#include <linux/init_task.h>
+#include <linux/security.h>
+
+/*
+ * Global queue for watching for device layer events.
+ */
+static struct watch_list device_watchers = {
+	.watchers	= HLIST_HEAD_INIT,
+	.lock		= __SPIN_LOCK_UNLOCKED(&device_watchers.lock),
+};
+
+static DEFINE_SPINLOCK(device_watchers_lock);
+
+/**
+ * post_device_notification - Post notification of a device event
+ * @n - The notification to post
+ * @id - The device ID
+ *
+ * Note that there's only a global queue to which all events are posted.  Might
+ * want to provide per-dev queues also.
+ */
+void post_device_notification(struct watch_notification *n, u64 id)
+{
+	post_watch_notification(&device_watchers, n, &init_cred, id);
+}
+EXPORT_SYMBOL(post_device_notification);
+
+/**
+ * sys_watch_devices - Watch for device events.
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ * @flags: Flags (reserved for future)
+ */
+SYSCALL_DEFINE3(watch_devices, int, watch_fd, int, watch_id, unsigned int, flags)
+{
+	struct watch_queue *wqueue;
+	struct watch *watch = NULL;
+	long ret = -ENOMEM;
+
+	if (watch_id < -1 || watch_id > 0xff || flags)
+		return -EINVAL;
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue)) {
+		ret = PTR_ERR(wqueue);
+		goto err;
+	}
+
+	if (watch_id >= 0) {
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wqueue;
+
+		init_watch(watch, wqueue);
+		watch->info_id = (u32)watch_id << WATCH_INFO_ID__SHIFT;
+		watch->cred = get_current_cred();
+
+		ret = security_watch_devices(watch);
+		if (ret < 0)
+			goto err_watch;
+
+		spin_lock(&device_watchers_lock);
+		ret = add_watch_to_object(watch, &device_watchers);
+		spin_unlock(&device_watchers_lock);
+		if (ret == 0)
+			watch = NULL;
+	} else {
+		spin_lock(&device_watchers_lock);
+		ret = remove_watch_from_object(&device_watchers, wqueue, 0,
+					       false);
+		spin_unlock(&device_watchers_lock);
+	}
+
+err_watch:
+	if (watch) {
+		put_cred(watch->cred);
+		kfree(watch);
+	}
+err_wqueue:
+	put_watch_queue(wqueue);
+err:
+	return ret;
+}
diff --git a/include/linux/device.h b/include/linux/device.h
index 6717adee33f0..9def6a53b598 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -43,6 +43,7 @@ struct iommu_group;
 struct iommu_fwspec;
 struct dev_pin_info;
 struct iommu_param;
+struct watch_notification;
 
 struct bus_attribute {
 	struct attribute	attr;
@@ -1412,6 +1413,12 @@ struct device_link *device_link_add(struct device *consumer,
 void device_link_del(struct device_link *link);
 void device_link_remove(void *consumer, struct device *supplier);
 
+#ifdef CONFIG_DEVICE_NOTIFICATIONS
+extern void post_device_notification(struct watch_notification *n, u64 id);
+#else
+static inline void post_device_notification(struct watch_notification *n, u64 id) {}
+#endif
+
 #ifndef dev_fmt
 #define dev_fmt(fmt) fmt
 #endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 88145da7d140..5bac5daec51e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1000,6 +1000,7 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
 asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
+asmlinkage long sys_watch_devices(int watch_fd, int watch_id, unsigned int flags);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1be0e798e362..fd63ff0196fd 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -850,9 +850,11 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open)
 #define __NR_clone3 435
 __SYSCALL(__NR_clone3, sys_clone3)
 #endif
+#define __NR_watch_devices 436
+__SYSCALL(__NR_watch_devices, sys_watch_devices)
 
 #undef __NR_syscalls
-#define __NR_syscalls 436
+#define __NR_syscalls 437
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 34b76895b81e..184ad68c087f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -51,6 +51,7 @@ COND_SYSCALL_COMPAT(io_pgetevents);
 COND_SYSCALL(io_uring_setup);
 COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
+COND_SYSCALL(watch_devices);
 
 /* fs/xattr.c */
 

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox