Linux Security Modules development
 help / color / mirror / Atom feed
* [RFC PATCH 05/21] pipe: Conditionalise wakeup in pipe_read()
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Only do a wakeup in pipe_read() if we made space in a completely full
buffer.  The producer shouldn't be waiting on pipe->wait otherwise.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/pipe.c |   15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 08af7e7bbea2..0d25cb090a03 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -327,11 +327,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 				spin_lock_irq(&pipe->wait.lock);
 				tail++;
 				pipe_commit_read(pipe, tail);
-				do_wakeup = 0;
-				prelocked_wake_up_interruptible_sync_poll(
-					&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+				do_wakeup = 1;
+				if (head - (tail - 1) == pipe->max_usage)
+					prelocked_wake_up_interruptible_sync_poll(
+						&pipe->wait, EPOLLOUT | EPOLLWRNORM);
 				spin_unlock_irq(&pipe->wait.lock);
-				kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+				if (head - (tail - 1) == pipe->max_usage)
+					kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 			}
 			total_len -= chars;
 			if (!total_len)
@@ -360,11 +362,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 				ret = -ERESTARTSYS;
 			break;
 		}
-		if (do_wakeup) {
-			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
- 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-			do_wakeup = 0;
-		}
 		pipe_wait(pipe);
 	}
 	__pipe_unlock(pipe);


^ permalink raw reply related

* [RFC PATCH 04/21] pipe: Advance tail pointer inside of wait spinlock in pipe_read()
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Advance the pipe ring tail pointer inside of wait spinlock in pipe_read()
so that the pipe can be written into with kernel notifications from
contexts where pipe->mutex cannot be taken.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/pipe.c |    8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 0574277bad18..08af7e7bbea2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -324,9 +324,14 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 
 			if (!buf->len) {
 				pipe_buf_release(pipe, buf);
+				spin_lock_irq(&pipe->wait.lock);
 				tail++;
 				pipe_commit_read(pipe, tail);
-				do_wakeup = 1;
+				do_wakeup = 0;
+				prelocked_wake_up_interruptible_sync_poll(
+					&pipe->wait, EPOLLOUT | EPOLLWRNORM);
+				spin_unlock_irq(&pipe->wait.lock);
+				kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 			}
 			total_len -= chars;
 			if (!total_len)
@@ -358,6 +363,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		if (do_wakeup) {
 			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+			do_wakeup = 0;
 		}
 		pipe_wait(pipe);
 	}


^ permalink raw reply related

* [RFC PATCH 03/21] pipe: Use head and tail pointers for the ring, not cursor and length
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Convert pipes to use head and tail pointers for the buffer ring rather than
pointer and length as the latter requires two atomic ops to update (or a
combined op) whereas the former only requires one.

 (1) The head pointer is the point at which production occurs and points to
     the slot in which the next buffer will be placed.  This is equivalent
     to pipe->curbuf + pipe->nrbufs.

     The head pointer belongs to the write-side.

 (2) The tail pointer is the point at which consumption occurs.  It points
     to the next slot to be consumed.  This is equivalent to pipe->curbuf.

     The tail pointer belongs to the read-side.

 (3) head and tail are allowed to run to UINT_MAX and wrap naturally.  They
     are only masked off when the array is being accessed, e.g.:

	pipe->bufs[head & mask]

     This means that it is not necessary to have a dead slot in the ring as
     head == tail isn't ambiguous.

 (4) The ring is empty if "head == tail".

 (5) The occupancy of the ring is "head - tail".

 (6) The number of free slots in the ring is "(tail + pipe->ring_size) -
     head".

 (7) The ring is full if "head >= (tail + pipe->ring_size)", which can also
     be written as "head - tail >= pipe->ring_size".

Also split pipe->buffers into pipe->ring_size (which indicates the size of the
ring) and pipe->max_usage (which restricts the amount of ring that write() is
allowed to fill).  This allows for a pipe that is both writable by the kernel
notification facility and by userspace, allowing plenty of ring space for
notifications to be added whilst preventing userspace from being able to use
up too much buffer space.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/fuse/dev.c             |   31 +++--
 fs/pipe.c                 |  168 ++++++++++++++++------------
 fs/splice.c               |  183 ++++++++++++++++++------------
 include/linux/pipe_fs_i.h |   38 ++++++
 include/linux/uio.h       |    4 -
 lib/iov_iter.c            |  274 ++++++++++++++++++++++++++-------------------
 6 files changed, 421 insertions(+), 277 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dadd617d826c..8bbe5ec6c336 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 			cs->pipebufs++;
 			cs->nr_segs--;
 		} else {
-			if (cs->nr_segs == cs->pipe->buffers)
+			if (cs->nr_segs >= cs->pipe->max_usage)
 				return -EIO;
 
 			page = alloc_page(GFP_HIGHUSER);
@@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 	struct pipe_buffer *buf;
 	int err;
 
-	if (cs->nr_segs == cs->pipe->buffers)
+	if (cs->nr_segs >= cs->pipe->max_usage)
 		return -EIO;
 
 	err = unlock_request(cs->req);
@@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	if (!fud)
 		return -EPERM;
 
-	bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
+	bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
 			      GFP_KERNEL);
 	if (!bufs)
 		return -ENOMEM;
@@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 	if (ret < 0)
 		goto out;
 
-	if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+	if (pipe->head - pipe->tail + cs.nr_segs > pipe->max_usage) {
 		ret = -EIO;
 		goto out;
 	}
@@ -1935,6 +1935,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 				     struct file *out, loff_t *ppos,
 				     size_t len, unsigned int flags)
 {
+	unsigned int head, tail, mask, count;
 	unsigned nbuf;
 	unsigned idx;
 	struct pipe_buffer *bufs;
@@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 
 	pipe_lock(pipe);
 
-	bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
-			      GFP_KERNEL);
+	head = pipe->head;
+	tail = pipe->tail;
+	mask = pipe->ring_size - 1;
+	count = head - tail;
+
+	bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (!bufs) {
 		pipe_unlock(pipe);
 		return -ENOMEM;
@@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 
 	nbuf = 0;
 	rem = 0;
-	for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
-		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+	for (idx = tail; idx < head && rem < len; idx++)
+		rem += pipe->bufs[idx & mask].len;
 
 	ret = -EINVAL;
 	if (rem < len)
@@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 		struct pipe_buffer *ibuf;
 		struct pipe_buffer *obuf;
 
-		BUG_ON(nbuf >= pipe->buffers);
-		BUG_ON(!pipe->nrbufs);
-		ibuf = &pipe->bufs[pipe->curbuf];
+		BUG_ON(nbuf >= pipe->ring_size);
+		BUG_ON(tail == head);
+		ibuf = &pipe->bufs[tail];
 		obuf = &bufs[nbuf];
 
 		if (rem >= ibuf->len) {
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
-			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
-			pipe->nrbufs--;
+			tail++;
+			pipe_commit_read(pipe, tail);
 		} else {
 			if (!pipe_buf_get(pipe, ibuf))
 				goto out_free;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8a2ab2f974bd..0574277bad18 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -43,10 +43,11 @@ unsigned long pipe_user_pages_hard;
 unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
 
 /*
- * We use a start+len construction, which provides full use of the 
- * allocated memory.
- * -- Florian Coosmann (FGC)
- * 
+ * We use head and tail indices that aren't masked off, except at the point of
+ * dereference, but rather they're allowed to wrap naturally.  This means there
+ * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
+ * -- David Howells 2019-09-23.
+ *
  * Reads with count = 0 should always return 0.
  * -- Julian Bradfield 1999-06-07.
  *
@@ -285,10 +286,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 	ret = 0;
 	__pipe_lock(pipe);
 	for (;;) {
-		int bufs = pipe->nrbufs;
-		if (bufs) {
-			int curbuf = pipe->curbuf;
-			struct pipe_buffer *buf = pipe->bufs + curbuf;
+		unsigned int head = pipe->head;
+		unsigned int tail = pipe->tail;
+		unsigned int mask = pipe->ring_size - 1;
+
+		if (tail != head) {
+			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 			size_t chars = buf->len;
 			size_t written;
 			int error;
@@ -321,17 +324,17 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 
 			if (!buf->len) {
 				pipe_buf_release(pipe, buf);
-				curbuf = (curbuf + 1) & (pipe->buffers - 1);
-				pipe->curbuf = curbuf;
-				pipe->nrbufs = --bufs;
+				tail++;
+				pipe_commit_read(pipe, tail);
 				do_wakeup = 1;
 			}
 			total_len -= chars;
 			if (!total_len)
 				break;	/* common path: read succeeded */
+			if (tail != head)	/* More to do? */
+				continue;
 		}
-		if (bufs)	/* More to do? */
-			continue;
+
 		if (!pipe->writers)
 			break;
 		if (!pipe->waiting_writers) {
@@ -380,6 +383,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *filp = iocb->ki_filp;
 	struct pipe_inode_info *pipe = filp->private_data;
+	unsigned int head, tail, buffers, mask;
 	ssize_t ret = 0;
 	int do_wakeup = 0;
 	size_t total_len = iov_iter_count(from);
@@ -397,12 +401,15 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
+	tail = pipe->tail;
+	head = pipe->head;
+	buffers = pipe->max_usage;
+	mask = pipe->ring_size - 1;
+
 	/* We try to merge small writes */
 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
-	if (pipe->nrbufs && chars != 0) {
-		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
-							(pipe->buffers - 1);
-		struct pipe_buffer *buf = pipe->bufs + lastbuf;
+	if (head != tail && chars != 0) {
+		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
 		int offset = buf->offset + buf->len;
 
 		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
@@ -423,18 +430,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	for (;;) {
-		int bufs;
-
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			if (!ret)
 				ret = -EPIPE;
 			break;
 		}
-		bufs = pipe->nrbufs;
-		if (bufs < pipe->buffers) {
-			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
-			struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+		tail = pipe->tail;
+		if (head - tail < buffers) {
+			struct pipe_buffer *buf = &pipe->bufs[head & mask];
 			struct page *page = pipe->tmp_page;
 			int copied;
 
@@ -470,14 +475,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 				buf->ops = &packet_pipe_buf_ops;
 				buf->flags = PIPE_BUF_FLAG_PACKET;
 			}
-			pipe->nrbufs = ++bufs;
+
+			head++;
+			pipe_commit_write(pipe, head);
 			pipe->tmp_page = NULL;
 
 			if (!iov_iter_count(from))
 				break;
 		}
-		if (bufs < pipe->buffers)
+
+		if (head - tail < buffers)
 			continue;
+
+		/* Wait for buffer space to become available. */
 		if (filp->f_flags & O_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
@@ -515,17 +525,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct pipe_inode_info *pipe = filp->private_data;
-	int count, buf, nrbufs;
+	int count, head, tail, mask;
 
 	switch (cmd) {
 		case FIONREAD:
 			__pipe_lock(pipe);
 			count = 0;
-			buf = pipe->curbuf;
-			nrbufs = pipe->nrbufs;
-			while (--nrbufs >= 0) {
-				count += pipe->bufs[buf].len;
-				buf = (buf+1) & (pipe->buffers - 1);
+			head = pipe->head;
+			tail = pipe->tail;
+			mask = pipe->ring_size - 1;
+
+			while (tail < head) {
+				count += pipe->bufs[tail & mask].len;
+				tail++;
 			}
 			__pipe_unlock(pipe);
 
@@ -541,21 +553,26 @@ pipe_poll(struct file *filp, poll_table *wait)
 {
 	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	int nrbufs;
+	unsigned int head = READ_ONCE(pipe->head);
+	unsigned int tail = READ_ONCE(pipe->tail);
+	unsigned int occupancy = head - tail;
 
 	poll_wait(filp, &pipe->wait, wait);
 
+	BUG_ON(occupancy > pipe->ring_size);
+
 	/* Reading only -- no need for acquiring the semaphore.  */
-	nrbufs = pipe->nrbufs;
 	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
-		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
+		if (occupancy > 0)
+			mask |= EPOLLIN | EPOLLRDNORM;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
 			mask |= EPOLLHUP;
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
+		if (occupancy < pipe->max_usage)
+			mask |= EPOLLOUT | EPOLLWRNORM;
 		/*
 		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
 		 * behave exactly like pipes for poll().
@@ -679,7 +696,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
 	if (pipe->bufs) {
 		init_waitqueue_head(&pipe->wait);
 		pipe->r_counter = pipe->w_counter = 1;
-		pipe->buffers = pipe_bufs;
+		pipe->max_usage = pipe_bufs;
+		pipe->ring_size = pipe_bufs;
 		pipe->user = user;
 		mutex_init(&pipe->mutex);
 		return pipe;
@@ -697,9 +715,9 @@ void free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
-	(void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
+	(void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
 	free_uid(pipe->user);
-	for (i = 0; i < pipe->buffers; i++) {
+	for (i = 0; i < pipe->ring_size; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
 			pipe_buf_release(pipe, buf);
@@ -880,7 +898,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
 
 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 {
-	int cur = *cnt;	
+	int cur = *cnt;
 
 	while (cur == *cnt) {
 		pipe_wait(pipe);
@@ -955,7 +973,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 			}
 		}
 		break;
-	
+
 	case FMODE_WRITE:
 	/*
 	 *  O_WRONLY
@@ -975,7 +993,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 				goto err_wr;
 		}
 		break;
-	
+
 	case FMODE_READ | FMODE_WRITE:
 	/*
 	 *  O_RDWR
@@ -1054,14 +1072,14 @@ unsigned int round_pipe_size(unsigned long size)
 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 {
 	struct pipe_buffer *bufs;
-	unsigned int size, nr_pages;
+	unsigned int size, nr_slots, head, tail, mask, n;
 	unsigned long user_bufs;
 	long ret = 0;
 
 	size = round_pipe_size(arg);
-	nr_pages = size >> PAGE_SHIFT;
+	nr_slots = size >> PAGE_SHIFT;
 
-	if (!nr_pages)
+	if (!nr_slots)
 		return -EINVAL;
 
 	/*
@@ -1071,13 +1089,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	 * Decreasing the pipe capacity is always permitted, even
 	 * if the user is currently over a limit.
 	 */
-	if (nr_pages > pipe->buffers &&
+	if (nr_slots > pipe->ring_size &&
 			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
-	user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+	user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
 
-	if (nr_pages > pipe->buffers &&
+	if (nr_slots > pipe->ring_size &&
 			(too_many_pipe_buffers_hard(user_bufs) ||
 			 too_many_pipe_buffers_soft(user_bufs)) &&
 			is_unprivileged_user()) {
@@ -1086,17 +1104,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	}
 
 	/*
-	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
-	 * expect a lot of shrink+grow operations, just free and allocate
-	 * again like we would do for growing. If the pipe currently
+	 * We can shrink the pipe, if arg is greater than the ring occupancy.
+	 * Since we don't expect a lot of shrink+grow operations, just free and
+	 * allocate again like we would do for growing.  If the pipe currently
 	 * contains more buffers than arg, then return busy.
 	 */
-	if (nr_pages < pipe->nrbufs) {
+	mask = pipe->ring_size - 1;
+	head = pipe->head & mask;
+	tail = pipe->tail & mask;
+	n = pipe->head - pipe->tail;
+	if (nr_slots < n) {
 		ret = -EBUSY;
 		goto out_revert_acct;
 	}
 
-	bufs = kcalloc(nr_pages, sizeof(*bufs),
+	bufs = kcalloc(nr_slots, sizeof(*bufs),
 		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (unlikely(!bufs)) {
 		ret = -ENOMEM;
@@ -1105,33 +1127,33 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 
 	/*
 	 * The pipe array wraps around, so just start the new one at zero
-	 * and adjust the indexes.
+	 * and adjust the indices.
 	 */
-	if (pipe->nrbufs) {
-		unsigned int tail;
-		unsigned int head;
-
-		tail = pipe->curbuf + pipe->nrbufs;
-		if (tail < pipe->buffers)
-			tail = 0;
-		else
-			tail &= (pipe->buffers - 1);
-
-		head = pipe->nrbufs - tail;
-		if (head)
-			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
-		if (tail)
-			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+	if (n > 0) {
+		if (head > tail) {
+			memcpy(bufs, &pipe->bufs[tail], n * sizeof(struct pipe_buffer));
+		} else {
+			unsigned int s = pipe->ring_size - tail;
+			if (n - s > 0)
+				memcpy(bufs + s, &pipe->bufs[0],
+				       (n - s) * sizeof(struct pipe_buffer));
+			memcpy(bufs, &pipe->bufs[tail], s * sizeof(struct pipe_buffer));
+		}
 	}
 
-	pipe->curbuf = 0;
+	head = n;
+	tail = 0;
+
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
-	pipe->buffers = nr_pages;
-	return nr_pages * PAGE_SIZE;
+	pipe->ring_size = nr_slots;
+	pipe->max_usage = nr_slots;
+	pipe->tail = tail;
+	pipe->head = head;
+	return pipe->max_usage * PAGE_SIZE;
 
 out_revert_acct:
-	(void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
 	return ret;
 }
 
@@ -1161,7 +1183,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		ret = pipe_set_size(pipe, arg);
 		break;
 	case F_GETPIPE_SZ:
-		ret = pipe->buffers * PAGE_SIZE;
+		ret = pipe->max_usage * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/fs/splice.c b/fs/splice.c
index 98412721f056..7f5f59e2967b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 		       struct splice_pipe_desc *spd)
 {
 	unsigned int spd_pages = spd->nr_pages;
+	unsigned int tail = pipe->tail;
+	unsigned int head = pipe->head;
+	unsigned int mask = pipe->ring_size - 1;
 	int ret = 0, page_nr = 0;
 
 	if (!spd_pages)
@@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 		goto out;
 	}
 
-	while (pipe->nrbufs < pipe->buffers) {
-		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
-		struct pipe_buffer *buf = pipe->bufs + newbuf;
+	while (head - tail < pipe->max_usage) {
+		struct pipe_buffer *buf = &pipe->bufs[head & mask];
 
 		buf->page = spd->pages[page_nr];
 		buf->offset = spd->partial[page_nr].offset;
@@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 		buf->ops = spd->ops;
 		buf->flags = 0;
 
-		pipe->nrbufs++;
+		head++;
+		pipe_commit_write(pipe, head);
 		page_nr++;
 		ret += buf->len;
 
@@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
 
 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
+	unsigned int head = pipe->head;
+	unsigned int tail = pipe->tail;
+	unsigned int mask = pipe->ring_size - 1;
 	int ret;
 
 	if (unlikely(!pipe->readers)) {
 		send_sig(SIGPIPE, current, 0);
 		ret = -EPIPE;
-	} else if (pipe->nrbufs == pipe->buffers) {
+	} else if (head - tail >= pipe->max_usage) {
 		ret = -EAGAIN;
 	} else {
-		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
-		pipe->bufs[newbuf] = *buf;
-		pipe->nrbufs++;
+		pipe->bufs[head & mask] = *buf;
+		pipe_commit_write(pipe, head + 1);
 		return buf->len;
 	}
 	pipe_buf_release(pipe, buf);
@@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe);
  */
 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 {
-	unsigned int buffers = READ_ONCE(pipe->buffers);
+	unsigned int max_usage = READ_ONCE(pipe->max_usage);
 
-	spd->nr_pages_max = buffers;
-	if (buffers <= PIPE_DEF_BUFFERS)
+	spd->nr_pages_max = max_usage;
+	if (max_usage <= PIPE_DEF_BUFFERS)
 		return 0;
 
-	spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
-	spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
+	spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
+	spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
 				     GFP_KERNEL);
 
 	if (spd->pages && spd->partial)
@@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 {
 	struct iov_iter to;
 	struct kiocb kiocb;
-	int idx, ret;
+	unsigned int i_head;
+	int ret;
 
 	iov_iter_pipe(&to, READ, pipe, len);
-	idx = to.idx;
+	i_head = to.head;
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
 	ret = call_read_iter(in, &kiocb, &to);
@@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 		*ppos = kiocb.ki_pos;
 		file_accessed(in);
 	} else if (ret < 0) {
-		to.idx = idx;
+		to.head = i_head;
 		to.iov_offset = 0;
 		iov_iter_advance(&to, 0); /* to free what was emitted */
 		/*
@@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	struct iov_iter to;
 	struct page **pages;
 	unsigned int nr_pages;
+	unsigned int mask;
 	size_t offset, base, copied = 0;
 	ssize_t res;
 	int i;
 
-	if (pipe->nrbufs == pipe->buffers)
+	if (pipe->head - pipe->tail == pipe->max_usage)
 		return -EAGAIN;
 
 	/*
@@ -400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		}
 	}
 
-	pipe->bufs[to.idx].offset = offset;
-	pipe->bufs[to.idx].len -= offset;
+	mask = pipe->ring_size - 1;
+	pipe->bufs[to.head & mask].offset = offset;
+	pipe->bufs[to.head & mask].len -= offset;
 
 	for (i = 0; i < nr_pages; i++) {
 		size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
@@ -443,7 +451,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
 
 	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
 
-	if (sd->len < sd->total_len && pipe->nrbufs > 1)
+	if (sd->len < sd->total_len && pipe->head - pipe->tail > 1)
 		more |= MSG_SENDPAGE_NOTLAST;
 
 	return file->f_op->sendpage(file, buf->page, buf->offset,
@@ -481,10 +489,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
 			  splice_actor *actor)
 {
+	unsigned int head = pipe->head;
+	unsigned int tail = pipe->tail;
+	unsigned int mask = pipe->ring_size - 1;
 	int ret;
 
-	while (pipe->nrbufs) {
-		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+	while (tail != head) {
+		struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 
 		sd->len = buf->len;
 		if (sd->len > sd->total_len)
@@ -511,8 +522,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
 
 		if (!buf->len) {
 			pipe_buf_release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
-			pipe->nrbufs--;
+			tail++;
+			pipe_commit_read(pipe, tail);
 			if (pipe->files)
 				sd->need_wakeup = true;
 		}
@@ -543,7 +554,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
 	if (signal_pending(current))
 		return -ERESTARTSYS;
 
-	while (!pipe->nrbufs) {
+	while (pipe->tail == pipe->head) {
 		if (!pipe->writers)
 			return 0;
 
@@ -686,7 +697,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		.pos = *ppos,
 		.u.file = out,
 	};
-	int nbufs = pipe->buffers;
+	int nbufs = pipe->max_usage;
 	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
 					GFP_KERNEL);
 	ssize_t ret;
@@ -699,16 +710,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	splice_from_pipe_begin(&sd);
 	while (sd.total_len) {
 		struct iov_iter from;
+		unsigned int head = pipe->head;
+		unsigned int tail = pipe->tail;
+		unsigned int mask = pipe->ring_size - 1;
 		size_t left;
-		int n, idx;
+		int n;
 
 		ret = splice_from_pipe_next(pipe, &sd);
 		if (ret <= 0)
 			break;
 
-		if (unlikely(nbufs < pipe->buffers)) {
+		if (unlikely(nbufs < pipe->max_usage)) {
 			kfree(array);
-			nbufs = pipe->buffers;
+			nbufs = pipe->max_usage;
 			array = kcalloc(nbufs, sizeof(struct bio_vec),
 					GFP_KERNEL);
 			if (!array) {
@@ -719,16 +733,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 		/* build the vector */
 		left = sd.total_len;
-		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
-			struct pipe_buffer *buf = pipe->bufs + idx;
+		for (n = 0; head - tail > 0 && left && n < nbufs; tail++, n++) {
+			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 			size_t this_len = buf->len;
 
 			if (this_len > left)
 				this_len = left;
 
-			if (idx == pipe->buffers - 1)
-				idx = -1;
-
 			ret = pipe_buf_confirm(pipe, buf);
 			if (unlikely(ret)) {
 				if (ret == -ENODATA)
@@ -752,14 +763,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		*ppos = sd.pos;
 
 		/* dismiss the fully eaten buffers, adjust the partial one */
+		tail = pipe->tail;
 		while (ret) {
-			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
 			if (ret >= buf->len) {
 				ret -= buf->len;
 				buf->len = 0;
 				pipe_buf_release(pipe, buf);
-				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
-				pipe->nrbufs--;
+				tail++;
+				pipe_commit_read(pipe, tail);
 				if (pipe->files)
 					sd.need_wakeup = true;
 			} else {
@@ -942,15 +954,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	sd->flags &= ~SPLICE_F_NONBLOCK;
 	more = sd->flags & SPLICE_F_MORE;
 
-	WARN_ON_ONCE(pipe->nrbufs != 0);
+	WARN_ON_ONCE(pipe->head != pipe->tail);
 
 	while (len) {
+		unsigned int p_occupancy, p_space;
 		size_t read_len;
 		loff_t pos = sd->pos, prev_pos = pos;
 
 		/* Don't try to read more the pipe has space for. */
-		read_len = min_t(size_t, len,
-				 (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+		p_occupancy = READ_ONCE(pipe->head) - READ_ONCE(pipe->tail);
+		p_space = pipe->max_usage - p_occupancy;
+		read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
 		ret = do_splice_to(in, &pos, pipe, read_len, flags);
 		if (unlikely(ret <= 0))
 			goto out_release;
@@ -989,7 +1003,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	}
 
 done:
-	pipe->nrbufs = pipe->curbuf = 0;
+	pipe->tail = pipe->head = 0;
 	file_accessed(in);
 	return bytes;
 
@@ -998,8 +1012,8 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 * If we did an incomplete transfer we must release
 	 * the pipe buffers in question:
 	 */
-	for (i = 0; i < pipe->buffers; i++) {
-		struct pipe_buffer *buf = pipe->bufs + i;
+	for (i = 0; i < pipe->ring_size; i++) {
+		struct pipe_buffer *buf = &pipe->bufs[i];
 
 		if (buf->ops)
 			pipe_buf_release(pipe, buf);
@@ -1075,7 +1089,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
 			send_sig(SIGPIPE, current, 0);
 			return -EPIPE;
 		}
-		if (pipe->nrbufs != pipe->buffers)
+		if (pipe->head - pipe->tail < pipe->max_usage)
 			return 0;
 		if (flags & SPLICE_F_NONBLOCK)
 			return -EAGAIN;
@@ -1442,16 +1456,16 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	int ret;
 
 	/*
-	 * Check ->nrbufs without the inode lock first. This function
+	 * Check the pipe occupancy without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs)
+	if (pipe->head != pipe->tail)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (!pipe->nrbufs) {
+	while (pipe->head == pipe->tail) {
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
@@ -1483,13 +1497,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	 * Check ->nrbufs without the inode lock first. This function
 	 * is speculative anyways, so missing one is ok.
 	 */
-	if (pipe->nrbufs < pipe->buffers)
+	if (pipe->head - pipe->tail >= pipe->max_usage)
 		return 0;
 
 	ret = 0;
 	pipe_lock(pipe);
 
-	while (pipe->nrbufs >= pipe->buffers) {
+	while (pipe->head - pipe->tail >= pipe->max_usage) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			ret = -EPIPE;
@@ -1520,7 +1534,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			       size_t len, unsigned int flags)
 {
 	struct pipe_buffer *ibuf, *obuf;
-	int ret = 0, nbuf;
+	unsigned int i_head, o_head;
+	unsigned int i_tail, o_tail;
+	unsigned int i_mask, o_mask;
+	int ret = 0;
 	bool input_wakeup = false;
 
 
@@ -1540,7 +1557,14 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 	 */
 	pipe_double_lock(ipipe, opipe);
 
+	i_tail = ipipe->tail;
+	i_mask = ipipe->ring_size - 1;
+	o_head = opipe->head;
+	o_mask = opipe->ring_size - 1;
+
 	do {
+		size_t o_len;
+
 		if (!opipe->readers) {
 			send_sig(SIGPIPE, current, 0);
 			if (!ret)
@@ -1548,14 +1572,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			break;
 		}
 
-		if (!ipipe->nrbufs && !ipipe->writers)
+		i_head = ipipe->head;
+		o_tail = opipe->tail;
+
+		if (i_head == i_tail && !ipipe->writers)
 			break;
 
 		/*
 		 * Cannot make any progress, because either the input
 		 * pipe is empty or the output pipe is full.
 		 */
-		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+		if (i_head == i_tail || o_head - o_tail >= opipe->max_usage) {
 			/* Already processed some buffers, break */
 			if (ret)
 				break;
@@ -1575,9 +1602,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			goto retry;
 		}
 
-		ibuf = ipipe->bufs + ipipe->curbuf;
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
-		obuf = opipe->bufs + nbuf;
+		ibuf = &ipipe->bufs[i_tail & i_mask];
+		obuf = &opipe->bufs[o_head & o_mask];
 
 		if (len >= ibuf->len) {
 			/*
@@ -1585,10 +1611,12 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			 */
 			*obuf = *ibuf;
 			ibuf->ops = NULL;
-			opipe->nrbufs++;
-			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
-			ipipe->nrbufs--;
+			i_tail++;
+			pipe_commit_read(ipipe, i_tail);
 			input_wakeup = true;
+			o_len = obuf->len;
+			o_head++;
+			pipe_commit_write(opipe, o_head);
 		} else {
 			/*
 			 * Get a reference to this pipe buffer,
@@ -1610,12 +1638,14 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 			pipe_buf_mark_unmergeable(obuf);
 
 			obuf->len = len;
-			opipe->nrbufs++;
-			ibuf->offset += obuf->len;
-			ibuf->len -= obuf->len;
+			ibuf->offset += len;
+			ibuf->len -= len;
+			o_len = len;
+			o_head++;
+			pipe_commit_write(opipe, o_head);
 		}
-		ret += obuf->len;
-		len -= obuf->len;
+		ret += o_len;
+		len -= o_len;
 	} while (len);
 
 	pipe_unlock(ipipe);
@@ -1641,7 +1671,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		     size_t len, unsigned int flags)
 {
 	struct pipe_buffer *ibuf, *obuf;
-	int ret = 0, i = 0, nbuf;
+	unsigned int i_head, o_head;
+	unsigned int i_tail, o_tail;
+	unsigned int i_mask, o_mask;
+	int ret = 0, i = 0;
 
 	/*
 	 * Potential ABBA deadlock, work around it by ordering lock
@@ -1650,6 +1683,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 	 */
 	pipe_double_lock(ipipe, opipe);
 
+	i_tail = ipipe->tail;
+	i_mask = ipipe->ring_size - 1;
+	o_head = opipe->head;
+	o_mask = opipe->ring_size - 1;
+
 	do {
 		if (!opipe->readers) {
 			send_sig(SIGPIPE, current, 0);
@@ -1658,15 +1696,18 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 			break;
 		}
 
+		i_head = ipipe->head;
+		o_tail = opipe->tail;
+
 		/*
-		 * If we have iterated all input buffers or ran out of
+		 * If we have iterated all input buffers or run out of
 		 * output room, break.
 		 */
-		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
+		if (i_head != i_tail || o_head - o_tail >= opipe->max_usage)
 			break;
 
-		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
-		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+		ibuf = &ipipe->bufs[i_tail & i_mask];
+		obuf = &opipe->bufs[o_head & o_mask];
 
 		/*
 		 * Get a reference to this pipe buffer,
@@ -1678,7 +1719,6 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 			break;
 		}
 
-		obuf = opipe->bufs + nbuf;
 		*obuf = *ibuf;
 
 		/*
@@ -1691,10 +1731,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 
 		if (obuf->len > len)
 			obuf->len = len;
-
-		opipe->nrbufs++;
 		ret += obuf->len;
 		len -= obuf->len;
+
+		o_head++;
+		pipe_commit_write(opipe, o_head);
 		i++;
 	} while (len);
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5c626fdc10db..ec82c5374410 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -30,9 +30,10 @@ struct pipe_buffer {
  *	struct pipe_inode_info - a linux kernel pipe
  *	@mutex: mutex protecting the whole thing
  *	@wait: reader/writer wait point in case of empty/full pipe
- *	@nrbufs: the number of non-empty pipe buffers in this pipe
- *	@buffers: total number of buffers (should be a power of 2)
- *	@curbuf: the current pipe buffer entry
+ *	@head: The point of buffer production
+ *	@tail: The point of buffer consumption
+ *	@max_usage: The maximum number of slots that may be used in the ring
+ *	@ring_size: total number of buffers (should be a power of 2)
  *	@tmp_page: cached released page
  *	@readers: number of current readers of this pipe
  *	@writers: number of current writers of this pipe
@@ -48,7 +49,10 @@ struct pipe_buffer {
 struct pipe_inode_info {
 	struct mutex mutex;
 	wait_queue_head_t wait;
-	unsigned int nrbufs, curbuf, buffers;
+	unsigned int head;
+	unsigned int tail;
+	unsigned int max_usage;
+	unsigned int ring_size;
 	unsigned int readers;
 	unsigned int writers;
 	unsigned int files;
@@ -104,6 +108,32 @@ struct pipe_buf_operations {
 	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
+/**
+ * pipe_commit_read - Set pipe buffer tail pointer in the read-side
+ * @pipe: The pipe in question
+ * @tail: The new tail pointer
+ *
+ * Update the tail pointer in the read-side code after a read has taken place.
+ */
+static inline void pipe_commit_read(struct pipe_inode_info *pipe,
+				    unsigned int tail)
+{
+	pipe->tail = tail;
+}
+
+/**
+ * pipe_commit_write - Set pipe buffer head pointer in the write-side
+ * @pipe: The pipe in question
+ * @head: The new head pointer
+ *
+ * Update the head pointer in the write-side code after a write has taken place.
+ */
+static inline void pipe_commit_write(struct pipe_inode_info *pipe,
+				     unsigned int head)
+{
+	pipe->head = head;
+}
+
 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ab5f523bc0df..9576fd8158d7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -45,8 +45,8 @@ struct iov_iter {
 	union {
 		unsigned long nr_segs;
 		struct {
-			int idx;
-			int start_idx;
+			unsigned int head;
+			unsigned int start_head;
 		};
 	};
 };
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 639d5e7014c1..c8a046c87e3f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -325,28 +325,33 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
 static bool sanity(const struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	int idx = i->idx;
-	int next = pipe->curbuf + pipe->nrbufs;
+	unsigned int p_head = pipe->head;
+	unsigned int p_tail = pipe->tail;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int p_occupancy = p_head - p_tail;
+	unsigned int i_head = i->head;
+	unsigned int idx;
+
 	if (i->iov_offset) {
 		struct pipe_buffer *p;
-		if (unlikely(!pipe->nrbufs))
+		if (unlikely(p_occupancy == 0))
 			goto Bad;	// pipe must be non-empty
-		if (unlikely(idx != ((next - 1) & (pipe->buffers - 1))))
+		if (unlikely(i_head != p_head - 1))
 			goto Bad;	// must be at the last buffer...
 
-		p = &pipe->bufs[idx];
+		p = &pipe->bufs[i_head & p_mask];
 		if (unlikely(p->offset + p->len != i->iov_offset))
 			goto Bad;	// ... at the end of segment
 	} else {
-		if (idx != (next & (pipe->buffers - 1)))
+		if (i_head != p_head)
 			goto Bad;	// must be right after the last buffer
 	}
 	return true;
 Bad:
-	printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset);
-	printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n",
-			pipe->curbuf, pipe->nrbufs, pipe->buffers);
-	for (idx = 0; idx < pipe->buffers; idx++)
+	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
+	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
+			p_head, p_tail, pipe->ring_size);
+	for (idx = 0; idx < pipe->ring_size; idx++)
 		printk(KERN_ERR "[%p %p %d %d]\n",
 			pipe->bufs[idx].ops,
 			pipe->bufs[idx].page,
@@ -359,18 +364,15 @@ static bool sanity(const struct iov_iter *i)
 #define sanity(i) true
 #endif
 
-static inline int next_idx(int idx, struct pipe_inode_info *pipe)
-{
-	return (idx + 1) & (pipe->buffers - 1);
-}
-
 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
 	struct pipe_buffer *buf;
+	unsigned int p_tail = pipe->tail;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head = i->head;
 	size_t off;
-	int idx;
 
 	if (unlikely(bytes > i->count))
 		bytes = i->count;
@@ -382,8 +384,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 		return 0;
 
 	off = i->iov_offset;
-	idx = i->idx;
-	buf = &pipe->bufs[idx];
+	buf = &pipe->bufs[i_head & p_mask];
 	if (off) {
 		if (offset == off && buf->page == page) {
 			/* merge with the last one */
@@ -391,18 +392,21 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
 			i->iov_offset += bytes;
 			goto out;
 		}
-		idx = next_idx(idx, pipe);
-		buf = &pipe->bufs[idx];
+		i_head++;
+		buf = &pipe->bufs[i_head & p_mask];
 	}
-	if (idx == pipe->curbuf && pipe->nrbufs)
+	if (i_head - p_tail >= pipe->max_usage)
 		return 0;
-	pipe->nrbufs++;
+
 	buf->ops = &page_cache_pipe_buf_ops;
-	get_page(buf->page = page);
+	get_page(page);
+	buf->page = page;
 	buf->offset = offset;
 	buf->len = bytes;
+
+	pipe_commit_read(pipe, i_head);
 	i->iov_offset = offset + bytes;
-	i->idx = idx;
+	i->head = i_head;
 out:
 	i->count -= bytes;
 	return bytes;
@@ -480,24 +484,30 @@ static inline bool allocated(struct pipe_buffer *buf)
 	return buf->ops == &default_pipe_buf_ops;
 }
 
-static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
+static inline void data_start(const struct iov_iter *i, unsigned int *i_headp,
+			      size_t *offp)
 {
+	unsigned int p_mask = i->pipe->ring_size - 1;
+	unsigned int i_head = i->head;
 	size_t off = i->iov_offset;
-	int idx = i->idx;
-	if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
-		idx = next_idx(idx, i->pipe);
+
+	if (off && (!allocated(&i->pipe->bufs[i_head & p_mask]) ||
+		    off == PAGE_SIZE)) {
+		i_head++;
 		off = 0;
 	}
-	*idxp = idx;
+	*i_headp = i_head;
 	*offp = off;
 }
 
 static size_t push_pipe(struct iov_iter *i, size_t size,
-			int *idxp, size_t *offp)
+			int *i_headp, size_t *offp)
 {
 	struct pipe_inode_info *pipe = i->pipe;
+	unsigned int p_tail = pipe->tail;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head;
 	size_t off;
-	int idx;
 	ssize_t left;
 
 	if (unlikely(size > i->count))
@@ -506,33 +516,34 @@ static size_t push_pipe(struct iov_iter *i, size_t size,
 		return 0;
 
 	left = size;
-	data_start(i, &idx, &off);
-	*idxp = idx;
+	data_start(i, &i_head, &off);
+	*i_headp = i_head;
 	*offp = off;
 	if (off) {
 		left -= PAGE_SIZE - off;
 		if (left <= 0) {
-			pipe->bufs[idx].len += size;
+			pipe->bufs[i_head & p_mask].len += size;
 			return size;
 		}
-		pipe->bufs[idx].len = PAGE_SIZE;
-		idx = next_idx(idx, pipe);
+		pipe->bufs[i_head & p_mask].len = PAGE_SIZE;
+		i_head++;
 	}
-	while (idx != pipe->curbuf || !pipe->nrbufs) {
+	while (i_head <= p_tail + pipe->max_usage) {
+		struct pipe_buffer *buf = &pipe->bufs[i_head & p_mask];
 		struct page *page = alloc_page(GFP_USER);
 		if (!page)
 			break;
-		pipe->nrbufs++;
-		pipe->bufs[idx].ops = &default_pipe_buf_ops;
-		pipe->bufs[idx].page = page;
-		pipe->bufs[idx].offset = 0;
-		if (left <= PAGE_SIZE) {
-			pipe->bufs[idx].len = left;
+
+		buf->ops = &default_pipe_buf_ops;
+		buf->page = page;
+		buf->offset = 0;
+		buf->len = max_t(ssize_t, left, PAGE_SIZE);
+		left -= buf->len;
+		i_head++;
+		pipe_commit_write(pipe, i_head);
+
+		if (left == 0)
 			return size;
-		}
-		pipe->bufs[idx].len = PAGE_SIZE;
-		left -= PAGE_SIZE;
-		idx = next_idx(idx, pipe);
 	}
 	return size - left;
 }
@@ -541,23 +552,26 @@ static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
 				struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head;
 	size_t n, off;
-	int idx;
 
 	if (!sanity(i))
 		return 0;
 
-	bytes = n = push_pipe(i, bytes, &idx, &off);
+	bytes = n = push_pipe(i, bytes, &i_head, &off);
 	if (unlikely(!n))
 		return 0;
-	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+	do {
 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
-		memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk);
-		i->idx = idx;
+		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
+		i->head = i_head;
 		i->iov_offset = off + chunk;
 		n -= chunk;
 		addr += chunk;
-	}
+		off = 0;
+		i_head++;
+	} while (n);
 	i->count -= bytes;
 	return bytes;
 }
@@ -573,28 +587,31 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
 				__wsum *csum, struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head;
 	size_t n, r;
 	size_t off = 0;
 	__wsum sum = *csum;
-	int idx;
 
 	if (!sanity(i))
 		return 0;
 
-	bytes = n = push_pipe(i, bytes, &idx, &r);
+	bytes = n = push_pipe(i, bytes, &i_head, &r);
 	if (unlikely(!n))
 		return 0;
-	for ( ; n; idx = next_idx(idx, pipe), r = 0) {
+	do {
 		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
-		char *p = kmap_atomic(pipe->bufs[idx].page);
+		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
 		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
 		kunmap_atomic(p);
-		i->idx = idx;
+		i->head = i_head;
 		i->iov_offset = r + chunk;
 		n -= chunk;
 		off += chunk;
 		addr += chunk;
-	}
+		r = 0;
+		i_head++;
+	} while (n);
 	i->count -= bytes;
 	*csum = sum;
 	return bytes;
@@ -645,29 +662,32 @@ static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes,
 				struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head;
 	size_t n, off, xfer = 0;
-	int idx;
 
 	if (!sanity(i))
 		return 0;
 
-	bytes = n = push_pipe(i, bytes, &idx, &off);
+	bytes = n = push_pipe(i, bytes, &i_head, &off);
 	if (unlikely(!n))
 		return 0;
-	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+	do {
 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
 		unsigned long rem;
 
-		rem = memcpy_mcsafe_to_page(pipe->bufs[idx].page, off, addr,
-				chunk);
-		i->idx = idx;
+		rem = memcpy_mcsafe_to_page(pipe->bufs[i_head & p_mask].page,
+					    off, addr, chunk);
+		i->head = i_head;
 		i->iov_offset = off + chunk - rem;
 		xfer += chunk - rem;
 		if (rem)
 			break;
 		n -= chunk;
 		addr += chunk;
-	}
+		off = 0;
+		i_head++;
+	} while (n);
 	i->count -= xfer;
 	return xfer;
 }
@@ -925,6 +945,8 @@ EXPORT_SYMBOL(copy_page_from_iter);
 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
+	unsigned int p_mask = pipe->ring_size - 1;
+	unsigned int i_head;
 	size_t n, off;
 	int idx;
 
@@ -935,13 +957,15 @@ static size_t pipe_zero(size_t bytes, struct iov_iter *i)
 	if (unlikely(!n))
 		return 0;
 
-	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+	do {
 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
-		memzero_page(pipe->bufs[idx].page, off, chunk);
-		i->idx = idx;
+		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
+		i->head = i_head;
 		i->iov_offset = off + chunk;
 		n -= chunk;
-	}
+		off = 0;
+		i_head++;
+	} while (n);
 	i->count -= bytes;
 	return bytes;
 }
@@ -987,20 +1011,26 @@ EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
 static inline void pipe_truncate(struct iov_iter *i)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	if (pipe->nrbufs) {
+	unsigned int p_tail = pipe->tail;
+	unsigned int p_head = pipe->head;
+	unsigned int p_mask = pipe->ring_size - 1;
+
+	if (p_head != p_tail) {
+		struct pipe_buffer *buf;
+		unsigned int i_head = i->head;
 		size_t off = i->iov_offset;
-		int idx = i->idx;
-		int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1);
+
 		if (off) {
-			pipe->bufs[idx].len = off - pipe->bufs[idx].offset;
-			idx = next_idx(idx, pipe);
-			nrbufs++;
+			buf = &pipe->bufs[i_head & p_mask];
+			buf->len = off - buf->offset;
+			i_head++;
 		}
-		while (pipe->nrbufs > nrbufs) {
-			pipe_buf_release(pipe, &pipe->bufs[idx]);
-			idx = next_idx(idx, pipe);
-			pipe->nrbufs--;
+		while (p_head != i_head) {
+			p_head--;
+			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
 		}
+
+		pipe_commit_write(pipe, p_head);
 	}
 }
 
@@ -1011,18 +1041,20 @@ static void pipe_advance(struct iov_iter *i, size_t size)
 		size = i->count;
 	if (size) {
 		struct pipe_buffer *buf;
+		unsigned int p_mask = pipe->ring_size - 1;
+		unsigned int i_head = i->head;
 		size_t off = i->iov_offset, left = size;
-		int idx = i->idx;
+
 		if (off) /* make it relative to the beginning of buffer */
-			left += off - pipe->bufs[idx].offset;
+			left += off - pipe->bufs[i_head & p_mask].offset;
 		while (1) {
-			buf = &pipe->bufs[idx];
+			buf = &pipe->bufs[i_head & p_mask];
 			if (left <= buf->len)
 				break;
 			left -= buf->len;
-			idx = next_idx(idx, pipe);
+			i_head++;
 		}
-		i->idx = idx;
+		i->head = i_head;
 		i->iov_offset = buf->offset + left;
 	}
 	i->count -= size;
@@ -1053,25 +1085,27 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
 	i->count += unroll;
 	if (unlikely(iov_iter_is_pipe(i))) {
 		struct pipe_inode_info *pipe = i->pipe;
-		int idx = i->idx;
+		unsigned int p_mask = pipe->ring_size - 1;
+		unsigned int i_head = i->head;
 		size_t off = i->iov_offset;
 		while (1) {
-			size_t n = off - pipe->bufs[idx].offset;
+			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
+			size_t n = off - b->offset;
 			if (unroll < n) {
 				off -= unroll;
 				break;
 			}
 			unroll -= n;
-			if (!unroll && idx == i->start_idx) {
+			if (!unroll && i_head == i->start_head) {
 				off = 0;
 				break;
 			}
-			if (!idx--)
-				idx = pipe->buffers - 1;
-			off = pipe->bufs[idx].offset + pipe->bufs[idx].len;
+			i_head--;
+			b = &pipe->bufs[i_head & p_mask];
+			off = b->offset + b->len;
 		}
 		i->iov_offset = off;
-		i->idx = idx;
+		i->head = i_head;
 		pipe_truncate(i);
 		return;
 	}
@@ -1159,13 +1193,13 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 			size_t count)
 {
 	BUG_ON(direction != READ);
-	WARN_ON(pipe->nrbufs == pipe->buffers);
+	WARN_ON(pipe->head - pipe->tail >= pipe->ring_size);
 	i->type = ITER_PIPE | READ;
 	i->pipe = pipe;
-	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+	i->head = pipe->head;
 	i->iov_offset = 0;
 	i->count = count;
-	i->start_idx = i->idx;
+	i->start_head = i->head;
 }
 EXPORT_SYMBOL(iov_iter_pipe);
 
@@ -1189,11 +1223,12 @@ EXPORT_SYMBOL(iov_iter_discard);
 
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
+	unsigned int p_mask = i->pipe->ring_size - 1;
 	unsigned long res = 0;
 	size_t size = i->count;
 
 	if (unlikely(iov_iter_is_pipe(i))) {
-		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
+		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
 			return size | i->iov_offset;
 		return size;
 	}
@@ -1231,19 +1266,20 @@ EXPORT_SYMBOL(iov_iter_gap_alignment);
 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 				size_t maxsize,
 				struct page **pages,
-				int idx,
+				int i_head,
 				size_t *start)
 {
 	struct pipe_inode_info *pipe = i->pipe;
-	ssize_t n = push_pipe(i, maxsize, &idx, start);
+	unsigned int p_mask = pipe->ring_size - 1;
+	ssize_t n = push_pipe(i, maxsize, &i_head, start);
 	if (!n)
 		return -EFAULT;
 
 	maxsize = n;
 	n += *start;
 	while (n > 0) {
-		get_page(*pages++ = pipe->bufs[idx].page);
-		idx = next_idx(idx, pipe);
+		get_page(*pages++ = pipe->bufs[i_head & p_mask].page);
+		i_head++;
 		n -= PAGE_SIZE;
 	}
 
@@ -1254,9 +1290,10 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
+	unsigned int p_tail;
+	unsigned int i_head;
 	unsigned npages;
 	size_t capacity;
-	int idx;
 
 	if (!maxsize)
 		return 0;
@@ -1264,12 +1301,15 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &idx, start);
-	/* some of this one + all after this one */
-	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
-	capacity = min(npages,maxpages) * PAGE_SIZE - *start;
+	data_start(i, &i_head, start);
+	p_tail = i->pipe->tail;
+	/* Amount of free space: some of this one + all after this one */
+	npages = (p_tail + i->pipe->ring_size) - i_head;
+	if (npages > i->pipe->max_usage)
+		npages = i->pipe->max_usage;
+	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
 
-	return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start);
+	return __pipe_get_pages(i, min(maxsize, capacity), pages, i_head, start);
 }
 
 ssize_t iov_iter_get_pages(struct iov_iter *i,
@@ -1323,8 +1363,9 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 		   size_t *start)
 {
 	struct page **p;
+	unsigned int p_tail;
+	unsigned int i_head;
 	ssize_t n;
-	int idx;
 	int npages;
 
 	if (!maxsize)
@@ -1333,9 +1374,11 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	if (!sanity(i))
 		return -EFAULT;
 
-	data_start(i, &idx, start);
-	/* some of this one + all after this one */
-	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+	data_start(i, &i_head, start);
+	/* Amount of free space: some of this one + all after this one */
+	npages = (p_tail + i->pipe->ring_size) - i_head;
+	if (npages > i->pipe->max_usage)
+		npages = i->pipe->max_usage;
 	n = npages * PAGE_SIZE - *start;
 	if (maxsize > n)
 		maxsize = n;
@@ -1344,7 +1387,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
 	p = get_pages_array(npages);
 	if (!p)
 		return -ENOMEM;
-	n = __pipe_get_pages(i, maxsize, p, idx, start);
+	n = __pipe_get_pages(i, maxsize, p, i_head, start);
 	if (n > 0)
 		*pages = p;
 	else
@@ -1560,15 +1603,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 
 	if (unlikely(iov_iter_is_pipe(i))) {
 		struct pipe_inode_info *pipe = i->pipe;
+		unsigned int p_tail = pipe->tail;
+		unsigned int i_head;
 		size_t off;
-		int idx;
 
 		if (!sanity(i))
 			return 0;
 
-		data_start(i, &idx, &off);
+		data_start(i, &i_head, &off);
 		/* some of this one + all after this one */
-		npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+		npages = (p_tail + pipe->ring_size) - i_head;
+		if (npages > i->pipe->max_usage)
+			npages = i->pipe->max_usage;
 		if (npages >= maxpages)
 			return maxpages;
 	} else iterate_all_kinds(i, size, v, ({


^ permalink raw reply related

* [RFC PATCH 02/21] Add a prelocked wake-up
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Add a wakeup call for a case whereby the caller already has the waitqueue
spinlock held.  This can be used by pipes to alter the ring buffer indices
under the spinlock.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 include/linux/wait.h |    2 ++
 kernel/sched/wait.c  |    7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..d511b298a20c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -229,6 +229,8 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 	__wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
 #define wake_up_interruptible_sync_poll(x, m)					\
 	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))
+void prelocked_wake_up_interruptible_sync_poll(struct wait_queue_head *wq_head,
+					       __poll_t mask);
 
 #define ___wait_cond_timeout(condition)						\
 ({										\
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..43fbbbe9af27 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -126,6 +126,13 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
 	} while (bookmark.flags & WQ_FLAG_BOOKMARK);
 }
 
+void prelocked_wake_up_interruptible_sync_poll(struct wait_queue_head *wq_head,
+					       __poll_t mask)
+{
+	__wake_up_common(wq_head, TASK_INTERRUPTIBLE, 1, WF_SYNC,
+			 poll_to_key(mask), NULL);
+}
+
 /**
  * __wake_up - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue


^ permalink raw reply related

* [RFC PATCH 01/21] pipe: Reduce #inclusion of pipe_fs_i.h
From: David Howells @ 2019-10-15 21:47 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Remove some #inclusions of linux/pipe_fs_i.h that don't seem to be
necessary any more.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/exec.c                  |    1 -
 fs/ocfs2/aops.c            |    1 -
 security/smack/smack_lsm.c |    1 -
 3 files changed, 3 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 555e93c7dec8..57bc7ef8d31b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
-#include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
 #include <linux/vmalloc.h>
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..c50ac6b7415b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -11,7 +11,6 @@
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
 #include <linux/swap.h>
-#include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index abeb09c30633..ecea41ce919b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -28,7 +28,6 @@
 #include <linux/icmpv6.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
-#include <linux/pipe_fs_i.h>
 #include <net/cipso_ipv4.h>
 #include <net/ip.h>
 #include <net/ipv6.h>


^ permalink raw reply related

* [RFC PATCH 00/21] pipe: Keyrings, Block and USB notifications
From: David Howells @ 2019-10-15 21:47 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
	linux-usb, linux-block, linux-security-module, linux-fsdevel,
	linux-api, linux-security-module, linux-kernel


Here's a set of patches to add a general notification queue concept and to
add event sources such as:

 (1) Keys/keyrings, such as linking and unlinking keys and changing their
     attributes.

 (2) General device events (single common queue) including:

     - Block layer events, such as device errors

     - USB subsystem events, such as device attach/remove, device reset,
       device errors.

I have patches for adding superblock and mount topology watches also,
though those are not in this set as there are other dependencies.

LSM hooks are included:

 (1) A set of hooks are provided that allow an LSM to rule on whether or
     not a watch may be set.  Each of these hooks takes a different
     "watched object" parameter, so they're not really shareable.  The LSM
     should use current's credentials.  [Wanted by SELinux & Smack]

 (2) A hook is provided to allow an LSM to rule on whether or not a
     particular message may be posted to a particular queue.  This is given
     the credentials from the event generator (which may be the system) and
     the watch setter.  [Wanted by Smack]

I've provided SELinux and Smack with implementations of some of these hooks.


Design decisions:

 (1) The notification queue is built on top of a standard pipe.  Messages are
     effectively spliced in.

	pipe2(fds, O_TMPFILE); // Note that O_TMPFILE is just hacked in atm

     which is then configured::

	ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, queue_depth);
	ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);

     Messages are then read out of the pipe using read().

 (2) It should be possible to allow write() to insert data into the
     notification pipes too, but this is currently disabled as the kernel has
     to be able to insert messages into the pipe *without* holding pipe->mutex
     and the code to make this work needs careful auditing.

 (3) sendfile(), splice() and vmsplice() are disabled on notification pipes
     because of the pipe->mutex issue and also because they sometimes want to
     revert what they just did - but one or more notification messages
     might've been interleaved in the ring.

 (4) The kernel inserts messages with the wait queue spinlock held.  This
     means that pipe_read() and pipe_write() have to take the spinlock to
     update the queue pointers.

 (5) Records in the buffer are binary, typed and have a length so that they
     can be of varying size.

     This allows multiple heterogeneous sources to share a common buffer;
     there are 16 million types available, of which I've used just a few,
     so there is scope for others to be used.  Tags may be specified when a
     watchpoint is created to help distinguish the sources.

 (6) Records are filterable as types have up to 256 subtypes that can be
     individually filtered.  Other filtration is also available.

 (7) Notification pipes don't interfere with each other; each may be bound to
     a different set of watches.  Any particular notification will be copied
     to all the queues that are currently watching for it - and only those
     that are watching for it.

 (8) When recording a notification, the kernel will not sleep, but will rather
     mark a queue as having lost a message if there's insufficient space.
     read() will fabricate a loss notification message at an appropriate point
     later.

 (9) The notification pipe is created and then watchpoints are attached to it,
     using one of:

	keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
	watch_devices(fds[1], 0x02, 0);

     where in both cases, fd indicates the queue and the number after is a
     tag between 0 and 255.

(10) Watches are removed if either the notification pipe is destroyed or the
     watched object is destroyed.  In the latter case, a message will be
     generated indicating the enforced watch removal.


Things I want to avoid:

 (1) Introducing features that make the core VFS dependent on the network
     stack or networking namespaces (ie. usage of netlink).

 (2) Dumping all this stuff into dmesg and having a daemon that sits there
     parsing the output and distributing it as this then puts the
     responsibility for security into userspace and makes handling
     namespaces tricky.  Further, dmesg might not exist or might be
     inaccessible inside a container.

 (3) Letting users see events they shouldn't be able to see.


Benchmarks:

PATCHES BENCHMARK	BEST		TOTAL BYTES	AVG BYTES	STDDEV
======= =============== =============== =============== =============== ===============
0	pipe		      305798752	    36324188187	      302701568		6128384
0	vmsplice	      432261032	    49377769035	      411481408	       60638553
0	splice		      284943309	    27769149338	      231409577	      171374033

prelock pipe		      307500893	    36470120589	      303917671		9970065
prelock vmsplice	      437991897	    51502859114	      429190492	       22841773
prelock splice		      285469128	    28158931451	      234657762	      138350254

ht	pipe		      304526362	    35934813852	      299456782	       14371537
ht	vmsplice	      437819690	    51481865190	      429015543	       25713854
ht	splice		      242954439	    23296773094	      194139775	      148906490

r	pipe		      306166161	    36117069156	      300975576	       11493141
r	vmsplice	      335931911	    39837793527	      331981612		9098195
r	splice		      282334654	    27472565449	      228938045	      219399557

rx	pipe		      304664843	    36059231110	      300493592		9800700
rx	vmsplice	      446602247	    52216984680	      435141539	       31456017
rx	splice		      277978890	    26516522263	      220971018	      199170870

rxw	pipe		      304180415	    36317294286	      302644119		5967557
rxw	vmsplice	      447361082	    51238998643	      426991655	       62191270
rxw	splice		      277213923	    27627387718	      230228230	      173675370

rxwx	pipe		      305390974	    36110185475	      300918212	       11895583
rxwx	vmsplice	      445372781	    50668744616	      422239538	       63133543
rxwx	splice		      280218603	    27677264384	      230643869	      196958675

rxwxf	pipe		      305001592	    36325565843	      302713048		7551300
rxwxf	vmsplice	      444316544	    50213093070	      418442442	       77117994
rxwxf	splice		      278371338	    28859765396	      240498044	      160245812

all	pipe		      298858861	    35543407781	      296195064		5173617
all	vmsplice	      453414388	    53792295991	      448269133	       10914547
all	splice		      279264055	    26812726990	      223439391	      212421416

The patches column indicates the point in the patchset at which the benchmarks
were taken:

	0	No patches
	prelock "Add a prelocked wake-up"
	hr	"pipe: Use head and tail pointers for the ring, not cursor and length"
	r	"pipe: Advance tail pointer inside of wait spinlock in pipe_read()"
	rx	"pipe: Conditionalise wakeup in pipe_read()"
	rxw	"pipe: Rearrange sequence in pipe_write() to preallocate slot"
	rxwx	"pipe: Remove redundant wakeup from pipe_write()"
	rxwxf	"pipe: Check for ring full inside of the spinlock in pipe_write()"
	all	All of the patches

The benchmark programs can be found here:

	http://people.redhat.com/dhowells/pipe-bench.c
	http://people.redhat.com/dhowells/vmsplice-bench.c
	http://people.redhat.com/dhowells/splice-bench.c

The patches can be found here also:

	http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=pipe-experimental

Changes:

 ver #1:

 (*) Build on top of standard pipes instead of having a driver.

David
---
David Howells (21):
      pipe: Reduce #inclusion of pipe_fs_i.h
      Add a prelocked wake-up
      pipe: Use head and tail pointers for the ring, not cursor and length
      pipe: Advance tail pointer inside of wait spinlock in pipe_read()
      pipe: Conditionalise wakeup in pipe_read()
      pipe: Rearrange sequence in pipe_write() to preallocate slot
      pipe: Remove redundant wakeup from pipe_write()
      pipe: Check for ring full inside of the spinlock in pipe_write()
      uapi: General notification queue definitions
      security: Add hooks to rule on setting a watch
      security: Add a hook for the point of notification insertion
      pipe: Add general notification queue support
      keys: Add a notification facility
      Add sample notification program
      pipe: Allow buffers to be marked read-whole-or-error for notifications
      pipe: Add notification lossage handling
      Add a general, global device notification watch list
      block: Add block layer notifications
      usb: Add USB subsystem notifications
      selinux: Implement the watch_key security hook
      smack: Implement the watch_key and post_notification hooks


 Documentation/ioctl/ioctl-number.rst        |    1 
 Documentation/security/keys/core.rst        |   58 ++
 Documentation/watch_queue.rst               |  385 ++++++++++++++++
 arch/alpha/kernel/syscalls/syscall.tbl      |    1 
 arch/arm/tools/syscall.tbl                  |    1 
 arch/arm64/include/asm/unistd.h             |    2 
 arch/arm64/include/asm/unistd32.h           |    2 
 arch/ia64/kernel/syscalls/syscall.tbl       |    1 
 arch/m68k/kernel/syscalls/syscall.tbl       |    1 
 arch/microblaze/kernel/syscalls/syscall.tbl |    1 
 arch/mips/kernel/syscalls/syscall_n32.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_n64.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_o32.tbl   |    1 
 arch/parisc/kernel/syscalls/syscall.tbl     |    1 
 arch/powerpc/kernel/syscalls/syscall.tbl    |    1 
 arch/s390/kernel/syscalls/syscall.tbl       |    1 
 arch/sh/kernel/syscalls/syscall.tbl         |    1 
 arch/sparc/kernel/syscalls/syscall.tbl      |    1 
 arch/x86/entry/syscalls/syscall_32.tbl      |    1 
 arch/x86/entry/syscalls/syscall_64.tbl      |    1 
 arch/xtensa/kernel/syscalls/syscall.tbl     |    1 
 block/Kconfig                               |    9 
 block/blk-core.c                            |   29 +
 drivers/base/Kconfig                        |    9 
 drivers/base/Makefile                       |    1 
 drivers/base/watch.c                        |   90 ++++
 drivers/usb/core/Kconfig                    |    9 
 drivers/usb/core/devio.c                    |   47 ++
 drivers/usb/core/hub.c                      |    4 
 fs/exec.c                                   |    1 
 fs/fuse/dev.c                               |   31 +
 fs/ocfs2/aops.c                             |    1 
 fs/pipe.c                                   |  392 +++++++++++-----
 fs/splice.c                                 |  195 +++++---
 include/linux/blkdev.h                      |   15 +
 include/linux/device.h                      |    7 
 include/linux/key.h                         |    3 
 include/linux/lsm_audit.h                   |    1 
 include/linux/lsm_hooks.h                   |   38 ++
 include/linux/pipe_fs_i.h                   |   63 ++-
 include/linux/security.h                    |   32 +
 include/linux/syscalls.h                    |    1 
 include/linux/uio.h                         |    4 
 include/linux/usb.h                         |   18 +
 include/linux/wait.h                        |    2 
 include/linux/watch_queue.h                 |  127 +++++
 include/uapi/asm-generic/unistd.h           |    4 
 include/uapi/linux/keyctl.h                 |    2 
 include/uapi/linux/watch_queue.h            |  155 ++++++
 init/Kconfig                                |   12 
 kernel/Makefile                             |    1 
 kernel/sched/wait.c                         |    7 
 kernel/sys_ni.c                             |    1 
 kernel/watch_queue.c                        |  660 +++++++++++++++++++++++++++
 lib/iov_iter.c                              |  274 +++++++----
 samples/Kconfig                             |    7 
 samples/Makefile                            |    1 
 samples/watch_queue/Makefile                |    7 
 samples/watch_queue/watch_test.c            |  252 ++++++++++
 security/keys/Kconfig                       |    9 
 security/keys/compat.c                      |    3 
 security/keys/gc.c                          |    5 
 security/keys/internal.h                    |   30 +
 security/keys/key.c                         |   38 +-
 security/keys/keyctl.c                      |   99 ++++
 security/keys/keyring.c                     |   20 +
 security/keys/request_key.c                 |    4 
 security/security.c                         |   23 +
 security/selinux/hooks.c                    |   14 +
 security/smack/smack_lsm.c                  |   83 +++
 70 files changed, 2927 insertions(+), 377 deletions(-)
 create mode 100644 Documentation/watch_queue.rst
 create mode 100644 drivers/base/watch.c
 create mode 100644 include/linux/watch_queue.h
 create mode 100644 include/uapi/linux/watch_queue.h
 create mode 100644 kernel/watch_queue.c
 create mode 100644 samples/watch_queue/Makefile
 create mode 100644 samples/watch_queue/watch_test.c


^ permalink raw reply

* Re: [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Stephen Smalley @ 2019-10-15 14:35 UTC (permalink / raw)
  To: Joel Fernandes (Google), linux-kernel
  Cc: Peter Zijlstra, rostedt, primiano, rsavitski, jeffv, kernel-team,
	James Morris, Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
	Daniel Borkmann, Ingo Molnar, Jiri Olsa, Kees Cook,
	linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
	Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Yonghong Song
In-Reply-To: <20191014170308.70668-1-joel@joelfernandes.org>

On 10/14/19 1:03 PM, Joel Fernandes (Google) wrote:
> In current mainline, the degree of access to perf_event_open(2) system
> call depends on the perf_event_paranoid sysctl.  This has a number of
> limitations:
> 
> 1. The sysctl is only a single value. Many types of accesses are controlled
>     based on the single value thus making the control very limited and
>     coarse grained.
> 2. The sysctl is global, so if the sysctl is changed, then that means
>     all processes get access to perf_event_open(2) opening the door to
>     security issues.
> 
> This patch adds LSM and SELinux access checking which will be used in
> Android to access perf_event_open(2) for the purposes of attaching BPF
> programs to tracepoints, perf profiling and other operations from
> userspace. These operations are intended for production systems.
> 
> 5 new LSM hooks are added:
> 1. perf_event_open: This controls access during the perf_event_open(2)
>     syscall itself. The hook is called from all the places that the
>     perf_event_paranoid sysctl is checked to keep it consistent with the
>     systctl. The hook gets passed a 'type' argument which controls CPU,
>     kernel and tracepoint accesses (in this context, CPU, kernel and
>     tracepoint have the same semantics as the perf_event_paranoid sysctl).
>     Additionally, I added an 'open' type which is similar to
>     perf_event_paranoid sysctl == 3 patch carried in Android and several other
>     distros but was rejected in mainline [1] in 2016.
> 
> 2. perf_event_alloc: This allocates a new security object for the event
>     which stores the current SID within the event. It will be useful when
>     the perf event's FD is passed through IPC to another process which may
>     try to read the FD. Appropriate security checks will limit access.
> 
> 3. perf_event_free: Called when the event is closed.
> 
> 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
> 
> 5. perf_event_write: Called from the ioctl(2) syscalls for the event.
> 
> [1] https://lwn.net/Articles/696240/
> 
> Since Peter had suggest LSM hooks in 2016 [1], I am adding his
> Suggested-by tag below.
> 
> To use this patch, we set the perf_event_paranoid sysctl to -1 and then
> apply selinux checking as appropriate (default deny everything, and then
> add policy rules to give access to domains that need it). In the future
> we can remove the perf_event_paranoid sysctl altogether.
> 
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: rostedt@goodmis.org
> Cc: primiano@google.com
> Cc: rsavitski@google.com
> Cc: jeffv@google.com
> Cc: kernel-team@android.com
> Acked-by: James Morris <jmorris@namei.org>
> Co-developed-by: Peter Zijlstra <peterz@infradead.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
> 
> Changes since v1:
>   o Fixes from Peter Ziljstra.
>   o Added Ack from James Morris and Co-developed-by tag for Peter.
> Changes since RFC:
>   o Small nits, style changes (James Morris).
>   o Consolidation of code (Peter Zijlstra).
> 
> 
>   arch/x86/events/intel/bts.c         |  8 ++--
>   arch/x86/events/intel/core.c        |  5 ++-
>   arch/x86/events/intel/p4.c          |  5 ++-
>   include/linux/lsm_hooks.h           | 15 +++++++
>   include/linux/perf_event.h          | 28 +++++++++---
>   include/linux/security.h            | 39 +++++++++++++++-
>   include/uapi/linux/perf_event.h     |  9 ++++
>   kernel/events/core.c                | 57 +++++++++++++++++++-----
>   kernel/trace/trace_event_perf.c     | 15 ++++---
>   security/security.c                 | 27 +++++++++++
>   security/selinux/hooks.c            | 69 +++++++++++++++++++++++++++++
>   security/selinux/include/classmap.h |  2 +
>   security/selinux/include/objsec.h   |  6 ++-
>   13 files changed, 255 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
> index 5ee3fed881d3..38de4a7f6752 100644
> --- a/arch/x86/events/intel/bts.c
> +++ b/arch/x86/events/intel/bts.c
> @@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
>   	 * Note that the default paranoia setting permits unprivileged
>   	 * users to profile the kernel.
>   	 */
> -	if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
> -	    !capable(CAP_SYS_ADMIN))
> -		return -EACCES;
> +	if (event->attr.exclude_kernel) {
> +		ret = perf_allow_kernel(&event->attr);
> +		if (ret)
> +			return ret;
> +	}
>   
>   	if (x86_add_exclusive(x86_lbr_exclusive_bts))
>   		return -EBUSY;
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 27ee47a7be66..32967a9e9962 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
>   	if (x86_pmu.version < 3)
>   		return -EINVAL;
>   
> -	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> -		return -EACCES;
> +	ret = perf_allow_cpu(&event->attr);
> +	if (ret)
> +		return ret;
>   
>   	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
>   
> diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
> index dee579efb2b2..a4cc66005ce8 100644
> --- a/arch/x86/events/intel/p4.c
> +++ b/arch/x86/events/intel/p4.c
> @@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event)
>   	 * the user needs special permissions to be able to use it
>   	 */
>   	if (p4_ht_active() && p4_event_bind_map[v].shared) {
> -		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> -			return -EACCES;
> +		v = perf_allow_cpu(&event->attr);
> +		if (v)
> +			return v;
>   	}
>   
>   	/* ESCR EventMask bits may be invalid */
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index a3763247547c..20d8cf194fb7 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -1818,6 +1818,14 @@ union security_list_options {
>   	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
>   #endif /* CONFIG_BPF_SYSCALL */
>   	int (*locked_down)(enum lockdown_reason what);
> +#ifdef CONFIG_PERF_EVENTS
> +	int (*perf_event_open)(struct perf_event_attr *attr, int type);
> +	int (*perf_event_alloc)(struct perf_event *event);
> +	void (*perf_event_free)(struct perf_event *event);
> +	int (*perf_event_read)(struct perf_event *event);
> +	int (*perf_event_write)(struct perf_event *event);
> +
> +#endif
>   };
>   
>   struct security_hook_heads {
> @@ -2060,6 +2068,13 @@ struct security_hook_heads {
>   	struct hlist_head bpf_prog_free_security;
>   #endif /* CONFIG_BPF_SYSCALL */
>   	struct hlist_head locked_down;
> +#ifdef CONFIG_PERF_EVENTS
> +	struct hlist_head perf_event_open;
> +	struct hlist_head perf_event_alloc;
> +	struct hlist_head perf_event_free;
> +	struct hlist_head perf_event_read;
> +	struct hlist_head perf_event_write;
> +#endif
>   } __randomize_layout;
>   
>   /*
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 61448c19a132..664bb7f99c46 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -56,6 +56,7 @@ struct perf_guest_info_callbacks {
>   #include <linux/perf_regs.h>
>   #include <linux/cgroup.h>
>   #include <linux/refcount.h>
> +#include <linux/security.h>
>   #include <asm/local.h>
>   
>   struct perf_callchain_entry {
> @@ -721,6 +722,9 @@ struct perf_event {
>   	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
>   #endif
>   
> +#ifdef CONFIG_SECURITY
> +	void *security;
> +#endif
>   	struct list_head		sb_list;
>   #endif /* CONFIG_PERF_EVENTS */
>   };
> @@ -1241,19 +1245,33 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
>   int perf_event_max_stack_handler(struct ctl_table *table, int write,
>   				 void __user *buffer, size_t *lenp, loff_t *ppos);
>   
> -static inline bool perf_paranoid_tracepoint_raw(void)
> +static inline int perf_is_paranoid(void)
>   {
>   	return sysctl_perf_event_paranoid > -1;
>   }
>   
> -static inline bool perf_paranoid_cpu(void)
> +static inline int perf_allow_kernel(struct perf_event_attr *attr)
>   {
> -	return sysctl_perf_event_paranoid > 0;
> +	if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
>   }
>   
> -static inline bool perf_paranoid_kernel(void)
> +static inline int perf_allow_cpu(struct perf_event_attr *attr)
>   {
> -	return sysctl_perf_event_paranoid > 1;
> +	if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	return security_perf_event_open(attr, PERF_SECURITY_CPU);
> +}
> +
> +static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
> +{
> +	if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
>   }
>   
>   extern void perf_event_init(void);
> diff --git a/include/linux/security.h b/include/linux/security.h
> index a8d59d612d27..273e11c66ed7 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -1894,5 +1894,42 @@ static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
>   #endif /* CONFIG_SECURITY */
>   #endif /* CONFIG_BPF_SYSCALL */
>   
> -#endif /* ! __LINUX_SECURITY_H */
> +#ifdef CONFIG_PERF_EVENTS
> +struct perf_event_attr;
> +
> +#ifdef CONFIG_SECURITY
> +extern int security_perf_event_open(struct perf_event_attr *attr, int type);
> +extern int security_perf_event_alloc(struct perf_event *event);
> +extern void security_perf_event_free(struct perf_event *event);
> +extern int security_perf_event_read(struct perf_event *event);
> +extern int security_perf_event_write(struct perf_event *event);
> +#else
> +static inline int security_perf_event_open(struct perf_event_attr *attr,
> +					   int type)
> +{
> +	return 0;
> +}
>   
> +static inline int security_perf_event_alloc(struct perf_event *event)
> +{
> +	return 0;
> +}
> +
> +static inline void security_perf_event_free(struct perf_event *event)
> +{
> +	return 0;
> +}
> +
> +static inline int security_perf_event_read(struct perf_event *event)
> +{
> +	return 0;
> +}
> +
> +static inline int security_perf_event_write(struct perf_event *event)
> +{
> +	return 0;
> +}
> +#endif /* CONFIG_SECURITY */
> +#endif /* CONFIG_PERF_EVENTS */
> +
> +#endif /* ! __LINUX_SECURITY_H */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index bb7b271397a6..2af95f937a5b 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -427,6 +427,15 @@ struct perf_event_attr {
>   	__u16	__reserved_2;	/* align to __u64 */
>   };
>   
> +
> +/* Access to perf_event_open(2) syscall. */
> +#define PERF_SECURITY_OPEN		0
> +
> +/* Finer grained perf_event_open(2) access control. */
> +#define PERF_SECURITY_CPU		1
> +#define PERF_SECURITY_KERNEL		2
> +#define PERF_SECURITY_TRACEPOINT	3
> +

Why are these definitions part of the uapi header and not private to the 
kernel?  You map them to SELinux permissions in the kernel, and the 
SELinux permissions are mapped to policy values at policy load time (so 
the values need not be identical).  We also have mechanisms in SELinux 
to permit evolution of the permissions over time in a 
backward-compatible manner.

>   /*
>    * Structure used by below PERF_EVENT_IOC_QUERY_BPF command
>    * to query bpf programs attached to the same perf tracepoint
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4655adbbae10..8ad597fae5f4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4217,8 +4217,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
>   
>   	if (!task) {
>   		/* Must be root to operate on a CPU event: */
> -		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> -			return ERR_PTR(-EACCES);
> +		err = perf_allow_cpu(&event->attr);
> +		if (err)
> +			return ERR_PTR(err);
>   
>   		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
>   		ctx = &cpuctx->ctx;
> @@ -4527,6 +4528,8 @@ static void _free_event(struct perf_event *event)
>   
>   	unaccount_event(event);
>   
> +	security_perf_event_free(event);
> +
>   	if (event->rb) {
>   		/*
>   		 * Can happen when we close an event with re-directed output.
> @@ -4980,6 +4983,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
>   	struct perf_event_context *ctx;
>   	int ret;
>   
> +	ret = security_perf_event_read(event);
> +	if (ret)
> +		return ret;
> +
>   	ctx = perf_event_ctx_lock(event);
>   	ret = __perf_read(event, buf, count);
>   	perf_event_ctx_unlock(event, ctx);
> @@ -5244,6 +5251,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>   	struct perf_event_context *ctx;
>   	long ret;
>   
> +	/* Treat ioctl like writes as it is likely a mutating operation. */
> +	ret = security_perf_event_write(event);
> +	if (ret)
> +		return ret;
> +
>   	ctx = perf_event_ctx_lock(event);
>   	ret = _perf_ioctl(event, cmd, arg);
>   	perf_event_ctx_unlock(event, ctx);
> @@ -5706,6 +5718,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>   	if (!(vma->vm_flags & VM_SHARED))
>   		return -EINVAL;
>   
> +	ret = security_perf_event_read(event);
> +	if (ret)
> +		return ret;
> +
>   	vma_size = vma->vm_end - vma->vm_start;
>   
>   	if (vma->vm_pgoff == 0) {
> @@ -5819,7 +5835,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
>   	lock_limit >>= PAGE_SHIFT;
>   	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
>   
> -	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
> +	if ((locked > lock_limit) && perf_is_paranoid() &&
>   		!capable(CAP_IPC_LOCK)) {
>   		ret = -EPERM;
>   		goto unlock;
> @@ -10553,11 +10569,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>   		}
>   	}
>   
> +	err = security_perf_event_alloc(event);
> +	if (err)
> +		goto err_callchain_buffer;
> +
>   	/* symmetric to unaccount_event() in _free_event() */
>   	account_event(event);
>   
>   	return event;
>   
> +err_callchain_buffer:
> +	if (!event->parent) {
> +		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> +			put_callchain_buffers();
> +	}
>   err_addr_filters:
>   	kfree(event->addr_filter_ranges);
>   
> @@ -10675,9 +10700,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
>   			attr->branch_sample_type = mask;
>   		}
>   		/* privileged levels capture (kernel, hv): check permissions */
> -		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
> -		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> -			return -EACCES;
> +		if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
> +			ret = perf_allow_kernel(attr);
> +			if (ret)
> +				return ret;
> +		}
>   	}
>   
>   	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
> @@ -10890,13 +10917,19 @@ SYSCALL_DEFINE5(perf_event_open,
>   	if (flags & ~PERF_FLAG_ALL)
>   		return -EINVAL;
>   
> +	/* Do we allow access to perf_event_open(2) ? */
> +	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
> +	if (err)
> +		return err;
> +
>   	err = perf_copy_attr(attr_uptr, &attr);
>   	if (err)
>   		return err;
>   
>   	if (!attr.exclude_kernel) {
> -		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> -			return -EACCES;
> +		err = perf_allow_kernel(&attr);
> +		if (err)
> +			return err;
>   	}
>   
>   	if (attr.namespaces) {
> @@ -10913,9 +10946,11 @@ SYSCALL_DEFINE5(perf_event_open,
>   	}
>   
>   	/* Only privileged users can get physical addresses */
> -	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> -	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> -		return -EACCES;
> +	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
> +		err = perf_allow_kernel(&attr);
> +		if (err)
> +			return err;
> +	}
>   
>   	err = security_locked_down(LOCKDOWN_PERF);
>   	if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
> diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
> index 0892e38ed6fb..0917fee6ee7c 100644
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -8,6 +8,7 @@
>   
>   #include <linux/module.h>
>   #include <linux/kprobes.h>
> +#include <linux/security.h>
>   #include "trace.h"
>   #include "trace_probe.h"
>   
> @@ -26,8 +27,10 @@ static int	total_ref_count;
>   static int perf_trace_event_perm(struct trace_event_call *tp_event,
>   				 struct perf_event *p_event)
>   {
> +	int ret;
> +
>   	if (tp_event->perf_perm) {
> -		int ret = tp_event->perf_perm(tp_event, p_event);
> +		ret = tp_event->perf_perm(tp_event, p_event);
>   		if (ret)
>   			return ret;
>   	}
> @@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
>   
>   	/* The ftrace function trace is allowed only for root. */
>   	if (ftrace_event_is_function(tp_event)) {
> -		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
> -			return -EPERM;
> +		ret = perf_allow_tracepoint(&p_event->attr);
> +		if (ret)
> +			return ret;
>   
>   		if (!is_sampling_event(p_event))
>   			return 0;
> @@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
>   	 * ...otherwise raw tracepoint data can be a severe data leak,
>   	 * only allow root to have these.
>   	 */
> -	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
> -		return -EPERM;
> +	ret = perf_allow_tracepoint(&p_event->attr);
> +	if (ret)
> +		return ret;
>   
>   	return 0;
>   }
> diff --git a/security/security.c b/security/security.c
> index 1bc000f834e2..cd2d18d2d279 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -2404,3 +2404,30 @@ int security_locked_down(enum lockdown_reason what)
>   	return call_int_hook(locked_down, 0, what);
>   }
>   EXPORT_SYMBOL(security_locked_down);
> +
> +#ifdef CONFIG_PERF_EVENTS
> +int security_perf_event_open(struct perf_event_attr *attr, int type)
> +{
> +	return call_int_hook(perf_event_open, 0, attr, type);
> +}
> +
> +int security_perf_event_alloc(struct perf_event *event)
> +{
> +	return call_int_hook(perf_event_alloc, 0, event);
> +}
> +
> +void security_perf_event_free(struct perf_event *event)
> +{
> +	call_void_hook(perf_event_free, event);
> +}
> +
> +int security_perf_event_read(struct perf_event *event)
> +{
> +	return call_int_hook(perf_event_read, 0, event);
> +}
> +
> +int security_perf_event_write(struct perf_event *event)
> +{
> +	return call_int_hook(perf_event_write, 0, event);
> +}
> +#endif /* CONFIG_PERF_EVENTS */
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 9625b99e677f..28eb05490d59 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -6795,6 +6795,67 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
>   	.lbs_msg_msg = sizeof(struct msg_security_struct),
>   };
>   
> +#ifdef CONFIG_PERF_EVENTS
> +static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
> +{
> +	u32 requested, sid = current_sid();
> +
> +	if (type == PERF_SECURITY_OPEN)
> +		requested = PERF_EVENT__OPEN;
> +	else if (type == PERF_SECURITY_CPU)
> +		requested = PERF_EVENT__CPU;
> +	else if (type == PERF_SECURITY_KERNEL)
> +		requested = PERF_EVENT__KERNEL;
> +	else if (type == PERF_SECURITY_TRACEPOINT)
> +		requested = PERF_EVENT__TRACEPOINT;
> +	else
> +		return -EINVAL;
> +
> +	return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
> +			    requested, NULL);
> +}
> +
> +static int selinux_perf_event_alloc(struct perf_event *event)
> +{
> +	struct perf_event_security_struct *perfsec;
> +
> +	perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
> +	if (!perfsec)
> +		return -ENOMEM;
> +
> +	perfsec->sid = current_sid();
> +	event->security = perfsec;
> +
> +	return 0;
> +}
> +
> +static void selinux_perf_event_free(struct perf_event *event)
> +{
> +	struct perf_event_security_struct *perfsec = event->security;
> +
> +	event->security = NULL;
> +	kfree(perfsec);
> +}
> +
> +static int selinux_perf_event_read(struct perf_event *event)
> +{
> +	struct perf_event_security_struct *perfsec = event->security;
> +	u32 sid = current_sid();
> +
> +	return avc_has_perm(&selinux_state, sid, perfsec->sid,
> +			    SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
> +}
> +
> +static int selinux_perf_event_write(struct perf_event *event)
> +{
> +	struct perf_event_security_struct *perfsec = event->security;
> +	u32 sid = current_sid();
> +
> +	return avc_has_perm(&selinux_state, sid, perfsec->sid,
> +			    SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
> +}
> +#endif
> +
>   static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>   	LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
>   	LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
> @@ -7030,6 +7091,14 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>   	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
>   	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
>   #endif
> +
> +#ifdef CONFIG_PERF_EVENTS
> +	LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
> +	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
> +	LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
> +	LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
> +	LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
> +#endif
>   };
>   
>   static __init int selinux_init(void)
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index 32e9b03be3dd..7db24855e12d 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -244,6 +244,8 @@ struct security_class_mapping secclass_map[] = {
>   	  {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
>   	{ "xdp_socket",
>   	  { COMMON_SOCK_PERMS, NULL } },
> +	{ "perf_event",
> +	  {"open", "cpu", "kernel", "tracepoint", "read", "write"} },
>   	{ NULL }
>     };
>   
> diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
> index 586b7abd0aa7..a4a86cbcfb0a 100644
> --- a/security/selinux/include/objsec.h
> +++ b/security/selinux/include/objsec.h
> @@ -141,7 +141,11 @@ struct pkey_security_struct {
>   };
>   
>   struct bpf_security_struct {
> -	u32 sid;  /*SID of bpf obj creater*/
> +	u32 sid;  /* SID of bpf obj creator */
> +};
> +
> +struct perf_event_security_struct {
> +	u32 sid;  /* SID of perf_event obj creator */
>   };
>   
>   extern struct lsm_blob_sizes selinux_blob_sizes;
> 


^ permalink raw reply

* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Sumit Garg @ 2019-10-15  8:32 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: dhowells, peterhuewe, keyrings, linux-integrity,
	open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
	linux-security-module, Herbert Xu, davem, jgg, Arnd Bergmann,
	Greg Kroah-Hartman, jejb, Mimi Zohar, James Morris,
	Serge E. Hallyn, Linux Kernel Mailing List, Daniel Thompson
In-Reply-To: <20191014201604.GN15552@linux.intel.com>

On Tue, 15 Oct 2019 at 01:46, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Fri, Oct 11, 2019 at 02:05:17PM -0700, Jerry Snitselaar wrote:
> > On Fri Oct 11 19, Jarkko Sakkinen wrote:
> > > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > > This patch-set does restructuring of trusted keys code to create and
> > > > consolidate trusted keys subsystem.
> > > >
> > > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > > >
> > > > Changes in v7:
> > > > 1. Rebased to top of tpmdd/master
> > > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > > >    tpm_transmit_cmd() which is an internal function.
> > > >
> > > > Changes in v6:
> > > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > > >    changes required patches #1 and #2 update, so I have dropped review
> > > >    tags from those patches.
> > > > 2. Incorporated miscellaneous comments from Jarkko.
> > > >
> > > > Changes in v5:
> > > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > > 2. Add Reviewed-by tag for patch #2.
> > > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > > >    "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > > 4. Misc changes to rename files.
> > > >
> > > > Changes in v4:
> > > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > > 3. Keep module name as trusted.ko only
> > > >
> > > > Changes in v3:
> > > >
> > > > Move TPM2 trusted keys code to trusted keys subsystem.
> > > >
> > > > Changes in v2:
> > > >
> > > > Split trusted keys abstraction patch for ease of review.
> > > >
> > > > Sumit Garg (4):
> > > >   tpm: Move tpm_buf code to include/linux/
> > > >   KEYS: Use common tpm_buf for trusted and asymmetric keys
> > > >   KEYS: trusted: Create trusted keys subsystem
> > > >   KEYS: trusted: Move TPM2 trusted keys code
> > > >
> > > >  crypto/asymmetric_keys/asym_tpm.c                  | 101 +++----
> > > >  drivers/char/tpm/tpm-interface.c                   |  56 ----
> > > >  drivers/char/tpm/tpm.h                             | 226 ---------------
> > > >  drivers/char/tpm/tpm2-cmd.c                        | 307 --------------------
> > > >  include/Kbuild                                     |   1 -
> > > >  include/keys/{trusted.h => trusted_tpm.h}          |  49 +---
> > > >  include/linux/tpm.h                                | 251 ++++++++++++++--
> > > >  security/keys/Makefile                             |   2 +-
> > > >  security/keys/trusted-keys/Makefile                |   8 +
> > > >  .../{trusted.c => trusted-keys/trusted_tpm1.c}     |  96 +++----
> > > >  security/keys/trusted-keys/trusted_tpm2.c          | 314 +++++++++++++++++++++
> > > >  11 files changed, 652 insertions(+), 759 deletions(-)
> > > >  rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > > >  create mode 100644 security/keys/trusted-keys/Makefile
> > > >  rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > > >  create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > > >
> > > > --
> > > > 2.7.4
> > > >
> > >
> > > I fixed a merge conflict caused by James' commit. Already pushed.
> > > Compiling test kernel ATM i.e. tested-by's will follow later.
> > >
> > > /Jarkko
> >
> > Are you missing patch 4 on master?
>
> Already removed the patch set given the sparse issues.

The sparse issues weren't due to this patch-set but they already
existed in "security/keys/trusted.c" and this patch-set only did a
rename for that file. So I think we should have a separate patch to
fix sparse issues.

-Sumit

> Read this email
> after doing that. Thanks anyway for pointing that out.
>
> /Jarkko

^ permalink raw reply

* Re: [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Peter Zijlstra @ 2019-10-15  8:30 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
	James Morris, Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
	Daniel Borkmann, Ingo Molnar, Jiri Olsa, Kees Cook,
	linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
	Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Yonghong Song
In-Reply-To: <20191014170308.70668-1-joel@joelfernandes.org>

On Mon, Oct 14, 2019 at 01:03:08PM -0400, Joel Fernandes (Google) wrote:
> In current mainline, the degree of access to perf_event_open(2) system
> call depends on the perf_event_paranoid sysctl.  This has a number of
> limitations:
> 
> 1. The sysctl is only a single value. Many types of accesses are controlled
>    based on the single value thus making the control very limited and
>    coarse grained.
> 2. The sysctl is global, so if the sysctl is changed, then that means
>    all processes get access to perf_event_open(2) opening the door to
>    security issues.
> 
> This patch adds LSM and SELinux access checking which will be used in
> Android to access perf_event_open(2) for the purposes of attaching BPF
> programs to tracepoints, perf profiling and other operations from
> userspace. These operations are intended for production systems.
> 
> 5 new LSM hooks are added:
> 1. perf_event_open: This controls access during the perf_event_open(2)
>    syscall itself. The hook is called from all the places that the
>    perf_event_paranoid sysctl is checked to keep it consistent with the
>    systctl. The hook gets passed a 'type' argument which controls CPU,
>    kernel and tracepoint accesses (in this context, CPU, kernel and
>    tracepoint have the same semantics as the perf_event_paranoid sysctl).
>    Additionally, I added an 'open' type which is similar to
>    perf_event_paranoid sysctl == 3 patch carried in Android and several other
>    distros but was rejected in mainline [1] in 2016.
> 
> 2. perf_event_alloc: This allocates a new security object for the event
>    which stores the current SID within the event. It will be useful when
>    the perf event's FD is passed through IPC to another process which may
>    try to read the FD. Appropriate security checks will limit access.
> 
> 3. perf_event_free: Called when the event is closed.
> 
> 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
> 
> 5. perf_event_write: Called from the ioctl(2) syscalls for the event.
> 
> [1] https://lwn.net/Articles/696240/
> 
> Since Peter had suggest LSM hooks in 2016 [1], I am adding his
> Suggested-by tag below.

Thanks, I've queued the patch!

> To use this patch, we set the perf_event_paranoid sysctl to -1 and then
> apply selinux checking as appropriate (default deny everything, and then
> add policy rules to give access to domains that need it). In the future
> we can remove the perf_event_paranoid sysctl altogether.

This I'm not sure about; the sysctl is only redundant when you actually
use a security thingy, not everyone is. I always find them things to be
mightily unfriendly.


^ permalink raw reply

* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Sumit Garg @ 2019-10-15  8:28 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: dhowells, peterhuewe, keyrings, linux-integrity,
	open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
	linux-security-module, Herbert Xu, davem, jgg, Arnd Bergmann,
	Greg Kroah-Hartman, jejb, Mimi Zohar, James Morris,
	Serge E. Hallyn, Jerry Snitselaar, Linux Kernel Mailing List,
	Daniel Thompson
In-Reply-To: <20191014193350.GG15552@linux.intel.com>

On Tue, 15 Oct 2019 at 01:04, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Fri, Oct 11, 2019 at 03:37:57PM +0300, Jarkko Sakkinen wrote:
> > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > This patch-set does restructuring of trusted keys code to create and
> > > consolidate trusted keys subsystem.
> > >
> > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > >
> > > Changes in v7:
> > > 1. Rebased to top of tpmdd/master
> > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > >    tpm_transmit_cmd() which is an internal function.
> > >
> > > Changes in v6:
> > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > >    changes required patches #1 and #2 update, so I have dropped review
> > >    tags from those patches.
> > > 2. Incorporated miscellaneous comments from Jarkko.
> > >
> > > Changes in v5:
> > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > 2. Add Reviewed-by tag for patch #2.
> > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > >    "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > 4. Misc changes to rename files.
> > >
> > > Changes in v4:
> > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > 3. Keep module name as trusted.ko only
> > >
> > > Changes in v3:
> > >
> > > Move TPM2 trusted keys code to trusted keys subsystem.
> > >
> > > Changes in v2:
> > >
> > > Split trusted keys abstraction patch for ease of review.
> > >
> > > Sumit Garg (4):
> > >   tpm: Move tpm_buf code to include/linux/
> > >   KEYS: Use common tpm_buf for trusted and asymmetric keys
> > >   KEYS: trusted: Create trusted keys subsystem
> > >   KEYS: trusted: Move TPM2 trusted keys code
> > >
> > >  crypto/asymmetric_keys/asym_tpm.c                  | 101 +++----
> > >  drivers/char/tpm/tpm-interface.c                   |  56 ----
> > >  drivers/char/tpm/tpm.h                             | 226 ---------------
> > >  drivers/char/tpm/tpm2-cmd.c                        | 307 --------------------
> > >  include/Kbuild                                     |   1 -
> > >  include/keys/{trusted.h => trusted_tpm.h}          |  49 +---
> > >  include/linux/tpm.h                                | 251 ++++++++++++++--
> > >  security/keys/Makefile                             |   2 +-
> > >  security/keys/trusted-keys/Makefile                |   8 +
> > >  .../{trusted.c => trusted-keys/trusted_tpm1.c}     |  96 +++----
> > >  security/keys/trusted-keys/trusted_tpm2.c          | 314 +++++++++++++++++++++
> > >  11 files changed, 652 insertions(+), 759 deletions(-)
> > >  rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > >  create mode 100644 security/keys/trusted-keys/Makefile
> > >  rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > >  create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > >
> > > --
> > > 2.7.4
> > >
> >
> > I fixed a merge conflict caused by James' commit. Already pushed.
> > Compiling test kernel ATM i.e. tested-by's will follow later.
>
> Update to my latest master for v8 (otherwise there won't be a clean
> merge).
>

Okay, I will send v8 to rebase to your latest master.

-Sumit

> /Jarkko

^ permalink raw reply

* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Jarkko Sakkinen @ 2019-10-14 20:16 UTC (permalink / raw)
  To: Sumit Garg, dhowells, peterhuewe, keyrings, linux-integrity,
	linux-crypto, linux-security-module, herbert, davem, jgg, arnd,
	gregkh, jejb, zohar, jmorris, serge, linux-kernel,
	daniel.thompson
In-Reply-To: <20191011210517.qxjemugqczsvscu6@cantor>

On Fri, Oct 11, 2019 at 02:05:17PM -0700, Jerry Snitselaar wrote:
> On Fri Oct 11 19, Jarkko Sakkinen wrote:
> > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > This patch-set does restructuring of trusted keys code to create and
> > > consolidate trusted keys subsystem.
> > > 
> > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > > 
> > > Changes in v7:
> > > 1. Rebased to top of tpmdd/master
> > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > >    tpm_transmit_cmd() which is an internal function.
> > > 
> > > Changes in v6:
> > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > >    changes required patches #1 and #2 update, so I have dropped review
> > >    tags from those patches.
> > > 2. Incorporated miscellaneous comments from Jarkko.
> > > 
> > > Changes in v5:
> > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > 2. Add Reviewed-by tag for patch #2.
> > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > >    "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > 4. Misc changes to rename files.
> > > 
> > > Changes in v4:
> > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > 3. Keep module name as trusted.ko only
> > > 
> > > Changes in v3:
> > > 
> > > Move TPM2 trusted keys code to trusted keys subsystem.
> > > 
> > > Changes in v2:
> > > 
> > > Split trusted keys abstraction patch for ease of review.
> > > 
> > > Sumit Garg (4):
> > >   tpm: Move tpm_buf code to include/linux/
> > >   KEYS: Use common tpm_buf for trusted and asymmetric keys
> > >   KEYS: trusted: Create trusted keys subsystem
> > >   KEYS: trusted: Move TPM2 trusted keys code
> > > 
> > >  crypto/asymmetric_keys/asym_tpm.c                  | 101 +++----
> > >  drivers/char/tpm/tpm-interface.c                   |  56 ----
> > >  drivers/char/tpm/tpm.h                             | 226 ---------------
> > >  drivers/char/tpm/tpm2-cmd.c                        | 307 --------------------
> > >  include/Kbuild                                     |   1 -
> > >  include/keys/{trusted.h => trusted_tpm.h}          |  49 +---
> > >  include/linux/tpm.h                                | 251 ++++++++++++++--
> > >  security/keys/Makefile                             |   2 +-
> > >  security/keys/trusted-keys/Makefile                |   8 +
> > >  .../{trusted.c => trusted-keys/trusted_tpm1.c}     |  96 +++----
> > >  security/keys/trusted-keys/trusted_tpm2.c          | 314 +++++++++++++++++++++
> > >  11 files changed, 652 insertions(+), 759 deletions(-)
> > >  rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > >  create mode 100644 security/keys/trusted-keys/Makefile
> > >  rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > >  create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > > 
> > > --
> > > 2.7.4
> > > 
> > 
> > I fixed a merge conflict caused by James' commit. Already pushed.
> > Compiling test kernel ATM i.e. tested-by's will follow later.
> > 
> > /Jarkko
> 
> Are you missing patch 4 on master?

Already removed the patch set given the sparse issues. Read this email
after doing that. Thanks anyway for pointing that out.

/Jarkko

^ permalink raw reply

* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Jarkko Sakkinen @ 2019-10-14 19:33 UTC (permalink / raw)
  To: Sumit Garg
  Cc: dhowells, peterhuewe, keyrings, linux-integrity, linux-crypto,
	linux-security-module, herbert, davem, jgg, arnd, gregkh, jejb,
	zohar, jmorris, serge, jsnitsel, linux-kernel, daniel.thompson
In-Reply-To: <20191011123757.GD3129@linux.intel.com>

On Fri, Oct 11, 2019 at 03:37:57PM +0300, Jarkko Sakkinen wrote:
> On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > This patch-set does restructuring of trusted keys code to create and
> > consolidate trusted keys subsystem.
> > 
> > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > 
> > Changes in v7:
> > 1. Rebased to top of tpmdd/master
> > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> >    tpm_transmit_cmd() which is an internal function.
> > 
> > Changes in v6:
> > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> >    changes required patches #1 and #2 update, so I have dropped review
> >    tags from those patches.
> > 2. Incorporated miscellaneous comments from Jarkko.
> > 
> > Changes in v5:
> > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > 2. Add Reviewed-by tag for patch #2.
> > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> >    "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > 4. Misc changes to rename files.
> > 
> > Changes in v4:
> > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > 3. Keep module name as trusted.ko only
> > 
> > Changes in v3:
> > 
> > Move TPM2 trusted keys code to trusted keys subsystem.
> > 
> > Changes in v2:
> > 
> > Split trusted keys abstraction patch for ease of review.
> > 
> > Sumit Garg (4):
> >   tpm: Move tpm_buf code to include/linux/
> >   KEYS: Use common tpm_buf for trusted and asymmetric keys
> >   KEYS: trusted: Create trusted keys subsystem
> >   KEYS: trusted: Move TPM2 trusted keys code
> > 
> >  crypto/asymmetric_keys/asym_tpm.c                  | 101 +++----
> >  drivers/char/tpm/tpm-interface.c                   |  56 ----
> >  drivers/char/tpm/tpm.h                             | 226 ---------------
> >  drivers/char/tpm/tpm2-cmd.c                        | 307 --------------------
> >  include/Kbuild                                     |   1 -
> >  include/keys/{trusted.h => trusted_tpm.h}          |  49 +---
> >  include/linux/tpm.h                                | 251 ++++++++++++++--
> >  security/keys/Makefile                             |   2 +-
> >  security/keys/trusted-keys/Makefile                |   8 +
> >  .../{trusted.c => trusted-keys/trusted_tpm1.c}     |  96 +++----
> >  security/keys/trusted-keys/trusted_tpm2.c          | 314 +++++++++++++++++++++
> >  11 files changed, 652 insertions(+), 759 deletions(-)
> >  rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> >  create mode 100644 security/keys/trusted-keys/Makefile
> >  rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> >  create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > 
> > -- 
> > 2.7.4
> > 
> 
> I fixed a merge conflict caused by James' commit. Already pushed.
> Compiling test kernel ATM i.e. tested-by's will follow later.

Update to my latest master for v8 (otherwise there won't be a clean
merge).

/Jarkko

^ permalink raw reply

* [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Joel Fernandes (Google) @ 2019-10-14 17:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Joel Fernandes (Google), Peter Zijlstra, rostedt, primiano,
	rsavitski, jeffv, kernel-team, James Morris, Alexei Starovoitov,
	Arnaldo Carvalho de Melo, bpf, Daniel Borkmann, Ingo Molnar,
	Jiri Olsa, Kees Cook, linux-security-module, Matthew Garrett,
	Namhyung Kim, selinux, Song Liu,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Yonghong Song

In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl.  This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on the single value thus making the control very limited and
   coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
   all processes get access to perf_event_open(2) opening the door to
   security issues.

This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.

5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked to keep it consistent with the
   systctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to
   perf_event_paranoid sysctl == 3 patch carried in Android and several other
   distros but was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD. Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.

5. perf_event_write: Called from the ioctl(2) syscalls for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggest LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rostedt@goodmis.org
Cc: primiano@google.com
Cc: rsavitski@google.com
Cc: jeffv@google.com
Cc: kernel-team@android.com
Acked-by: James Morris <jmorris@namei.org>
Co-developed-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---

Changes since v1:
 o Fixes from Peter Ziljstra.
 o Added Ack from James Morris and Co-developed-by tag for Peter.
Changes since RFC:
 o Small nits, style changes (James Morris).
 o Consolidation of code (Peter Zijlstra).


 arch/x86/events/intel/bts.c         |  8 ++--
 arch/x86/events/intel/core.c        |  5 ++-
 arch/x86/events/intel/p4.c          |  5 ++-
 include/linux/lsm_hooks.h           | 15 +++++++
 include/linux/perf_event.h          | 28 +++++++++---
 include/linux/security.h            | 39 +++++++++++++++-
 include/uapi/linux/perf_event.h     |  9 ++++
 kernel/events/core.c                | 57 +++++++++++++++++++-----
 kernel/trace/trace_event_perf.c     | 15 ++++---
 security/security.c                 | 27 +++++++++++
 security/selinux/hooks.c            | 69 +++++++++++++++++++++++++++++
 security/selinux/include/classmap.h |  2 +
 security/selinux/include/objsec.h   |  6 ++-
 13 files changed, 255 insertions(+), 30 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..38de4a7f6752 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
 	 * Note that the default paranoia setting permits unprivileged
 	 * users to profile the kernel.
 	 */
-	if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	if (event->attr.exclude_kernel) {
+		ret = perf_allow_kernel(&event->attr);
+		if (ret)
+			return ret;
+	}
 
 	if (x86_add_exclusive(x86_lbr_exclusive_bts))
 		return -EBUSY;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..32967a9e9962 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (x86_pmu.version < 3)
 		return -EINVAL;
 
-	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	ret = perf_allow_cpu(&event->attr);
+	if (ret)
+		return ret;
 
 	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
 
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..a4cc66005ce8 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 * the user needs special permissions to be able to use it
 	 */
 	if (p4_ht_active() && p4_event_bind_map[v].shared) {
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+		v = perf_allow_cpu(&event->attr);
+		if (v)
+			return v;
 	}
 
 	/* ESCR EventMask bits may be invalid */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a3763247547c..20d8cf194fb7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1818,6 +1818,14 @@ union security_list_options {
 	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
 #endif /* CONFIG_BPF_SYSCALL */
 	int (*locked_down)(enum lockdown_reason what);
+#ifdef CONFIG_PERF_EVENTS
+	int (*perf_event_open)(struct perf_event_attr *attr, int type);
+	int (*perf_event_alloc)(struct perf_event *event);
+	void (*perf_event_free)(struct perf_event *event);
+	int (*perf_event_read)(struct perf_event *event);
+	int (*perf_event_write)(struct perf_event *event);
+
+#endif
 };
 
 struct security_hook_heads {
@@ -2060,6 +2068,13 @@ struct security_hook_heads {
 	struct hlist_head bpf_prog_free_security;
 #endif /* CONFIG_BPF_SYSCALL */
 	struct hlist_head locked_down;
+#ifdef CONFIG_PERF_EVENTS
+	struct hlist_head perf_event_open;
+	struct hlist_head perf_event_alloc;
+	struct hlist_head perf_event_free;
+	struct hlist_head perf_event_read;
+	struct hlist_head perf_event_write;
+#endif
 } __randomize_layout;
 
 /*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61448c19a132..664bb7f99c46 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -56,6 +56,7 @@ struct perf_guest_info_callbacks {
 #include <linux/perf_regs.h>
 #include <linux/cgroup.h>
 #include <linux/refcount.h>
+#include <linux/security.h>
 #include <asm/local.h>
 
 struct perf_callchain_entry {
@@ -721,6 +722,9 @@ struct perf_event {
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
 #endif
 
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 	struct list_head		sb_list;
 #endif /* CONFIG_PERF_EVENTS */
 };
@@ -1241,19 +1245,33 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
-static inline bool perf_paranoid_tracepoint_raw(void)
+static inline int perf_is_paranoid(void)
 {
 	return sysctl_perf_event_paranoid > -1;
 }
 
-static inline bool perf_paranoid_cpu(void)
+static inline int perf_allow_kernel(struct perf_event_attr *attr)
 {
-	return sysctl_perf_event_paranoid > 0;
+	if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
 }
 
-static inline bool perf_paranoid_kernel(void)
+static inline int perf_allow_cpu(struct perf_event_attr *attr)
 {
-	return sysctl_perf_event_paranoid > 1;
+	if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	return security_perf_event_open(attr, PERF_SECURITY_CPU);
+}
+
+static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
+{
+	if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
 }
 
 extern void perf_event_init(void);
diff --git a/include/linux/security.h b/include/linux/security.h
index a8d59d612d27..273e11c66ed7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1894,5 +1894,42 @@ static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
 #endif /* CONFIG_SECURITY */
 #endif /* CONFIG_BPF_SYSCALL */
 
-#endif /* ! __LINUX_SECURITY_H */
+#ifdef CONFIG_PERF_EVENTS
+struct perf_event_attr;
+
+#ifdef CONFIG_SECURITY
+extern int security_perf_event_open(struct perf_event_attr *attr, int type);
+extern int security_perf_event_alloc(struct perf_event *event);
+extern void security_perf_event_free(struct perf_event *event);
+extern int security_perf_event_read(struct perf_event *event);
+extern int security_perf_event_write(struct perf_event *event);
+#else
+static inline int security_perf_event_open(struct perf_event_attr *attr,
+					   int type)
+{
+	return 0;
+}
 
+static inline int security_perf_event_alloc(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void security_perf_event_free(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline int security_perf_event_read(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline int security_perf_event_write(struct perf_event *event)
+{
+	return 0;
+}
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_PERF_EVENTS */
+
+#endif /* ! __LINUX_SECURITY_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index bb7b271397a6..2af95f937a5b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -427,6 +427,15 @@ struct perf_event_attr {
 	__u16	__reserved_2;	/* align to __u64 */
 };
 
+
+/* Access to perf_event_open(2) syscall. */
+#define PERF_SECURITY_OPEN		0
+
+/* Finer grained perf_event_open(2) access control. */
+#define PERF_SECURITY_CPU		1
+#define PERF_SECURITY_KERNEL		2
+#define PERF_SECURITY_TRACEPOINT	3
+
 /*
  * Structure used by below PERF_EVENT_IOC_QUERY_BPF command
  * to query bpf programs attached to the same perf tracepoint
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4655adbbae10..8ad597fae5f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4217,8 +4217,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 
 	if (!task) {
 		/* Must be root to operate on a CPU event: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-			return ERR_PTR(-EACCES);
+		err = perf_allow_cpu(&event->attr);
+		if (err)
+			return ERR_PTR(err);
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
@@ -4527,6 +4528,8 @@ static void _free_event(struct perf_event *event)
 
 	unaccount_event(event);
 
+	security_perf_event_free(event);
+
 	if (event->rb) {
 		/*
 		 * Can happen when we close an event with re-directed output.
@@ -4980,6 +4983,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	struct perf_event_context *ctx;
 	int ret;
 
+	ret = security_perf_event_read(event);
+	if (ret)
+		return ret;
+
 	ctx = perf_event_ctx_lock(event);
 	ret = __perf_read(event, buf, count);
 	perf_event_ctx_unlock(event, ctx);
@@ -5244,6 +5251,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct perf_event_context *ctx;
 	long ret;
 
+	/* Treat ioctl like writes as it is likely a mutating operation. */
+	ret = security_perf_event_write(event);
+	if (ret)
+		return ret;
+
 	ctx = perf_event_ctx_lock(event);
 	ret = _perf_ioctl(event, cmd, arg);
 	perf_event_ctx_unlock(event, ctx);
@@ -5706,6 +5718,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return -EINVAL;
 
+	ret = security_perf_event_read(event);
+	if (ret)
+		return ret;
+
 	vma_size = vma->vm_end - vma->vm_start;
 
 	if (vma->vm_pgoff == 0) {
@@ -5819,7 +5835,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
 
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+	if ((locked > lock_limit) && perf_is_paranoid() &&
 		!capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
@@ -10553,11 +10569,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		}
 	}
 
+	err = security_perf_event_alloc(event);
+	if (err)
+		goto err_callchain_buffer;
+
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
 	return event;
 
+err_callchain_buffer:
+	if (!event->parent) {
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
+	}
 err_addr_filters:
 	kfree(event->addr_filter_ranges);
 
@@ -10675,9 +10700,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			attr->branch_sample_type = mask;
 		}
 		/* privileged levels capture (kernel, hv): check permissions */
-		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+		if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+			ret = perf_allow_kernel(attr);
+			if (ret)
+				return ret;
+		}
 	}
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10890,13 +10917,19 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
+	/* Do we allow access to perf_event_open(2) ? */
+	err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+	if (err)
+		return err;
+
 	err = perf_copy_attr(attr_uptr, &attr);
 	if (err)
 		return err;
 
 	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
+		err = perf_allow_kernel(&attr);
+		if (err)
+			return err;
 	}
 
 	if (attr.namespaces) {
@@ -10913,9 +10946,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	/* Only privileged users can get physical addresses */
-	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
-	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-		return -EACCES;
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+		err = perf_allow_kernel(&attr);
+		if (err)
+			return err;
+	}
 
 	err = security_locked_down(LOCKDOWN_PERF);
 	if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0892e38ed6fb..0917fee6ee7c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
 
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/security.h>
 #include "trace.h"
 #include "trace_probe.h"
 
@@ -26,8 +27,10 @@ static int	total_ref_count;
 static int perf_trace_event_perm(struct trace_event_call *tp_event,
 				 struct perf_event *p_event)
 {
+	int ret;
+
 	if (tp_event->perf_perm) {
-		int ret = tp_event->perf_perm(tp_event, p_event);
+		ret = tp_event->perf_perm(tp_event, p_event);
 		if (ret)
 			return ret;
 	}
@@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 
 	/* The ftrace function trace is allowed only for root. */
 	if (ftrace_event_is_function(tp_event)) {
-		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
-			return -EPERM;
+		ret = perf_allow_tracepoint(&p_event->attr);
+		if (ret)
+			return ret;
 
 		if (!is_sampling_event(p_event))
 			return 0;
@@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 	 * ...otherwise raw tracepoint data can be a severe data leak,
 	 * only allow root to have these.
 	 */
-	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	ret = perf_allow_tracepoint(&p_event->attr);
+	if (ret)
+		return ret;
 
 	return 0;
 }
diff --git a/security/security.c b/security/security.c
index 1bc000f834e2..cd2d18d2d279 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2404,3 +2404,30 @@ int security_locked_down(enum lockdown_reason what)
 	return call_int_hook(locked_down, 0, what);
 }
 EXPORT_SYMBOL(security_locked_down);
+
+#ifdef CONFIG_PERF_EVENTS
+int security_perf_event_open(struct perf_event_attr *attr, int type)
+{
+	return call_int_hook(perf_event_open, 0, attr, type);
+}
+
+int security_perf_event_alloc(struct perf_event *event)
+{
+	return call_int_hook(perf_event_alloc, 0, event);
+}
+
+void security_perf_event_free(struct perf_event *event)
+{
+	call_void_hook(perf_event_free, event);
+}
+
+int security_perf_event_read(struct perf_event *event)
+{
+	return call_int_hook(perf_event_read, 0, event);
+}
+
+int security_perf_event_write(struct perf_event *event)
+{
+	return call_int_hook(perf_event_write, 0, event);
+}
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9625b99e677f..28eb05490d59 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6795,6 +6795,67 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
 	.lbs_msg_msg = sizeof(struct msg_security_struct),
 };
 
+#ifdef CONFIG_PERF_EVENTS
+static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
+{
+	u32 requested, sid = current_sid();
+
+	if (type == PERF_SECURITY_OPEN)
+		requested = PERF_EVENT__OPEN;
+	else if (type == PERF_SECURITY_CPU)
+		requested = PERF_EVENT__CPU;
+	else if (type == PERF_SECURITY_KERNEL)
+		requested = PERF_EVENT__KERNEL;
+	else if (type == PERF_SECURITY_TRACEPOINT)
+		requested = PERF_EVENT__TRACEPOINT;
+	else
+		return -EINVAL;
+
+	return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
+			    requested, NULL);
+}
+
+static int selinux_perf_event_alloc(struct perf_event *event)
+{
+	struct perf_event_security_struct *perfsec;
+
+	perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
+	if (!perfsec)
+		return -ENOMEM;
+
+	perfsec->sid = current_sid();
+	event->security = perfsec;
+
+	return 0;
+}
+
+static void selinux_perf_event_free(struct perf_event *event)
+{
+	struct perf_event_security_struct *perfsec = event->security;
+
+	event->security = NULL;
+	kfree(perfsec);
+}
+
+static int selinux_perf_event_read(struct perf_event *event)
+{
+	struct perf_event_security_struct *perfsec = event->security;
+	u32 sid = current_sid();
+
+	return avc_has_perm(&selinux_state, sid, perfsec->sid,
+			    SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
+}
+
+static int selinux_perf_event_write(struct perf_event *event)
+{
+	struct perf_event_security_struct *perfsec = event->security;
+	u32 sid = current_sid();
+
+	return avc_has_perm(&selinux_state, sid, perfsec->sid,
+			    SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
+}
+#endif
+
 static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
 	LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -7030,6 +7091,14 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
 	LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
 #endif
+
+#ifdef CONFIG_PERF_EVENTS
+	LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
+	LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
+	LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
+	LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
+	LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
+#endif
 };
 
 static __init int selinux_init(void)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 32e9b03be3dd..7db24855e12d 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -244,6 +244,8 @@ struct security_class_mapping secclass_map[] = {
 	  {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
 	{ "xdp_socket",
 	  { COMMON_SOCK_PERMS, NULL } },
+	{ "perf_event",
+	  {"open", "cpu", "kernel", "tracepoint", "read", "write"} },
 	{ NULL }
   };
 
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 586b7abd0aa7..a4a86cbcfb0a 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -141,7 +141,11 @@ struct pkey_security_struct {
 };
 
 struct bpf_security_struct {
-	u32 sid;  /*SID of bpf obj creater*/
+	u32 sid;  /* SID of bpf obj creator */
+};
+
+struct perf_event_security_struct {
+	u32 sid;  /* SID of perf_event obj creator */
 };
 
 extern struct lsm_blob_sizes selinux_blob_sizes;
-- 
2.23.0.700.g56cf767bdb-goog

^ permalink raw reply related

* Re: [PATCH] perf_event: Add support for LSM and SELinux checks
From: Joel Fernandes @ 2019-10-14 16:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
	Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
	Daniel Borkmann, Ingo Molnar, James Morris, Jiri Olsa, Kees Cook,
	linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
	Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Yonghong Song
In-Reply-To: <20191014093544.GB2328@hirez.programming.kicks-ass.net>

On Mon, Oct 14, 2019 at 11:35:44AM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2019 at 12:03:30PM -0400, Joel Fernandes (Google) wrote:
> 
> > @@ -4761,6 +4762,7 @@ int perf_event_release_kernel(struct perf_event *event)
> >  	}
> >  
> >  no_ctx:
> > +	security_perf_event_free(event);
> >  	put_event(event); /* Must be the 'last' reference */
> >  	return 0;
> >  }
> 
> > @@ -10553,11 +10568,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> >  		}
> >  	}
> >  
> > +	err = security_perf_event_alloc(event);
> > +	if (err)
> > +		goto err_security;
> > +
> >  	/* symmetric to unaccount_event() in _free_event() */
> >  	account_event(event);
> >  
> >  	return event;
> >  
> > +err_security:
> >  err_addr_filters:
> >  	kfree(event->addr_filter_ranges);
> >  
> 
> There's a bunch of problems here I think:
> 
>  - err_security is named wrong; the naming scheme is to name the label
>    after the last thing that succeeded / first thing that needs to be
>    undone.
> 
>  - per that, you're forgetting to undo 'get_callchain_buffers()'

Yes, you're right. Tested your fix below. Sorry to miss this.

>  - perf_event_release_kernel() is not a full match to
>    perf_event_alloc(), inherited events get created by
>    perf_event_alloc() but never pass through
>    perf_event_release_kernel().

Oh, through inherit_event(). Thanks for pointing this semantic out, did not
know that.

> I'm thinking the below patch on top should ammend these issues; please
> verify.

Yes, applied your diff below and verified that the events are getting freed
as they were in my initial set of tests. The diff also looks good to me.

I squashed your diff below and will resend as v3. Since you modified this
patch a lot, I will add your Co-developed-by tag as well.

thanks, Peter!

 - Joel


> ---
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4540,6 +4540,8 @@ static void _free_event(struct perf_even
>  
>  	unaccount_event(event);
>  
> +	security_perf_event_free(event);
> +
>  	if (event->rb) {
>  		/*
>  		 * Can happen when we close an event with re-directed output.
> @@ -4774,7 +4776,6 @@ int perf_event_release_kernel(struct per
>  	}
>  
>  no_ctx:
> -	security_perf_event_free(event);
>  	put_event(event); /* Must be the 'last' reference */
>  	return 0;
>  }
> @@ -10595,14 +10596,18 @@ perf_event_alloc(struct perf_event_attr
>  
>  	err = security_perf_event_alloc(event);
>  	if (err)
> -		goto err_security;
> +		goto err_callchain_buffer;
>  
>  	/* symmetric to unaccount_event() in _free_event() */
>  	account_event(event);
>  
>  	return event;
>  
> -err_security:
> +err_callchain_buffer:
> +	if (!event->parent) {
> +		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> +			put_callchain_buffers();
> +	}
>  err_addr_filters:
>  	kfree(event->addr_filter_ranges);
>  

^ permalink raw reply

* Re: [PATCH] perf_event: Add support for LSM and SELinux checks
From: Peter Zijlstra @ 2019-10-14  9:35 UTC (permalink / raw)
  To: Joel Fernandes (Google)
  Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
	Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
	Daniel Borkmann, Ingo Molnar, James Morris, Jiri Olsa, Kees Cook,
	linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
	Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	Yonghong Song
In-Reply-To: <20191011160330.199604-1-joel@joelfernandes.org>

On Fri, Oct 11, 2019 at 12:03:30PM -0400, Joel Fernandes (Google) wrote:

> @@ -4761,6 +4762,7 @@ int perf_event_release_kernel(struct perf_event *event)
>  	}
>  
>  no_ctx:
> +	security_perf_event_free(event);
>  	put_event(event); /* Must be the 'last' reference */
>  	return 0;
>  }

> @@ -10553,11 +10568,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
>  		}
>  	}
>  
> +	err = security_perf_event_alloc(event);
> +	if (err)
> +		goto err_security;
> +
>  	/* symmetric to unaccount_event() in _free_event() */
>  	account_event(event);
>  
>  	return event;
>  
> +err_security:
>  err_addr_filters:
>  	kfree(event->addr_filter_ranges);
>  

There's a bunch of problems here I think:

 - err_security is named wrong; the naming scheme is to name the label
   after the last thing that succeeded / first thing that needs to be
   undone.

 - per that, you're forgetting to undo 'get_callchain_buffers()'

 - perf_event_release_kernel() is not a full match to
   perf_event_alloc(), inherited events get created by
   perf_event_alloc() but never pass through
   perf_event_release_kernel().


I'm thinking the below patch on top should ammend these issues; please
verify.

---
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4540,6 +4540,8 @@ static void _free_event(struct perf_even
 
 	unaccount_event(event);
 
+	security_perf_event_free(event);
+
 	if (event->rb) {
 		/*
 		 * Can happen when we close an event with re-directed output.
@@ -4774,7 +4776,6 @@ int perf_event_release_kernel(struct per
 	}
 
 no_ctx:
-	security_perf_event_free(event);
 	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
@@ -10595,14 +10596,18 @@ perf_event_alloc(struct perf_event_attr
 
 	err = security_perf_event_alloc(event);
 	if (err)
-		goto err_security;
+		goto err_callchain_buffer;
 
 	/* symmetric to unaccount_event() in _free_event() */
 	account_event(event);
 
 	return event;
 
-err_security:
+err_callchain_buffer:
+	if (!event->parent) {
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
+	}
 err_addr_filters:
 	kfree(event->addr_filter_ranges);
 

^ permalink raw reply

* kernel panic while using get_random_bytes
From: Temp Sha @ 2019-10-14  8:21 UTC (permalink / raw)
  To: linux-security-module

hi,

i use get_random_bytes() function for my randomness requirement in
kernel version 4.14.142
however is gives panic as soon as I call   get_random_bytes() in my module.


Oct 10 07:20:18 BUG: unable to handle kernel paging request at 00007f5563ced000
IP: chacha20_block+0x24d/0x280
PGD 800000010f7f8067 P4D 800000010f7f8067 PUD 161316067 PMD 1015a8067 PTE 0
Oops: 0002 [#1] PREEMPT SMP PTI
Modules linked in: ipi_hsl(PO) mymod(PO) e1000 ipv6 ftdi_sio usbserial
xt_tcpudp xt_mark iptable_nat nf_nat_ipv4 nf_conntrack_ipv4
nf_defrag_ipv4 nf_nat xt_connlimit nf_conntrack iptable_filter
ip_tables x_tables
CPU: 0 PID: 1841 Comm: hexdump Tainted: P           O    4.14.142-ws-symbol #1
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
task: ffff8881611da000 task.stack: ffffc900504c4000
RIP: 0010:chacha20_block+0x24d/0x280
RSP: 0018:ffffc900504c7c70 EFLAGS: 00010886
RAX: 0000000000000000 RBX: 00000000a88c95b0 RCX: 00007f5563ced000
RDX: ffff88810f79da00 RSI: 0000000015c4332e RDI: 000000007613f298
RBP: ffffc900504c7d00 R08: 000000009d39d68d R09: 00000000bfbdb51f
R10: 00000000ed798a26 R11: 0000000083c184dc R12: 0000000036fc61e0
R13: 00000000f9004639 R14: 0000000042c0d351 R15: 000000008a6cef0f
FS:  00007f5563cef700(0000) GS:ffff888167e00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5563ced000 CR3: 000000016117c000 CR4: 00000000000006b0
Call Trace:
 _extract_crng+0x6d/0xc0
 extract_crng+0x3a/0x40
 _get_random_bytes+0x56/0x1c0
 ? vprintk_func+0x3f/0xd0
 ? printk+0x3e/0x46
 get_random_bytes+0x2f/0x40
 xyz_packets+0x1110/0x11e0 [mymod]
 proc_reg_read+0x3d/0x60
 __vfs_read+0x23/0x120
 ? vm_mmap_pgoff+0x9d/0xd0
 vfs_read+0x8e/0x110
 SyS_read+0x48/0xc0
 do_syscall_64+0x5c/0x260
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f55631f


what could be the problem?


thanks

^ permalink raw reply

* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-13  0:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
	Matthew Garrett, James Morris James Morris, LSM List, Linux API,
	Ben Hutchings, Al Viro
In-Reply-To: <20191012203502.065258d2@gandalf.local.home>

On Sat, 12 Oct 2019 20:35:02 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Sat, 12 Oct 2019 15:56:15 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:  
> > >
> > >
> > > I bisected this down to the addition of the proxy_ops into tracefs for
> > > lockdown. It appears that the allocation of the proxy_ops and then freeing
> > > it in the destroy_inode callback, is causing havoc with the memory system.
> > > Reading the documentation about destroy_inode and talking with Linus about
> > > this, this is buggy and wrong.    
> > 
> > Can you still add the explanation about the inode memory leak to this message?
> > 
> > Right now it just says "it's buggy and wrong". True. But doesn't
> > explain _why_ it is buggy and wrong.
> >   
> 
> Sure. The patches just finished my testing (along with other fixes that
> I need to send you). I have to make a few other updates in the change
> log though, so I'll be rebasing them (but not touching the code), to
> clean up the change logs.
> 

I updated this change log to state:

"I bisected this down to the addition of the proxy_ops into tracefs for
lockdown. It appears that the allocation of the proxy_ops and then freeing
it in the destroy_inode callback, is causing havoc with the memory system.
Reading the documentation about destroy_inode and talking with Linus about
this, this is buggy and wrong. When defining the destroy_inode() method, it 
is expected that the destroy_inode() will also free the inode, and not just 
the extra allocations done in the creation of the inode. The faulty commit 
causes a memory leak of the inode data structure when they are deleted."

-- Steve

^ permalink raw reply

* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-13  0:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
	Matthew Garrett, James Morris James Morris, LSM List, Linux API,
	Ben Hutchings, Al Viro
In-Reply-To: <CAHk-=whE7GjKz9LtEVNw=zEgWr65N1mU7t2rA4MLiia8Zit6DQ@mail.gmail.com>

On Sat, 12 Oct 2019 15:56:15 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> >
> > I bisected this down to the addition of the proxy_ops into tracefs for
> > lockdown. It appears that the allocation of the proxy_ops and then freeing
> > it in the destroy_inode callback, is causing havoc with the memory system.
> > Reading the documentation about destroy_inode and talking with Linus about
> > this, this is buggy and wrong.  
> 
> Can you still add the explanation about the inode memory leak to this message?
> 
> Right now it just says "it's buggy and wrong". True. But doesn't
> explain _why_ it is buggy and wrong.
> 

Sure. The patches just finished my testing (along with other fixes that
I need to send you). I have to make a few other updates in the change
log though, so I'll be rebasing them (but not touching the code), to
clean up the change logs.

-- Steve

^ permalink raw reply

* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Linus Torvalds @ 2019-10-12 22:56 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
	Matthew Garrett, James Morris James Morris, LSM List, Linux API,
	Ben Hutchings, Al Viro
In-Reply-To: <20191012005920.630331484@goodmis.org>

On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
>
> I bisected this down to the addition of the proxy_ops into tracefs for
> lockdown. It appears that the allocation of the proxy_ops and then freeing
> it in the destroy_inode callback, is causing havoc with the memory system.
> Reading the documentation about destroy_inode and talking with Linus about
> this, this is buggy and wrong.

Can you still add the explanation about the inode memory leak to this message?

Right now it just says "it's buggy and wrong". True. But doesn't
explain _why_ it is buggy and wrong.

          Linus

^ permalink raw reply

* Re: [PATCH 4/7 v2] tracing: Have trace events system open call tracing_open_generic_tr()
From: Steven Rostedt @ 2019-10-12  2:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro
In-Reply-To: <20191012005921.091872328@goodmis.org>

On Fri, 11 Oct 2019 20:57:51 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1391,9 +1391,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
>  	struct trace_array *tr;
>  	int ret;
>  
> -	if (tracing_is_disabled())
> -		return -ENODEV;
> -
>  	/* Make sure the system still exists */
>  	mutex_lock(&event_mutex);
>  	mutex_lock(&trace_types_lock);
> @@ -1420,16 +1417,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
>  	WARN_ON(!dir);
>  
>  	/* Still need to increment the ref count of the system */
> -	if (trace_array_get(tr) < 0) {
> -		put_system(dir);
> -		return -ENODEV;
> -	}
> -
> -	ret = tracing_open_generic(inode, filp);
> -	if (ret < 0) {
> -		trace_array_put(tr);
> +	ret = tracing_open_generic_tr(inode, filp);
> +	if (ret < 0)
>  		put_system(dir);
> -	}
>  
>  	return ret;
>  }

I got a bit too aggressive on this patch. The subsystem_open() gets the
tr from a search, not from the inode. Thus it can't use the
tracing_open_generic_tr() call. It needs to do the trace_array_get()
directly.

V2 of this patch:

-- Steve

From b69083301044f587afefd5a4ac754a8c43ba0f0d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 11 Oct 2019 19:12:21 -0400
Subject: [PATCH] tracing: Have trace events system open call
 tracing_open_generic_tr()

Instead of having the trace events system open calls open code the taking of
the trace_array descriptor (with trace_array_get()) and then calling
trace_open_generic(), have it use the tracing_open_generic_tr() that does
the combination of the two. This requires making tracing_open_generic_tr()
global.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace.h        |  1 +
 kernel/trace/trace_events.c | 17 +++--------------
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa7d813b04c6..94f1b9124939 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void)
  * Open and update trace_array ref count.
  * Must have the current trace_array passed to it.
  */
-static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
 {
 	struct trace_array *tr = inode->i_private;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f801d154ff6a..854dbf4050f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
 void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+int tracing_open_generic_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
 bool tracer_tracing_is_on(struct trace_array *tr);
 void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b89cdfe20bc1..9613a757c902 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1440,28 +1440,17 @@ static int system_tr_open(struct inode *inode, struct file *filp)
 	struct trace_array *tr = inode->i_private;
 	int ret;
 
-	if (tracing_is_disabled())
-		return -ENODEV;
-
-	if (trace_array_get(tr) < 0)
-		return -ENODEV;
-
 	/* Make a temporary dir that has no system but points to tr */
 	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
-	if (!dir) {
-		trace_array_put(tr);
+	if (!dir)
 		return -ENOMEM;
-	}
-
-	dir->tr = tr;
 
-	ret = tracing_open_generic(inode, filp);
+	ret = tracing_open_generic_tr(inode, filp);
 	if (ret < 0) {
-		trace_array_put(tr);
 		kfree(dir);
 		return ret;
 	}
-
+	dir->tr = tr;
 	filp->private_data = dir;
 
 	return 0;
-- 
2.20.1


^ permalink raw reply related

* [PATCH 3/7 v2] tracing: Get trace_array reference for available_tracers files
From: Steven Rostedt @ 2019-10-12  0:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro, stable
In-Reply-To: <20191012005747.210722465@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

As instances may have different tracers available, we need to look at the
trace_array descriptor that shows lists the available tracers for the
instance. But there's a race between opening the file and the admin from
deleting the instance. The trace_array_get() needs to be called before
accessing the trace_array.

Cc: stable@vger.kernel.org
Fixes: 607e2ea167e56 ("tracing: Set up infrastructure to allow tracers for instances")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 252f79c435f8..fa7d813b04c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4355,9 +4355,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	if (tracing_disabled)
 		return -ENODEV;
 
+	if (trace_array_get(tr) < 0)
+		return -ENODEV;
+
 	ret = seq_open(file, &show_traces_seq_ops);
-	if (ret)
+	if (ret) {
+		trace_array_put(tr);
 		return ret;
+	}
 
 	m = file->private_data;
 	m->private = tr;
@@ -4365,6 +4370,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int show_traces_release(struct inode *inode, struct file *file)
+{
+	struct trace_array *tr = inode->i_private;
+
+	trace_array_put(tr);
+	return seq_release(inode, file);
+}
+
 static ssize_t
 tracing_write_stub(struct file *filp, const char __user *ubuf,
 		   size_t count, loff_t *ppos)
@@ -4395,8 +4408,8 @@ static const struct file_operations tracing_fops = {
 static const struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
-	.release	= seq_release,
 	.llseek		= seq_lseek,
+	.release	= show_traces_release,
 };
 
 static ssize_t
-- 
2.23.0



^ permalink raw reply related

* [PATCH 0/7 v2] tracing: Fix tracefs lockdown and various clean ups
From: Steven Rostedt @ 2019-10-12  0:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro

It appears that using destroy_inode() to clean up the proxy_ops that was
used by the lockdown code to have all open calls to the tracefs directory
was totally broken. It caused the inodes to not be cleaned up as the
destroy_inode() method is expected to clean up the inode, and not just what
it allocated as extra.

Linus suggested to get rid of the proxy_ops in tracefs, and just put the
checks in the open functions themselves. This also gives us a bit more fine
grain control to what exactly can be accessed.

Currently, I left the event format files (as they may need to be used by
something other than tracing), and enabled_functions, which shows what
functions are currently being traced. Not sure it is wise to not display
that information.

They can always be locked down later if need be.

Steven Rostedt (VMware) (7):
      tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
      ftrace: Get a reference counter for the trace_array on filter files
      tracing: Get trace_array reference for available_tracers files
      tracing: Have trace events system open call tracing_open_generic_tr()
      tracing: Add tracing_check_open_get_tr()
      tracing: Add some more locked_down checks
      tracing: Do not create tracefs files if tracefs lockdown is in effect

----
 fs/tracefs/inode.c                  |  46 ++----------
 kernel/trace/ftrace.c               |  55 ++++++++++----
 kernel/trace/trace.c                | 138 ++++++++++++++++++++++--------------
 kernel/trace/trace.h                |   2 +
 kernel/trace/trace_dynevent.c       |   4 ++
 kernel/trace/trace_events.c         |  49 +++++--------
 kernel/trace/trace_events_hist.c    |  13 +++-
 kernel/trace/trace_events_trigger.c |   8 ++-
 kernel/trace/trace_kprobe.c         |  12 +++-
 kernel/trace/trace_printk.c         |   7 ++
 kernel/trace/trace_stack.c          |   8 +++
 kernel/trace/trace_stat.c           |   6 +-
 kernel/trace/trace_uprobe.c         |  11 +++
 13 files changed, 220 insertions(+), 139 deletions(-)

^ permalink raw reply

* [PATCH 4/7 v2] tracing: Have trace events system open call tracing_open_generic_tr()
From: Steven Rostedt @ 2019-10-12  0:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

Instead of having the trace events system open calls open code the taking of
the trace_array descriptor (with trace_array_get()) and then calling
trace_open_generic(), have it use the tracing_open_generic_tr() that does
the combination of the two. This requires making tracing_open_generic_tr()
global.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace.h        |  1 +
 kernel/trace/trace_events.c | 31 +++++--------------------------
 3 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa7d813b04c6..94f1b9124939 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void)
  * Open and update trace_array ref count.
  * Must have the current trace_array passed to it.
  */
-static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
 {
 	struct trace_array *tr = inode->i_private;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f801d154ff6a..854dbf4050f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
 void tracing_reset_current(int cpu);
 void tracing_reset_all_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+int tracing_open_generic_tr(struct inode *inode, struct file *filp);
 bool tracing_is_disabled(void);
 bool tracer_tracing_is_on(struct trace_array *tr);
 void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b89cdfe20bc1..0d29f2f13477 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1391,9 +1391,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 	struct trace_array *tr;
 	int ret;
 
-	if (tracing_is_disabled())
-		return -ENODEV;
-
 	/* Make sure the system still exists */
 	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
@@ -1420,16 +1417,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 	WARN_ON(!dir);
 
 	/* Still need to increment the ref count of the system */
-	if (trace_array_get(tr) < 0) {
-		put_system(dir);
-		return -ENODEV;
-	}
-
-	ret = tracing_open_generic(inode, filp);
-	if (ret < 0) {
-		trace_array_put(tr);
+	ret = tracing_open_generic_tr(inode, filp);
+	if (ret < 0)
 		put_system(dir);
-	}
 
 	return ret;
 }
@@ -1440,28 +1430,17 @@ static int system_tr_open(struct inode *inode, struct file *filp)
 	struct trace_array *tr = inode->i_private;
 	int ret;
 
-	if (tracing_is_disabled())
-		return -ENODEV;
-
-	if (trace_array_get(tr) < 0)
-		return -ENODEV;
-
 	/* Make a temporary dir that has no system but points to tr */
 	dir = kzalloc(sizeof(*dir), GFP_KERNEL);
-	if (!dir) {
-		trace_array_put(tr);
+	if (!dir)
 		return -ENOMEM;
-	}
 
-	dir->tr = tr;
-
-	ret = tracing_open_generic(inode, filp);
+	ret = tracing_open_generic_tr(inode, filp);
 	if (ret < 0) {
-		trace_array_put(tr);
 		kfree(dir);
 		return ret;
 	}
-
+	dir->tr = tr;
 	filp->private_data = dir;
 
 	return 0;
-- 
2.23.0



^ permalink raw reply related

* [PATCH 7/7 v2] tracing: Do not create tracefs files if tracefs lockdown is in effect
From: Steven Rostedt @ 2019-10-12  0:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

If on boot up, lockdown is activated for tracefs, don't even bother creating
the files. This can also prevent instances from being created if lockdown is
in effect.

Link: http://lkml.kernel.org/r/CAHk-=whC6Ji=fWnjh2+eS4b15TnbsS4VPVtvBOwCy1jjEG_JHQ@mail.gmail.com

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 fs/tracefs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index eeeae0475da9..0caa151cae4e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/tracefs.h>
 #include <linux/fsnotify.h>
+#include <linux/security.h>
 #include <linux/seq_file.h>
 #include <linux/parser.h>
 #include <linux/magic.h>
@@ -390,6 +391,9 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
 	struct dentry *dentry;
 	struct inode *inode;
 
+	if (security_locked_down(LOCKDOWN_TRACEFS))
+		return NULL;
+
 	if (!(mode & S_IFMT))
 		mode |= S_IFREG;
 	BUG_ON(!S_ISREG(mode));
-- 
2.23.0



^ permalink raw reply related

* [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-12  0:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
	James Morris James Morris, LSM List, Linux API, Ben Hutchings,
	Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>

From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>

Running the latest kernel through my "make instances" stress tests, I
triggered the following bug (with KASAN and kmemleak enabled):

mkdir invoked oom-killer:
gfp_mask=0x40cd0(GFP_KERNEL|__GFP_COMP|__GFP_RECLAIMABLE), order=0,
oom_score_adj=0
CPU: 1 PID: 2229 Comm: mkdir Not tainted 5.4.0-rc2-test #325
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
Call Trace:
 dump_stack+0x64/0x8c
 dump_header+0x43/0x3b7
 ? trace_hardirqs_on+0x48/0x4a
 oom_kill_process+0x68/0x2d5
 out_of_memory+0x2aa/0x2d0
 __alloc_pages_nodemask+0x96d/0xb67
 __alloc_pages_node+0x19/0x1e
 alloc_slab_page+0x17/0x45
 new_slab+0xd0/0x234
 ___slab_alloc.constprop.86+0x18f/0x336
 ? alloc_inode+0x2c/0x74
 ? irq_trace+0x12/0x1e
 ? tracer_hardirqs_off+0x1d/0xd7
 ? __slab_alloc.constprop.85+0x21/0x53
 __slab_alloc.constprop.85+0x31/0x53
 ? __slab_alloc.constprop.85+0x31/0x53
 ? alloc_inode+0x2c/0x74
 kmem_cache_alloc+0x50/0x179
 ? alloc_inode+0x2c/0x74
 alloc_inode+0x2c/0x74
 new_inode_pseudo+0xf/0x48
 new_inode+0x15/0x25
 tracefs_get_inode+0x23/0x7c
 ? lookup_one_len+0x54/0x6c
 tracefs_create_file+0x53/0x11d
 trace_create_file+0x15/0x33
 event_create_dir+0x2a3/0x34b
 __trace_add_new_event+0x1c/0x26
 event_trace_add_tracer+0x56/0x86
 trace_array_create+0x13e/0x1e1
 instance_mkdir+0x8/0x17
 tracefs_syscall_mkdir+0x39/0x50
 ? get_dname+0x31/0x31
 vfs_mkdir+0x78/0xa3
 do_mkdirat+0x71/0xb0
 sys_mkdir+0x19/0x1b
 do_fast_syscall_32+0xb0/0xed

I bisected this down to the addition of the proxy_ops into tracefs for
lockdown. It appears that the allocation of the proxy_ops and then freeing
it in the destroy_inode callback, is causing havoc with the memory system.
Reading the documentation about destroy_inode and talking with Linus about
this, this is buggy and wrong.

Instead of allocating the proxy_ops (and then having to free it) the checks
should be done by the open functions themselves, and not hack into the
tracefs directory. First revert the tracefs updates for locked_down and then
later we can add the locked_down checks in the kernel/trace files.

Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home

Fixes: ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 fs/tracefs/inode.c | 42 +-----------------------------------------
 1 file changed, 1 insertion(+), 41 deletions(-)

diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 9fc14e38927f..eeeae0475da9 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -20,7 +20,6 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
-#include <linux/security.h>
 
 #define TRACEFS_DEFAULT_MODE	0700
 
@@ -28,25 +27,6 @@ static struct vfsmount *tracefs_mount;
 static int tracefs_mount_count;
 static bool tracefs_registered;
 
-static int default_open_file(struct inode *inode, struct file *filp)
-{
-	struct dentry *dentry = filp->f_path.dentry;
-	struct file_operations *real_fops;
-	int ret;
-
-	if (!dentry)
-		return -EINVAL;
-
-	ret = security_locked_down(LOCKDOWN_TRACEFS);
-	if (ret)
-		return ret;
-
-	real_fops = dentry->d_fsdata;
-	if (!real_fops->open)
-		return 0;
-	return real_fops->open(inode, filp);
-}
-
 static ssize_t default_read_file(struct file *file, char __user *buf,
 				 size_t count, loff_t *ppos)
 {
@@ -241,12 +221,6 @@ static int tracefs_apply_options(struct super_block *sb)
 	return 0;
 }
 
-static void tracefs_destroy_inode(struct inode *inode)
-{
-	if (S_ISREG(inode->i_mode))
-		kfree(inode->i_fop);
-}
-
 static int tracefs_remount(struct super_block *sb, int *flags, char *data)
 {
 	int err;
@@ -283,7 +257,6 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
 static const struct super_operations tracefs_super_operations = {
 	.statfs		= simple_statfs,
 	.remount_fs	= tracefs_remount,
-	.destroy_inode  = tracefs_destroy_inode,
 	.show_options	= tracefs_show_options,
 };
 
@@ -414,7 +387,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
-	struct file_operations *proxy_fops;
 	struct dentry *dentry;
 	struct inode *inode;
 
@@ -430,20 +402,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
 	if (unlikely(!inode))
 		return failed_creating(dentry);
 
-	proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
-	if (unlikely(!proxy_fops)) {
-		iput(inode);
-		return failed_creating(dentry);
-	}
-
-	if (!fops)
-		fops = &tracefs_file_operations;
-
-	dentry->d_fsdata = (void *)fops;
-	memcpy(proxy_fops, fops, sizeof(*proxy_fops));
-	proxy_fops->open = default_open_file;
 	inode->i_mode = mode;
-	inode->i_fop = proxy_fops;
+	inode->i_fop = fops ? fops : &tracefs_file_operations;
 	inode->i_private = data;
 	d_instantiate(dentry, inode);
 	fsnotify_create(dentry->d_parent->d_inode, dentry);
-- 
2.23.0



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox