* [RFC PATCH 05/21] pipe: Conditionalise wakeup in pipe_read()
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>
Only do a wakeup in pipe_read() if we made space in a completely full
buffer. The producer shouldn't be waiting on pipe->wait otherwise.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/pipe.c | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/fs/pipe.c b/fs/pipe.c
index 08af7e7bbea2..0d25cb090a03 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -327,11 +327,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
spin_lock_irq(&pipe->wait.lock);
tail++;
pipe_commit_read(pipe, tail);
- do_wakeup = 0;
- prelocked_wake_up_interruptible_sync_poll(
- &pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ do_wakeup = 1;
+ if (head - (tail - 1) == pipe->max_usage)
+ prelocked_wake_up_interruptible_sync_poll(
+ &pipe->wait, EPOLLOUT | EPOLLWRNORM);
spin_unlock_irq(&pipe->wait.lock);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ if (head - (tail - 1) == pipe->max_usage)
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
total_len -= chars;
if (!total_len)
@@ -360,11 +362,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = -ERESTARTSYS;
break;
}
- if (do_wakeup) {
- wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- do_wakeup = 0;
- }
pipe_wait(pipe);
}
__pipe_unlock(pipe);
^ permalink raw reply related
* [RFC PATCH 04/21] pipe: Advance tail pointer inside of wait spinlock in pipe_read()
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>
Advance the pipe ring tail pointer inside of wait spinlock in pipe_read()
so that the pipe can be written into with kernel notifications from
contexts where pipe->mutex cannot be taken.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/pipe.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/pipe.c b/fs/pipe.c
index 0574277bad18..08af7e7bbea2 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -324,9 +324,14 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (!buf->len) {
pipe_buf_release(pipe, buf);
+ spin_lock_irq(&pipe->wait.lock);
tail++;
pipe_commit_read(pipe, tail);
- do_wakeup = 1;
+ do_wakeup = 0;
+ prelocked_wake_up_interruptible_sync_poll(
+ &pipe->wait, EPOLLOUT | EPOLLWRNORM);
+ spin_unlock_irq(&pipe->wait.lock);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
total_len -= chars;
if (!total_len)
@@ -358,6 +363,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (do_wakeup) {
wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ do_wakeup = 0;
}
pipe_wait(pipe);
}
^ permalink raw reply related
* [RFC PATCH 03/21] pipe: Use head and tail pointers for the ring, not cursor and length
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>
Convert pipes to use head and tail pointers for the buffer ring rather than
pointer and length as the latter requires two atomic ops to update (or a
combined op) whereas the former only requires one.
(1) The head pointer is the point at which production occurs and points to
the slot in which the next buffer will be placed. This is equivalent
to pipe->curbuf + pipe->nrbufs.
The head pointer belongs to the write-side.
(2) The tail pointer is the point at which consumption occurs. It points
to the next slot to be consumed. This is equivalent to pipe->curbuf.
The tail pointer belongs to the read-side.
(3) head and tail are allowed to run to UINT_MAX and wrap naturally. They
are only masked off when the array is being accessed, e.g.:
pipe->bufs[head & mask]
This means that it is not necessary to have a dead slot in the ring as
head == tail isn't ambiguous.
(4) The ring is empty if "head == tail".
(5) The occupancy of the ring is "head - tail".
(6) The number of free slots in the ring is "(tail + pipe->ring_size) -
head".
(7) The ring is full if "head >= (tail + pipe->ring_size)", which can also
be written as "head - tail >= pipe->ring_size".
Also split pipe->buffers into pipe->ring_size (which indicates the size of the
ring) and pipe->max_usage (which restricts the amount of ring that write() is
allowed to fill). This allows for a pipe that is both writable by the kernel
notification facility and by userspace, allowing plenty of ring space for
notifications to be added whilst preventing userspace from being able to use
up too much buffer space.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/fuse/dev.c | 31 +++--
fs/pipe.c | 168 ++++++++++++++++------------
fs/splice.c | 183 ++++++++++++++++++------------
include/linux/pipe_fs_i.h | 38 ++++++
include/linux/uio.h | 4 -
lib/iov_iter.c | 274 ++++++++++++++++++++++++++-------------------
6 files changed, 421 insertions(+), 277 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index dadd617d826c..8bbe5ec6c336 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->pipebufs++;
cs->nr_segs--;
} else {
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
page = alloc_page(GFP_HIGHUSER);
@@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
struct pipe_buffer *buf;
int err;
- if (cs->nr_segs == cs->pipe->buffers)
+ if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
err = unlock_request(cs->req);
@@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (!fud)
return -EPERM;
- bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
+ bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer),
GFP_KERNEL);
if (!bufs)
return -ENOMEM;
@@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (ret < 0)
goto out;
- if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+ if (pipe->head - pipe->tail + cs.nr_segs > pipe->max_usage) {
ret = -EIO;
goto out;
}
@@ -1935,6 +1935,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
+ unsigned int head, tail, mask, count;
unsigned nbuf;
unsigned idx;
struct pipe_buffer *bufs;
@@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
pipe_lock(pipe);
- bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
- GFP_KERNEL);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+ count = head - tail;
+
+ bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs) {
pipe_unlock(pipe);
return -ENOMEM;
@@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
nbuf = 0;
rem = 0;
- for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
- rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+ for (idx = tail; idx < head && rem < len; idx++)
+ rem += pipe->bufs[idx & mask].len;
ret = -EINVAL;
if (rem < len)
@@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- BUG_ON(nbuf >= pipe->buffers);
- BUG_ON(!pipe->nrbufs);
- ibuf = &pipe->bufs[pipe->curbuf];
+ BUG_ON(nbuf >= pipe->ring_size);
+ BUG_ON(tail == head);
+ ibuf = &pipe->bufs[tail];
obuf = &bufs[nbuf];
if (rem >= ibuf->len) {
*obuf = *ibuf;
ibuf->ops = NULL;
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe_commit_read(pipe, tail);
} else {
if (!pipe_buf_get(pipe, ibuf))
goto out_free;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8a2ab2f974bd..0574277bad18 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -43,10 +43,11 @@ unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
/*
- * We use a start+len construction, which provides full use of the
- * allocated memory.
- * -- Florian Coosmann (FGC)
- *
+ * We use head and tail indices that aren't masked off, except at the point of
+ * dereference, but rather they're allowed to wrap naturally. This means there
+ * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
+ * -- David Howells 2019-09-23.
+ *
* Reads with count = 0 should always return 0.
* -- Julian Bradfield 1999-06-07.
*
@@ -285,10 +286,12 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
ret = 0;
__pipe_lock(pipe);
for (;;) {
- int bufs = pipe->nrbufs;
- if (bufs) {
- int curbuf = pipe->curbuf;
- struct pipe_buffer *buf = pipe->bufs + curbuf;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+
+ if (tail != head) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
@@ -321,17 +324,17 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (!buf->len) {
pipe_buf_release(pipe, buf);
- curbuf = (curbuf + 1) & (pipe->buffers - 1);
- pipe->curbuf = curbuf;
- pipe->nrbufs = --bufs;
+ tail++;
+ pipe_commit_read(pipe, tail);
do_wakeup = 1;
}
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
+ if (tail != head) /* More to do? */
+ continue;
}
- if (bufs) /* More to do? */
- continue;
+
if (!pipe->writers)
break;
if (!pipe->waiting_writers) {
@@ -380,6 +383,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
+ unsigned int head, tail, buffers, mask;
ssize_t ret = 0;
int do_wakeup = 0;
size_t total_len = iov_iter_count(from);
@@ -397,12 +401,15 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
+ tail = pipe->tail;
+ head = pipe->head;
+ buffers = pipe->max_usage;
+ mask = pipe->ring_size - 1;
+
/* We try to merge small writes */
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
- if (pipe->nrbufs && chars != 0) {
- int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
- (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + lastbuf;
+ if (head != tail && chars != 0) {
+ struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
int offset = buf->offset + buf->len;
if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
@@ -423,18 +430,16 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
}
for (;;) {
- int bufs;
-
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
- bufs = pipe->nrbufs;
- if (bufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+ tail = pipe->tail;
+ if (head - tail < buffers) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
@@ -470,14 +475,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
buf->ops = &packet_pipe_buf_ops;
buf->flags = PIPE_BUF_FLAG_PACKET;
}
- pipe->nrbufs = ++bufs;
+
+ head++;
+ pipe_commit_write(pipe, head);
pipe->tmp_page = NULL;
if (!iov_iter_count(from))
break;
}
- if (bufs < pipe->buffers)
+
+ if (head - tail < buffers)
continue;
+
+ /* Wait for buffer space to become available. */
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
@@ -515,17 +525,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, buf, nrbufs;
+ int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
__pipe_lock(pipe);
count = 0;
- buf = pipe->curbuf;
- nrbufs = pipe->nrbufs;
- while (--nrbufs >= 0) {
- count += pipe->bufs[buf].len;
- buf = (buf+1) & (pipe->buffers - 1);
+ head = pipe->head;
+ tail = pipe->tail;
+ mask = pipe->ring_size - 1;
+
+ while (tail < head) {
+ count += pipe->bufs[tail & mask].len;
+ tail++;
}
__pipe_unlock(pipe);
@@ -541,21 +553,26 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- int nrbufs;
+ unsigned int head = READ_ONCE(pipe->head);
+ unsigned int tail = READ_ONCE(pipe->tail);
+ unsigned int occupancy = head - tail;
poll_wait(filp, &pipe->wait, wait);
+ BUG_ON(occupancy > pipe->ring_size);
+
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = pipe->nrbufs;
mask = 0;
if (filp->f_mode & FMODE_READ) {
- mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
+ if (occupancy > 0)
+ mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
- mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
+ if (occupancy < pipe->max_usage)
+ mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
@@ -679,7 +696,8 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (pipe->bufs) {
init_waitqueue_head(&pipe->wait);
pipe->r_counter = pipe->w_counter = 1;
- pipe->buffers = pipe_bufs;
+ pipe->max_usage = pipe_bufs;
+ pipe->ring_size = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
@@ -697,9 +715,9 @@ void free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- (void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
+ (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
free_uid(pipe->user);
- for (i = 0; i < pipe->buffers; i++) {
+ for (i = 0; i < pipe->ring_size; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -880,7 +898,7 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
- int cur = *cnt;
+ int cur = *cnt;
while (cur == *cnt) {
pipe_wait(pipe);
@@ -955,7 +973,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
}
break;
-
+
case FMODE_WRITE:
/*
* O_WRONLY
@@ -975,7 +993,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
goto err_wr;
}
break;
-
+
case FMODE_READ | FMODE_WRITE:
/*
* O_RDWR
@@ -1054,14 +1072,14 @@ unsigned int round_pipe_size(unsigned long size)
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
struct pipe_buffer *bufs;
- unsigned int size, nr_pages;
+ unsigned int size, nr_slots, head, tail, mask, n;
unsigned long user_bufs;
long ret = 0;
size = round_pipe_size(arg);
- nr_pages = size >> PAGE_SHIFT;
+ nr_slots = size >> PAGE_SHIFT;
- if (!nr_pages)
+ if (!nr_slots)
return -EINVAL;
/*
@@ -1071,13 +1089,13 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
* Decreasing the pipe capacity is always permitted, even
* if the user is currently over a limit.
*/
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
return -EPERM;
- user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);
+ user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
- if (nr_pages > pipe->buffers &&
+ if (nr_slots > pipe->ring_size &&
(too_many_pipe_buffers_hard(user_bufs) ||
too_many_pipe_buffers_soft(user_bufs)) &&
is_unprivileged_user()) {
@@ -1086,17 +1104,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
}
/*
- * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
- * expect a lot of shrink+grow operations, just free and allocate
- * again like we would do for growing. If the pipe currently
+ * We can shrink the pipe, if arg is greater than the ring occupancy.
+ * Since we don't expect a lot of shrink+grow operations, just free and
+ * allocate again like we would do for growing. If the pipe currently
* contains more buffers than arg, then return busy.
*/
- if (nr_pages < pipe->nrbufs) {
+ mask = pipe->ring_size - 1;
+ head = pipe->head & mask;
+ tail = pipe->tail & mask;
+ n = pipe->head - pipe->tail;
+ if (nr_slots < n) {
ret = -EBUSY;
goto out_revert_acct;
}
- bufs = kcalloc(nr_pages, sizeof(*bufs),
+ bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs)) {
ret = -ENOMEM;
@@ -1105,33 +1127,33 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
/*
* The pipe array wraps around, so just start the new one at zero
- * and adjust the indexes.
+ * and adjust the indices.
*/
- if (pipe->nrbufs) {
- unsigned int tail;
- unsigned int head;
-
- tail = pipe->curbuf + pipe->nrbufs;
- if (tail < pipe->buffers)
- tail = 0;
- else
- tail &= (pipe->buffers - 1);
-
- head = pipe->nrbufs - tail;
- if (head)
- memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
- if (tail)
- memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
+ if (n > 0) {
+ if (head > tail) {
+ memcpy(bufs, &pipe->bufs[tail], n * sizeof(struct pipe_buffer));
+ } else {
+ unsigned int s = pipe->ring_size - tail;
+ if (n - s > 0)
+ memcpy(bufs + s, &pipe->bufs[0],
+ (n - s) * sizeof(struct pipe_buffer));
+ memcpy(bufs, &pipe->bufs[tail], s * sizeof(struct pipe_buffer));
+ }
}
- pipe->curbuf = 0;
+ head = n;
+ tail = 0;
+
kfree(pipe->bufs);
pipe->bufs = bufs;
- pipe->buffers = nr_pages;
- return nr_pages * PAGE_SIZE;
+ pipe->ring_size = nr_slots;
+ pipe->max_usage = nr_slots;
+ pipe->tail = tail;
+ pipe->head = head;
+ return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
- (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
+ (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
return ret;
}
@@ -1161,7 +1183,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
ret = pipe_set_size(pipe, arg);
break;
case F_GETPIPE_SZ:
- ret = pipe->buffers * PAGE_SIZE;
+ ret = pipe->max_usage * PAGE_SIZE;
break;
default:
ret = -EINVAL;
diff --git a/fs/splice.c b/fs/splice.c
index 98412721f056..7f5f59e2967b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -185,6 +185,9 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd)
{
unsigned int spd_pages = spd->nr_pages;
+ unsigned int tail = pipe->tail;
+ unsigned int head = pipe->head;
+ unsigned int mask = pipe->ring_size - 1;
int ret = 0, page_nr = 0;
if (!spd_pages)
@@ -196,9 +199,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
goto out;
}
- while (pipe->nrbufs < pipe->buffers) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- struct pipe_buffer *buf = pipe->bufs + newbuf;
+ while (head - tail < pipe->max_usage) {
+ struct pipe_buffer *buf = &pipe->bufs[head & mask];
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -207,7 +209,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
buf->ops = spd->ops;
buf->flags = 0;
- pipe->nrbufs++;
+ head++;
+ pipe_commit_write(pipe, head);
page_nr++;
ret += buf->len;
@@ -228,17 +231,19 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
- } else if (pipe->nrbufs == pipe->buffers) {
+ } else if (head - tail >= pipe->max_usage) {
ret = -EAGAIN;
} else {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
- pipe->bufs[newbuf] = *buf;
- pipe->nrbufs++;
+ pipe->bufs[head & mask] = *buf;
+ pipe_commit_write(pipe, head + 1);
return buf->len;
}
pipe_buf_release(pipe, buf);
@@ -252,14 +257,14 @@ EXPORT_SYMBOL(add_to_pipe);
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
- unsigned int buffers = READ_ONCE(pipe->buffers);
+ unsigned int max_usage = READ_ONCE(pipe->max_usage);
- spd->nr_pages_max = buffers;
- if (buffers <= PIPE_DEF_BUFFERS)
+ spd->nr_pages_max = max_usage;
+ if (max_usage <= PIPE_DEF_BUFFERS)
return 0;
- spd->pages = kmalloc_array(buffers, sizeof(struct page *), GFP_KERNEL);
- spd->partial = kmalloc_array(buffers, sizeof(struct partial_page),
+ spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
GFP_KERNEL);
if (spd->pages && spd->partial)
@@ -298,10 +303,11 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct iov_iter to;
struct kiocb kiocb;
- int idx, ret;
+ unsigned int i_head;
+ int ret;
iov_iter_pipe(&to, READ, pipe, len);
- idx = to.idx;
+ i_head = to.head;
init_sync_kiocb(&kiocb, in);
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to);
@@ -309,7 +315,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
*ppos = kiocb.ki_pos;
file_accessed(in);
} else if (ret < 0) {
- to.idx = idx;
+ to.head = i_head;
to.iov_offset = 0;
iov_iter_advance(&to, 0); /* to free what was emitted */
/*
@@ -370,11 +376,12 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
struct iov_iter to;
struct page **pages;
unsigned int nr_pages;
+ unsigned int mask;
size_t offset, base, copied = 0;
ssize_t res;
int i;
- if (pipe->nrbufs == pipe->buffers)
+ if (pipe->head - pipe->tail == pipe->max_usage)
return -EAGAIN;
/*
@@ -400,8 +407,9 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
}
}
- pipe->bufs[to.idx].offset = offset;
- pipe->bufs[to.idx].len -= offset;
+ mask = pipe->ring_size - 1;
+ pipe->bufs[to.head & mask].offset = offset;
+ pipe->bufs[to.head & mask].len -= offset;
for (i = 0; i < nr_pages; i++) {
size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
@@ -443,7 +451,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
- if (sd->len < sd->total_len && pipe->nrbufs > 1)
+ if (sd->len < sd->total_len && pipe->head - pipe->tail > 1)
more |= MSG_SENDPAGE_NOTLAST;
return file->f_op->sendpage(file, buf->page, buf->offset,
@@ -481,10 +489,13 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
int ret;
- while (pipe->nrbufs) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ while (tail != head) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -511,8 +522,8 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
if (!buf->len) {
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe_commit_read(pipe, tail);
if (pipe->files)
sd->need_wakeup = true;
}
@@ -543,7 +554,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
- while (!pipe->nrbufs) {
+ while (pipe->tail == pipe->head) {
if (!pipe->writers)
return 0;
@@ -686,7 +697,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.pos = *ppos,
.u.file = out,
};
- int nbufs = pipe->buffers;
+ int nbufs = pipe->max_usage;
struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
ssize_t ret;
@@ -699,16 +710,19 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
struct iov_iter from;
+ unsigned int head = pipe->head;
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
size_t left;
- int n, idx;
+ int n;
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- if (unlikely(nbufs < pipe->buffers)) {
+ if (unlikely(nbufs < pipe->max_usage)) {
kfree(array);
- nbufs = pipe->buffers;
+ nbufs = pipe->max_usage;
array = kcalloc(nbufs, sizeof(struct bio_vec),
GFP_KERNEL);
if (!array) {
@@ -719,16 +733,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* build the vector */
left = sd.total_len;
- for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
- struct pipe_buffer *buf = pipe->bufs + idx;
+ for (n = 0; head - tail > 0 && left && n < nbufs; tail++, n++) {
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t this_len = buf->len;
if (this_len > left)
this_len = left;
- if (idx == pipe->buffers - 1)
- idx = -1;
-
ret = pipe_buf_confirm(pipe, buf);
if (unlikely(ret)) {
if (ret == -ENODATA)
@@ -752,14 +763,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
*ppos = sd.pos;
/* dismiss the fully eaten buffers, adjust the partial one */
+ tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
pipe_buf_release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
- pipe->nrbufs--;
+ tail++;
+ pipe_commit_read(pipe, tail);
if (pipe->files)
sd.need_wakeup = true;
} else {
@@ -942,15 +954,17 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
sd->flags &= ~SPLICE_F_NONBLOCK;
more = sd->flags & SPLICE_F_MORE;
- WARN_ON_ONCE(pipe->nrbufs != 0);
+ WARN_ON_ONCE(pipe->head != pipe->tail);
while (len) {
+ unsigned int p_occupancy, p_space;
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
/* Don't try to read more the pipe has space for. */
- read_len = min_t(size_t, len,
- (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+ p_occupancy = READ_ONCE(pipe->head) - READ_ONCE(pipe->tail);
+ p_space = pipe->max_usage - p_occupancy;
+ read_len = min_t(size_t, len, p_space << PAGE_SHIFT);
ret = do_splice_to(in, &pos, pipe, read_len, flags);
if (unlikely(ret <= 0))
goto out_release;
@@ -989,7 +1003,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
}
done:
- pipe->nrbufs = pipe->curbuf = 0;
+ pipe->tail = pipe->head = 0;
file_accessed(in);
return bytes;
@@ -998,8 +1012,8 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < pipe->buffers; i++) {
- struct pipe_buffer *buf = pipe->bufs + i;
+ for (i = 0; i < pipe->ring_size; i++) {
+ struct pipe_buffer *buf = &pipe->bufs[i];
if (buf->ops)
pipe_buf_release(pipe, buf);
@@ -1075,7 +1089,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (pipe->nrbufs != pipe->buffers)
+ if (pipe->head - pipe->tail < pipe->max_usage)
return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
@@ -1442,16 +1456,16 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
int ret;
/*
- * Check ->nrbufs without the inode lock first. This function
+ * Check the pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs)
+ if (pipe->head != pipe->tail)
return 0;
ret = 0;
pipe_lock(pipe);
- while (!pipe->nrbufs) {
+ while (pipe->head == pipe->tail) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -1483,13 +1497,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check ->nrbufs without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs < pipe->buffers)
+ if (pipe->head - pipe->tail >= pipe->max_usage)
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe->nrbufs >= pipe->buffers) {
+ while (pipe->head - pipe->tail >= pipe->max_usage) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1520,7 +1534,10 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0;
bool input_wakeup = false;
@@ -1540,7 +1557,14 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
+ size_t o_len;
+
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
@@ -1548,14 +1572,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
break;
}
- if (!ipipe->nrbufs && !ipipe->writers)
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
+ if (i_head == i_tail && !ipipe->writers)
break;
/*
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+ if (i_head == i_tail || o_head - o_tail >= opipe->max_usage) {
/* Already processed some buffers, break */
if (ret)
break;
@@ -1575,9 +1602,8 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
goto retry;
}
- ibuf = ipipe->bufs + ipipe->curbuf;
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
- obuf = opipe->bufs + nbuf;
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
if (len >= ibuf->len) {
/*
@@ -1585,10 +1611,12 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
*/
*obuf = *ibuf;
ibuf->ops = NULL;
- opipe->nrbufs++;
- ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
- ipipe->nrbufs--;
+ i_tail++;
+ pipe_commit_read(ipipe, i_tail);
input_wakeup = true;
+ o_len = obuf->len;
+ o_head++;
+ pipe_commit_write(opipe, o_head);
} else {
/*
* Get a reference to this pipe buffer,
@@ -1610,12 +1638,14 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
pipe_buf_mark_unmergeable(obuf);
obuf->len = len;
- opipe->nrbufs++;
- ibuf->offset += obuf->len;
- ibuf->len -= obuf->len;
+ ibuf->offset += len;
+ ibuf->len -= len;
+ o_len = len;
+ o_head++;
+ pipe_commit_write(opipe, o_head);
}
- ret += obuf->len;
- len -= obuf->len;
+ ret += o_len;
+ len -= o_len;
} while (len);
pipe_unlock(ipipe);
@@ -1641,7 +1671,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
- int ret = 0, i = 0, nbuf;
+ unsigned int i_head, o_head;
+ unsigned int i_tail, o_tail;
+ unsigned int i_mask, o_mask;
+ int ret = 0, i = 0;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1650,6 +1683,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
*/
pipe_double_lock(ipipe, opipe);
+ i_tail = ipipe->tail;
+ i_mask = ipipe->ring_size - 1;
+ o_head = opipe->head;
+ o_mask = opipe->ring_size - 1;
+
do {
if (!opipe->readers) {
send_sig(SIGPIPE, current, 0);
@@ -1658,15 +1696,18 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
+ i_head = ipipe->head;
+ o_tail = opipe->tail;
+
/*
- * If we have iterated all input buffers or ran out of
+ * If we have iterated all input buffers or run out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
+ if (i_head != i_tail || o_head - o_tail >= opipe->max_usage)
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+ ibuf = &ipipe->bufs[i_tail & i_mask];
+ obuf = &opipe->bufs[o_head & o_mask];
/*
* Get a reference to this pipe buffer,
@@ -1678,7 +1719,6 @@ static int link_pipe(struct pipe_inode_info *ipipe,
break;
}
- obuf = opipe->bufs + nbuf;
*obuf = *ibuf;
/*
@@ -1691,10 +1731,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
if (obuf->len > len)
obuf->len = len;
-
- opipe->nrbufs++;
ret += obuf->len;
len -= obuf->len;
+
+ o_head++;
+ pipe_commit_write(opipe, o_head);
i++;
} while (len);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5c626fdc10db..ec82c5374410 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -30,9 +30,10 @@ struct pipe_buffer {
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @wait: reader/writer wait point in case of empty/full pipe
- * @nrbufs: the number of non-empty pipe buffers in this pipe
- * @buffers: total number of buffers (should be a power of 2)
- * @curbuf: the current pipe buffer entry
+ * @head: The point of buffer production
+ * @tail: The point of buffer consumption
+ * @max_usage: The maximum number of slots that may be used in the ring
+ * @ring_size: total number of buffers (should be a power of 2)
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
@@ -48,7 +49,10 @@ struct pipe_buffer {
struct pipe_inode_info {
struct mutex mutex;
wait_queue_head_t wait;
- unsigned int nrbufs, curbuf, buffers;
+ unsigned int head;
+ unsigned int tail;
+ unsigned int max_usage;
+ unsigned int ring_size;
unsigned int readers;
unsigned int writers;
unsigned int files;
@@ -104,6 +108,32 @@ struct pipe_buf_operations {
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};
+/**
+ * pipe_commit_read - Set pipe buffer tail pointer in the read-side
+ * @pipe: The pipe in question
+ * @tail: The new tail pointer
+ *
+ * Update the tail pointer in the read-side code after a read has taken place.
+ */
+static inline void pipe_commit_read(struct pipe_inode_info *pipe,
+ unsigned int tail)
+{
+ pipe->tail = tail;
+}
+
+/**
+ * pipe_commit_write - Set pipe buffer head pointer in the write-side
+ * @pipe: The pipe in question
+ * @head: The new head pointer
+ *
+ * Update the head pointer in the write-side code after a write has taken place.
+ */
+static inline void pipe_commit_write(struct pipe_inode_info *pipe,
+ unsigned int head)
+{
+ pipe->head = head;
+}
+
/**
* pipe_buf_get - get a reference to a pipe_buffer
* @pipe: the pipe that the buffer belongs to
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ab5f523bc0df..9576fd8158d7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -45,8 +45,8 @@ struct iov_iter {
union {
unsigned long nr_segs;
struct {
- int idx;
- int start_idx;
+ unsigned int head;
+ unsigned int start_head;
};
};
};
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 639d5e7014c1..c8a046c87e3f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -325,28 +325,33 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
static bool sanity(const struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
- int idx = i->idx;
- int next = pipe->curbuf + pipe->nrbufs;
+ unsigned int p_head = pipe->head;
+ unsigned int p_tail = pipe->tail;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int p_occupancy = p_head - p_tail;
+ unsigned int i_head = i->head;
+ unsigned int idx;
+
if (i->iov_offset) {
struct pipe_buffer *p;
- if (unlikely(!pipe->nrbufs))
+ if (unlikely(p_occupancy == 0))
goto Bad; // pipe must be non-empty
- if (unlikely(idx != ((next - 1) & (pipe->buffers - 1))))
+ if (unlikely(i_head != p_head - 1))
goto Bad; // must be at the last buffer...
- p = &pipe->bufs[idx];
+ p = &pipe->bufs[i_head & p_mask];
if (unlikely(p->offset + p->len != i->iov_offset))
goto Bad; // ... at the end of segment
} else {
- if (idx != (next & (pipe->buffers - 1)))
+ if (i_head != p_head)
goto Bad; // must be right after the last buffer
}
return true;
Bad:
- printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset);
- printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n",
- pipe->curbuf, pipe->nrbufs, pipe->buffers);
- for (idx = 0; idx < pipe->buffers; idx++)
+ printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
+ printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
+ p_head, p_tail, pipe->ring_size);
+ for (idx = 0; idx < pipe->ring_size; idx++)
printk(KERN_ERR "[%p %p %d %d]\n",
pipe->bufs[idx].ops,
pipe->bufs[idx].page,
@@ -359,18 +364,15 @@ static bool sanity(const struct iov_iter *i)
#define sanity(i) true
#endif
-static inline int next_idx(int idx, struct pipe_inode_info *pipe)
-{
- return (idx + 1) & (pipe->buffers - 1);
-}
-
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
+ unsigned int p_tail = pipe->tail;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head = i->head;
size_t off;
- int idx;
if (unlikely(bytes > i->count))
bytes = i->count;
@@ -382,8 +384,7 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
return 0;
off = i->iov_offset;
- idx = i->idx;
- buf = &pipe->bufs[idx];
+ buf = &pipe->bufs[i_head & p_mask];
if (off) {
if (offset == off && buf->page == page) {
/* merge with the last one */
@@ -391,18 +392,21 @@ static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t by
i->iov_offset += bytes;
goto out;
}
- idx = next_idx(idx, pipe);
- buf = &pipe->bufs[idx];
+ i_head++;
+ buf = &pipe->bufs[i_head & p_mask];
}
- if (idx == pipe->curbuf && pipe->nrbufs)
+ if (i_head - p_tail >= pipe->max_usage)
return 0;
- pipe->nrbufs++;
+
buf->ops = &page_cache_pipe_buf_ops;
- get_page(buf->page = page);
+ get_page(page);
+ buf->page = page;
buf->offset = offset;
buf->len = bytes;
+
+ pipe_commit_read(pipe, i_head);
i->iov_offset = offset + bytes;
- i->idx = idx;
+ i->head = i_head;
out:
i->count -= bytes;
return bytes;
@@ -480,24 +484,30 @@ static inline bool allocated(struct pipe_buffer *buf)
return buf->ops == &default_pipe_buf_ops;
}
-static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
+static inline void data_start(const struct iov_iter *i, unsigned int *i_headp,
+ size_t *offp)
{
+ unsigned int p_mask = i->pipe->ring_size - 1;
+ unsigned int i_head = i->head;
size_t off = i->iov_offset;
- int idx = i->idx;
- if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
- idx = next_idx(idx, i->pipe);
+
+ if (off && (!allocated(&i->pipe->bufs[i_head & p_mask]) ||
+ off == PAGE_SIZE)) {
+ i_head++;
off = 0;
}
- *idxp = idx;
+ *i_headp = i_head;
*offp = off;
}
static size_t push_pipe(struct iov_iter *i, size_t size,
- int *idxp, size_t *offp)
+ int *i_headp, size_t *offp)
{
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_tail = pipe->tail;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head;
size_t off;
- int idx;
ssize_t left;
if (unlikely(size > i->count))
@@ -506,33 +516,34 @@ static size_t push_pipe(struct iov_iter *i, size_t size,
return 0;
left = size;
- data_start(i, &idx, &off);
- *idxp = idx;
+ data_start(i, &i_head, &off);
+ *i_headp = i_head;
*offp = off;
if (off) {
left -= PAGE_SIZE - off;
if (left <= 0) {
- pipe->bufs[idx].len += size;
+ pipe->bufs[i_head & p_mask].len += size;
return size;
}
- pipe->bufs[idx].len = PAGE_SIZE;
- idx = next_idx(idx, pipe);
+ pipe->bufs[i_head & p_mask].len = PAGE_SIZE;
+ i_head++;
}
- while (idx != pipe->curbuf || !pipe->nrbufs) {
+ while (i_head <= p_tail + pipe->max_usage) {
+ struct pipe_buffer *buf = &pipe->bufs[i_head & p_mask];
struct page *page = alloc_page(GFP_USER);
if (!page)
break;
- pipe->nrbufs++;
- pipe->bufs[idx].ops = &default_pipe_buf_ops;
- pipe->bufs[idx].page = page;
- pipe->bufs[idx].offset = 0;
- if (left <= PAGE_SIZE) {
- pipe->bufs[idx].len = left;
+
+ buf->ops = &default_pipe_buf_ops;
+ buf->page = page;
+ buf->offset = 0;
+ buf->len = max_t(ssize_t, left, PAGE_SIZE);
+ left -= buf->len;
+ i_head++;
+ pipe_commit_write(pipe, i_head);
+
+ if (left == 0)
return size;
- }
- pipe->bufs[idx].len = PAGE_SIZE;
- left -= PAGE_SIZE;
- idx = next_idx(idx, pipe);
}
return size - left;
}
@@ -541,23 +552,26 @@ static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head;
size_t n, off;
- int idx;
if (!sanity(i))
return 0;
- bytes = n = push_pipe(i, bytes, &idx, &off);
+ bytes = n = push_pipe(i, bytes, &i_head, &off);
if (unlikely(!n))
return 0;
- for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+ do {
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
- memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk);
- i->idx = idx;
+ memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
+ i->head = i_head;
i->iov_offset = off + chunk;
n -= chunk;
addr += chunk;
- }
+ off = 0;
+ i_head++;
+ } while (n);
i->count -= bytes;
return bytes;
}
@@ -573,28 +587,31 @@ static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
__wsum *csum, struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head;
size_t n, r;
size_t off = 0;
__wsum sum = *csum;
- int idx;
if (!sanity(i))
return 0;
- bytes = n = push_pipe(i, bytes, &idx, &r);
+ bytes = n = push_pipe(i, bytes, &i_head, &r);
if (unlikely(!n))
return 0;
- for ( ; n; idx = next_idx(idx, pipe), r = 0) {
+ do {
size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
- char *p = kmap_atomic(pipe->bufs[idx].page);
+ char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
kunmap_atomic(p);
- i->idx = idx;
+ i->head = i_head;
i->iov_offset = r + chunk;
n -= chunk;
off += chunk;
addr += chunk;
- }
+ r = 0;
+ i_head++;
+ } while (n);
i->count -= bytes;
*csum = sum;
return bytes;
@@ -645,29 +662,32 @@ static size_t copy_pipe_to_iter_mcsafe(const void *addr, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head;
size_t n, off, xfer = 0;
- int idx;
if (!sanity(i))
return 0;
- bytes = n = push_pipe(i, bytes, &idx, &off);
+ bytes = n = push_pipe(i, bytes, &i_head, &off);
if (unlikely(!n))
return 0;
- for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+ do {
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
unsigned long rem;
- rem = memcpy_mcsafe_to_page(pipe->bufs[idx].page, off, addr,
- chunk);
- i->idx = idx;
+ rem = memcpy_mcsafe_to_page(pipe->bufs[i_head & p_mask].page,
+ off, addr, chunk);
+ i->head = i_head;
i->iov_offset = off + chunk - rem;
xfer += chunk - rem;
if (rem)
break;
n -= chunk;
addr += chunk;
- }
+ off = 0;
+ i_head++;
+ } while (n);
i->count -= xfer;
return xfer;
}
@@ -925,6 +945,8 @@ EXPORT_SYMBOL(copy_page_from_iter);
static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head;
size_t n, off;
int idx;
@@ -935,13 +957,15 @@ static size_t pipe_zero(size_t bytes, struct iov_iter *i)
if (unlikely(!n))
return 0;
- for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+ do {
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
- memzero_page(pipe->bufs[idx].page, off, chunk);
- i->idx = idx;
+ memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
+ i->head = i_head;
i->iov_offset = off + chunk;
n -= chunk;
- }
+ off = 0;
+ i_head++;
+ } while (n);
i->count -= bytes;
return bytes;
}
@@ -987,20 +1011,26 @@ EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
static inline void pipe_truncate(struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
- if (pipe->nrbufs) {
+ unsigned int p_tail = pipe->tail;
+ unsigned int p_head = pipe->head;
+ unsigned int p_mask = pipe->ring_size - 1;
+
+ if (p_head != p_tail) {
+ struct pipe_buffer *buf;
+ unsigned int i_head = i->head;
size_t off = i->iov_offset;
- int idx = i->idx;
- int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1);
+
if (off) {
- pipe->bufs[idx].len = off - pipe->bufs[idx].offset;
- idx = next_idx(idx, pipe);
- nrbufs++;
+ buf = &pipe->bufs[i_head & p_mask];
+ buf->len = off - buf->offset;
+ i_head++;
}
- while (pipe->nrbufs > nrbufs) {
- pipe_buf_release(pipe, &pipe->bufs[idx]);
- idx = next_idx(idx, pipe);
- pipe->nrbufs--;
+ while (p_head != i_head) {
+ p_head--;
+ pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
}
+
+ pipe_commit_write(pipe, p_head);
}
}
@@ -1011,18 +1041,20 @@ static void pipe_advance(struct iov_iter *i, size_t size)
size = i->count;
if (size) {
struct pipe_buffer *buf;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head = i->head;
size_t off = i->iov_offset, left = size;
- int idx = i->idx;
+
if (off) /* make it relative to the beginning of buffer */
- left += off - pipe->bufs[idx].offset;
+ left += off - pipe->bufs[i_head & p_mask].offset;
while (1) {
- buf = &pipe->bufs[idx];
+ buf = &pipe->bufs[i_head & p_mask];
if (left <= buf->len)
break;
left -= buf->len;
- idx = next_idx(idx, pipe);
+ i_head++;
}
- i->idx = idx;
+ i->head = i_head;
i->iov_offset = buf->offset + left;
}
i->count -= size;
@@ -1053,25 +1085,27 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
i->count += unroll;
if (unlikely(iov_iter_is_pipe(i))) {
struct pipe_inode_info *pipe = i->pipe;
- int idx = i->idx;
+ unsigned int p_mask = pipe->ring_size - 1;
+ unsigned int i_head = i->head;
size_t off = i->iov_offset;
while (1) {
- size_t n = off - pipe->bufs[idx].offset;
+ struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
+ size_t n = off - b->offset;
if (unroll < n) {
off -= unroll;
break;
}
unroll -= n;
- if (!unroll && idx == i->start_idx) {
+ if (!unroll && i_head == i->start_head) {
off = 0;
break;
}
- if (!idx--)
- idx = pipe->buffers - 1;
- off = pipe->bufs[idx].offset + pipe->bufs[idx].len;
+ i_head--;
+ b = &pipe->bufs[i_head & p_mask];
+ off = b->offset + b->len;
}
i->iov_offset = off;
- i->idx = idx;
+ i->head = i_head;
pipe_truncate(i);
return;
}
@@ -1159,13 +1193,13 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
size_t count)
{
BUG_ON(direction != READ);
- WARN_ON(pipe->nrbufs == pipe->buffers);
+ WARN_ON(pipe->head - pipe->tail >= pipe->ring_size);
i->type = ITER_PIPE | READ;
i->pipe = pipe;
- i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ i->head = pipe->head;
i->iov_offset = 0;
i->count = count;
- i->start_idx = i->idx;
+ i->start_head = i->head;
}
EXPORT_SYMBOL(iov_iter_pipe);
@@ -1189,11 +1223,12 @@ EXPORT_SYMBOL(iov_iter_discard);
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
+ unsigned int p_mask = i->pipe->ring_size - 1;
unsigned long res = 0;
size_t size = i->count;
if (unlikely(iov_iter_is_pipe(i))) {
- if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
+ if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
return size | i->iov_offset;
return size;
}
@@ -1231,19 +1266,20 @@ EXPORT_SYMBOL(iov_iter_gap_alignment);
static inline ssize_t __pipe_get_pages(struct iov_iter *i,
size_t maxsize,
struct page **pages,
- int idx,
+ int i_head,
size_t *start)
{
struct pipe_inode_info *pipe = i->pipe;
- ssize_t n = push_pipe(i, maxsize, &idx, start);
+ unsigned int p_mask = pipe->ring_size - 1;
+ ssize_t n = push_pipe(i, maxsize, &i_head, start);
if (!n)
return -EFAULT;
maxsize = n;
n += *start;
while (n > 0) {
- get_page(*pages++ = pipe->bufs[idx].page);
- idx = next_idx(idx, pipe);
+ get_page(*pages++ = pipe->bufs[i_head & p_mask].page);
+ i_head++;
n -= PAGE_SIZE;
}
@@ -1254,9 +1290,10 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
+ unsigned int p_tail;
+ unsigned int i_head;
unsigned npages;
size_t capacity;
- int idx;
if (!maxsize)
return 0;
@@ -1264,12 +1301,15 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
if (!sanity(i))
return -EFAULT;
- data_start(i, &idx, start);
- /* some of this one + all after this one */
- npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
- capacity = min(npages,maxpages) * PAGE_SIZE - *start;
+ data_start(i, &i_head, start);
+ p_tail = i->pipe->tail;
+ /* Amount of free space: some of this one + all after this one */
+ npages = (p_tail + i->pipe->ring_size) - i_head;
+ if (npages > i->pipe->max_usage)
+ npages = i->pipe->max_usage;
+ capacity = min(npages, maxpages) * PAGE_SIZE - *start;
- return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start);
+ return __pipe_get_pages(i, min(maxsize, capacity), pages, i_head, start);
}
ssize_t iov_iter_get_pages(struct iov_iter *i,
@@ -1323,8 +1363,9 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
size_t *start)
{
struct page **p;
+ unsigned int p_tail;
+ unsigned int i_head;
ssize_t n;
- int idx;
int npages;
if (!maxsize)
@@ -1333,9 +1374,11 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
if (!sanity(i))
return -EFAULT;
- data_start(i, &idx, start);
- /* some of this one + all after this one */
- npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+ data_start(i, &i_head, start);
+ /* Amount of free space: some of this one + all after this one */
+ npages = (p_tail + i->pipe->ring_size) - i_head;
+ if (npages > i->pipe->max_usage)
+ npages = i->pipe->max_usage;
n = npages * PAGE_SIZE - *start;
if (maxsize > n)
maxsize = n;
@@ -1344,7 +1387,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
p = get_pages_array(npages);
if (!p)
return -ENOMEM;
- n = __pipe_get_pages(i, maxsize, p, idx, start);
+ n = __pipe_get_pages(i, maxsize, p, i_head, start);
if (n > 0)
*pages = p;
else
@@ -1560,15 +1603,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
if (unlikely(iov_iter_is_pipe(i))) {
struct pipe_inode_info *pipe = i->pipe;
+ unsigned int p_tail = pipe->tail;
+ unsigned int i_head;
size_t off;
- int idx;
if (!sanity(i))
return 0;
- data_start(i, &idx, &off);
+ data_start(i, &i_head, &off);
/* some of this one + all after this one */
- npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+ npages = (p_tail + pipe->ring_size) - i_head;
+ if (npages > i->pipe->max_usage)
+ npages = i->pipe->max_usage;
if (npages >= maxpages)
return maxpages;
} else iterate_all_kinds(i, size, v, ({
^ permalink raw reply related
* [RFC PATCH 02/21] Add a prelocked wake-up
From: David Howells @ 2019-10-15 21:48 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>
Add a wakeup call for a case whereby the caller already has the waitqueue
spinlock held. This can be used by pipes to alter the ring buffer indices
under the spinlock.
Signed-off-by: David Howells <dhowells@redhat.com>
---
include/linux/wait.h | 2 ++
kernel/sched/wait.c | 7 +++++++
2 files changed, 9 insertions(+)
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..d511b298a20c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -229,6 +229,8 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
__wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))
+void prelocked_wake_up_interruptible_sync_poll(struct wait_queue_head *wq_head,
+ __poll_t mask);
#define ___wait_cond_timeout(condition) \
({ \
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..43fbbbe9af27 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -126,6 +126,13 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
+void prelocked_wake_up_interruptible_sync_poll(struct wait_queue_head *wq_head,
+ __poll_t mask)
+{
+ __wake_up_common(wq_head, TASK_INTERRUPTIBLE, 1, WF_SYNC,
+ poll_to_key(mask), NULL);
+}
+
/**
* __wake_up - wake up threads blocked on a waitqueue.
* @wq_head: the waitqueue
^ permalink raw reply related
* [RFC PATCH 01/21] pipe: Reduce #inclusion of pipe_fs_i.h
From: David Howells @ 2019-10-15 21:47 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>
Remove some #inclusions of linux/pipe_fs_i.h that don't seem to be
necessary any more.
Signed-off-by: David Howells <dhowells@redhat.com>
---
fs/exec.c | 1 -
fs/ocfs2/aops.c | 1 -
security/smack/smack_lsm.c | 1 -
3 files changed, 3 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 555e93c7dec8..57bc7ef8d31b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,7 +59,6 @@
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
-#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8de1c9d644f6..c50ac6b7415b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -11,7 +11,6 @@
#include <linux/pagemap.h>
#include <asm/byteorder.h>
#include <linux/swap.h>
-#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index abeb09c30633..ecea41ce919b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -28,7 +28,6 @@
#include <linux/icmpv6.h>
#include <linux/slab.h>
#include <linux/mutex.h>
-#include <linux/pipe_fs_i.h>
#include <net/cipso_ipv4.h>
#include <net/ip.h>
#include <net/ipv6.h>
^ permalink raw reply related
* [RFC PATCH 00/21] pipe: Keyrings, Block and USB notifications
From: David Howells @ 2019-10-15 21:47 UTC (permalink / raw)
To: torvalds
Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
nicolas.dichtel, raven, Christian Brauner, dhowells, keyrings,
linux-usb, linux-block, linux-security-module, linux-fsdevel,
linux-api, linux-security-module, linux-kernel
Here's a set of patches to add a general notification queue concept and to
add event sources such as:
(1) Keys/keyrings, such as linking and unlinking keys and changing their
attributes.
(2) General device events (single common queue) including:
- Block layer events, such as device errors
- USB subsystem events, such as device attach/remove, device reset,
device errors.
I have patches for adding superblock and mount topology watches also,
though those are not in this set as there are other dependencies.
LSM hooks are included:
(1) A set of hooks are provided that allow an LSM to rule on whether or
not a watch may be set. Each of these hooks takes a different
"watched object" parameter, so they're not really shareable. The LSM
should use current's credentials. [Wanted by SELinux & Smack]
(2) A hook is provided to allow an LSM to rule on whether or not a
particular message may be posted to a particular queue. This is given
the credentials from the event generator (which may be the system) and
the watch setter. [Wanted by Smack]
I've provided SELinux and Smack with implementations of some of these hooks.
Design decisions:
(1) The notification queue is built on top of a standard pipe. Messages are
effectively spliced in.
pipe2(fds, O_TMPFILE); // Note that O_TMPFILE is just hacked in atm
which is then configured::
ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, queue_depth);
ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
Messages are then read out of the pipe using read().
(2) It should be possible to allow write() to insert data into the
notification pipes too, but this is currently disabled as the kernel has
to be able to insert messages into the pipe *without* holding pipe->mutex
and the code to make this work needs careful auditing.
(3) sendfile(), splice() and vmsplice() are disabled on notification pipes
because of the pipe->mutex issue and also because they sometimes want to
revert what they just did - but one or more notification messages
might've been interleaved in the ring.
(4) The kernel inserts messages with the wait queue spinlock held. This
means that pipe_read() and pipe_write() have to take the spinlock to
update the queue pointers.
(5) Records in the buffer are binary, typed and have a length so that they
can be of varying size.
This allows multiple heterogeneous sources to share a common buffer;
there are 16 million types available, of which I've used just a few,
so there is scope for others to be used. Tags may be specified when a
watchpoint is created to help distinguish the sources.
(6) Records are filterable as types have up to 256 subtypes that can be
individually filtered. Other filtration is also available.
(7) Notification pipes don't interfere with each other; each may be bound to
a different set of watches. Any particular notification will be copied
to all the queues that are currently watching for it - and only those
that are watching for it.
(8) When recording a notification, the kernel will not sleep, but will rather
mark a queue as having lost a message if there's insufficient space.
read() will fabricate a loss notification message at an appropriate point
later.
(9) The notification pipe is created and then watchpoints are attached to it,
using one of:
keyctl_watch_key(KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
watch_devices(fds[1], 0x02, 0);
where in both cases, fd indicates the queue and the number after is a
tag between 0 and 255.
(10) Watches are removed if either the notification pipe is destroyed or the
watched object is destroyed. In the latter case, a message will be
generated indicating the enforced watch removal.
Things I want to avoid:
(1) Introducing features that make the core VFS dependent on the network
stack or networking namespaces (ie. usage of netlink).
(2) Dumping all this stuff into dmesg and having a daemon that sits there
parsing the output and distributing it as this then puts the
responsibility for security into userspace and makes handling
namespaces tricky. Further, dmesg might not exist or might be
inaccessible inside a container.
(3) Letting users see events they shouldn't be able to see.
Benchmarks:
PATCHES BENCHMARK BEST TOTAL BYTES AVG BYTES STDDEV
======= =============== =============== =============== =============== ===============
0 pipe 305798752 36324188187 302701568 6128384
0 vmsplice 432261032 49377769035 411481408 60638553
0 splice 284943309 27769149338 231409577 171374033
prelock pipe 307500893 36470120589 303917671 9970065
prelock vmsplice 437991897 51502859114 429190492 22841773
prelock splice 285469128 28158931451 234657762 138350254
ht pipe 304526362 35934813852 299456782 14371537
ht vmsplice 437819690 51481865190 429015543 25713854
ht splice 242954439 23296773094 194139775 148906490
r pipe 306166161 36117069156 300975576 11493141
r vmsplice 335931911 39837793527 331981612 9098195
r splice 282334654 27472565449 228938045 219399557
rx pipe 304664843 36059231110 300493592 9800700
rx vmsplice 446602247 52216984680 435141539 31456017
rx splice 277978890 26516522263 220971018 199170870
rxw pipe 304180415 36317294286 302644119 5967557
rxw vmsplice 447361082 51238998643 426991655 62191270
rxw splice 277213923 27627387718 230228230 173675370
rxwx pipe 305390974 36110185475 300918212 11895583
rxwx vmsplice 445372781 50668744616 422239538 63133543
rxwx splice 280218603 27677264384 230643869 196958675
rxwxf pipe 305001592 36325565843 302713048 7551300
rxwxf vmsplice 444316544 50213093070 418442442 77117994
rxwxf splice 278371338 28859765396 240498044 160245812
all pipe 298858861 35543407781 296195064 5173617
all vmsplice 453414388 53792295991 448269133 10914547
all splice 279264055 26812726990 223439391 212421416
The patches column indicates the point in the patchset at which the benchmarks
were taken:
0 No patches
prelock "Add a prelocked wake-up"
hr "pipe: Use head and tail pointers for the ring, not cursor and length"
r "pipe: Advance tail pointer inside of wait spinlock in pipe_read()"
rx "pipe: Conditionalise wakeup in pipe_read()"
rxw "pipe: Rearrange sequence in pipe_write() to preallocate slot"
rxwx "pipe: Remove redundant wakeup from pipe_write()"
rxwxf "pipe: Check for ring full inside of the spinlock in pipe_write()"
all All of the patches
The benchmark programs can be found here:
http://people.redhat.com/dhowells/pipe-bench.c
http://people.redhat.com/dhowells/vmsplice-bench.c
http://people.redhat.com/dhowells/splice-bench.c
The patches can be found here also:
http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=pipe-experimental
Changes:
ver #1:
(*) Build on top of standard pipes instead of having a driver.
David
---
David Howells (21):
pipe: Reduce #inclusion of pipe_fs_i.h
Add a prelocked wake-up
pipe: Use head and tail pointers for the ring, not cursor and length
pipe: Advance tail pointer inside of wait spinlock in pipe_read()
pipe: Conditionalise wakeup in pipe_read()
pipe: Rearrange sequence in pipe_write() to preallocate slot
pipe: Remove redundant wakeup from pipe_write()
pipe: Check for ring full inside of the spinlock in pipe_write()
uapi: General notification queue definitions
security: Add hooks to rule on setting a watch
security: Add a hook for the point of notification insertion
pipe: Add general notification queue support
keys: Add a notification facility
Add sample notification program
pipe: Allow buffers to be marked read-whole-or-error for notifications
pipe: Add notification lossage handling
Add a general, global device notification watch list
block: Add block layer notifications
usb: Add USB subsystem notifications
selinux: Implement the watch_key security hook
smack: Implement the watch_key and post_notification hooks
Documentation/ioctl/ioctl-number.rst | 1
Documentation/security/keys/core.rst | 58 ++
Documentation/watch_queue.rst | 385 ++++++++++++++++
arch/alpha/kernel/syscalls/syscall.tbl | 1
arch/arm/tools/syscall.tbl | 1
arch/arm64/include/asm/unistd.h | 2
arch/arm64/include/asm/unistd32.h | 2
arch/ia64/kernel/syscalls/syscall.tbl | 1
arch/m68k/kernel/syscalls/syscall.tbl | 1
arch/microblaze/kernel/syscalls/syscall.tbl | 1
arch/mips/kernel/syscalls/syscall_n32.tbl | 1
arch/mips/kernel/syscalls/syscall_n64.tbl | 1
arch/mips/kernel/syscalls/syscall_o32.tbl | 1
arch/parisc/kernel/syscalls/syscall.tbl | 1
arch/powerpc/kernel/syscalls/syscall.tbl | 1
arch/s390/kernel/syscalls/syscall.tbl | 1
arch/sh/kernel/syscalls/syscall.tbl | 1
arch/sparc/kernel/syscalls/syscall.tbl | 1
arch/x86/entry/syscalls/syscall_32.tbl | 1
arch/x86/entry/syscalls/syscall_64.tbl | 1
arch/xtensa/kernel/syscalls/syscall.tbl | 1
block/Kconfig | 9
block/blk-core.c | 29 +
drivers/base/Kconfig | 9
drivers/base/Makefile | 1
drivers/base/watch.c | 90 ++++
drivers/usb/core/Kconfig | 9
drivers/usb/core/devio.c | 47 ++
drivers/usb/core/hub.c | 4
fs/exec.c | 1
fs/fuse/dev.c | 31 +
fs/ocfs2/aops.c | 1
fs/pipe.c | 392 +++++++++++-----
fs/splice.c | 195 +++++---
include/linux/blkdev.h | 15 +
include/linux/device.h | 7
include/linux/key.h | 3
include/linux/lsm_audit.h | 1
include/linux/lsm_hooks.h | 38 ++
include/linux/pipe_fs_i.h | 63 ++-
include/linux/security.h | 32 +
include/linux/syscalls.h | 1
include/linux/uio.h | 4
include/linux/usb.h | 18 +
include/linux/wait.h | 2
include/linux/watch_queue.h | 127 +++++
include/uapi/asm-generic/unistd.h | 4
include/uapi/linux/keyctl.h | 2
include/uapi/linux/watch_queue.h | 155 ++++++
init/Kconfig | 12
kernel/Makefile | 1
kernel/sched/wait.c | 7
kernel/sys_ni.c | 1
kernel/watch_queue.c | 660 +++++++++++++++++++++++++++
lib/iov_iter.c | 274 +++++++----
samples/Kconfig | 7
samples/Makefile | 1
samples/watch_queue/Makefile | 7
samples/watch_queue/watch_test.c | 252 ++++++++++
security/keys/Kconfig | 9
security/keys/compat.c | 3
security/keys/gc.c | 5
security/keys/internal.h | 30 +
security/keys/key.c | 38 +-
security/keys/keyctl.c | 99 ++++
security/keys/keyring.c | 20 +
security/keys/request_key.c | 4
security/security.c | 23 +
security/selinux/hooks.c | 14 +
security/smack/smack_lsm.c | 83 +++
70 files changed, 2927 insertions(+), 377 deletions(-)
create mode 100644 Documentation/watch_queue.rst
create mode 100644 drivers/base/watch.c
create mode 100644 include/linux/watch_queue.h
create mode 100644 include/uapi/linux/watch_queue.h
create mode 100644 kernel/watch_queue.c
create mode 100644 samples/watch_queue/Makefile
create mode 100644 samples/watch_queue/watch_test.c
^ permalink raw reply
* Re: [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Stephen Smalley @ 2019-10-15 14:35 UTC (permalink / raw)
To: Joel Fernandes (Google), linux-kernel
Cc: Peter Zijlstra, rostedt, primiano, rsavitski, jeffv, kernel-team,
James Morris, Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
Daniel Borkmann, Ingo Molnar, Jiri Olsa, Kees Cook,
linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
Yonghong Song
In-Reply-To: <20191014170308.70668-1-joel@joelfernandes.org>
On 10/14/19 1:03 PM, Joel Fernandes (Google) wrote:
> In current mainline, the degree of access to perf_event_open(2) system
> call depends on the perf_event_paranoid sysctl. This has a number of
> limitations:
>
> 1. The sysctl is only a single value. Many types of accesses are controlled
> based on the single value thus making the control very limited and
> coarse grained.
> 2. The sysctl is global, so if the sysctl is changed, then that means
> all processes get access to perf_event_open(2) opening the door to
> security issues.
>
> This patch adds LSM and SELinux access checking which will be used in
> Android to access perf_event_open(2) for the purposes of attaching BPF
> programs to tracepoints, perf profiling and other operations from
> userspace. These operations are intended for production systems.
>
> 5 new LSM hooks are added:
> 1. perf_event_open: This controls access during the perf_event_open(2)
> syscall itself. The hook is called from all the places that the
> perf_event_paranoid sysctl is checked to keep it consistent with the
> systctl. The hook gets passed a 'type' argument which controls CPU,
> kernel and tracepoint accesses (in this context, CPU, kernel and
> tracepoint have the same semantics as the perf_event_paranoid sysctl).
> Additionally, I added an 'open' type which is similar to
> perf_event_paranoid sysctl == 3 patch carried in Android and several other
> distros but was rejected in mainline [1] in 2016.
>
> 2. perf_event_alloc: This allocates a new security object for the event
> which stores the current SID within the event. It will be useful when
> the perf event's FD is passed through IPC to another process which may
> try to read the FD. Appropriate security checks will limit access.
>
> 3. perf_event_free: Called when the event is closed.
>
> 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
>
> 5. perf_event_write: Called from the ioctl(2) syscalls for the event.
>
> [1] https://lwn.net/Articles/696240/
>
> Since Peter had suggest LSM hooks in 2016 [1], I am adding his
> Suggested-by tag below.
>
> To use this patch, we set the perf_event_paranoid sysctl to -1 and then
> apply selinux checking as appropriate (default deny everything, and then
> add policy rules to give access to domains that need it). In the future
> we can remove the perf_event_paranoid sysctl altogether.
>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: rostedt@goodmis.org
> Cc: primiano@google.com
> Cc: rsavitski@google.com
> Cc: jeffv@google.com
> Cc: kernel-team@android.com
> Acked-by: James Morris <jmorris@namei.org>
> Co-developed-by: Peter Zijlstra <peterz@infradead.org>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
> ---
>
> Changes since v1:
> o Fixes from Peter Ziljstra.
> o Added Ack from James Morris and Co-developed-by tag for Peter.
> Changes since RFC:
> o Small nits, style changes (James Morris).
> o Consolidation of code (Peter Zijlstra).
>
>
> arch/x86/events/intel/bts.c | 8 ++--
> arch/x86/events/intel/core.c | 5 ++-
> arch/x86/events/intel/p4.c | 5 ++-
> include/linux/lsm_hooks.h | 15 +++++++
> include/linux/perf_event.h | 28 +++++++++---
> include/linux/security.h | 39 +++++++++++++++-
> include/uapi/linux/perf_event.h | 9 ++++
> kernel/events/core.c | 57 +++++++++++++++++++-----
> kernel/trace/trace_event_perf.c | 15 ++++---
> security/security.c | 27 +++++++++++
> security/selinux/hooks.c | 69 +++++++++++++++++++++++++++++
> security/selinux/include/classmap.h | 2 +
> security/selinux/include/objsec.h | 6 ++-
> 13 files changed, 255 insertions(+), 30 deletions(-)
>
> diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
> index 5ee3fed881d3..38de4a7f6752 100644
> --- a/arch/x86/events/intel/bts.c
> +++ b/arch/x86/events/intel/bts.c
> @@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
> * Note that the default paranoia setting permits unprivileged
> * users to profile the kernel.
> */
> - if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
> - !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + if (event->attr.exclude_kernel) {
> + ret = perf_allow_kernel(&event->attr);
> + if (ret)
> + return ret;
> + }
>
> if (x86_add_exclusive(x86_lbr_exclusive_bts))
> return -EBUSY;
> diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
> index 27ee47a7be66..32967a9e9962 100644
> --- a/arch/x86/events/intel/core.c
> +++ b/arch/x86/events/intel/core.c
> @@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
> if (x86_pmu.version < 3)
> return -EINVAL;
>
> - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + ret = perf_allow_cpu(&event->attr);
> + if (ret)
> + return ret;
>
> event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
>
> diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
> index dee579efb2b2..a4cc66005ce8 100644
> --- a/arch/x86/events/intel/p4.c
> +++ b/arch/x86/events/intel/p4.c
> @@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event)
> * the user needs special permissions to be able to use it
> */
> if (p4_ht_active() && p4_event_bind_map[v].shared) {
> - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + v = perf_allow_cpu(&event->attr);
> + if (v)
> + return v;
> }
>
> /* ESCR EventMask bits may be invalid */
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index a3763247547c..20d8cf194fb7 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -1818,6 +1818,14 @@ union security_list_options {
> void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
> #endif /* CONFIG_BPF_SYSCALL */
> int (*locked_down)(enum lockdown_reason what);
> +#ifdef CONFIG_PERF_EVENTS
> + int (*perf_event_open)(struct perf_event_attr *attr, int type);
> + int (*perf_event_alloc)(struct perf_event *event);
> + void (*perf_event_free)(struct perf_event *event);
> + int (*perf_event_read)(struct perf_event *event);
> + int (*perf_event_write)(struct perf_event *event);
> +
> +#endif
> };
>
> struct security_hook_heads {
> @@ -2060,6 +2068,13 @@ struct security_hook_heads {
> struct hlist_head bpf_prog_free_security;
> #endif /* CONFIG_BPF_SYSCALL */
> struct hlist_head locked_down;
> +#ifdef CONFIG_PERF_EVENTS
> + struct hlist_head perf_event_open;
> + struct hlist_head perf_event_alloc;
> + struct hlist_head perf_event_free;
> + struct hlist_head perf_event_read;
> + struct hlist_head perf_event_write;
> +#endif
> } __randomize_layout;
>
> /*
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 61448c19a132..664bb7f99c46 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -56,6 +56,7 @@ struct perf_guest_info_callbacks {
> #include <linux/perf_regs.h>
> #include <linux/cgroup.h>
> #include <linux/refcount.h>
> +#include <linux/security.h>
> #include <asm/local.h>
>
> struct perf_callchain_entry {
> @@ -721,6 +722,9 @@ struct perf_event {
> struct perf_cgroup *cgrp; /* cgroup event is attach to */
> #endif
>
> +#ifdef CONFIG_SECURITY
> + void *security;
> +#endif
> struct list_head sb_list;
> #endif /* CONFIG_PERF_EVENTS */
> };
> @@ -1241,19 +1245,33 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
> int perf_event_max_stack_handler(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos);
>
> -static inline bool perf_paranoid_tracepoint_raw(void)
> +static inline int perf_is_paranoid(void)
> {
> return sysctl_perf_event_paranoid > -1;
> }
>
> -static inline bool perf_paranoid_cpu(void)
> +static inline int perf_allow_kernel(struct perf_event_attr *attr)
> {
> - return sysctl_perf_event_paranoid > 0;
> + if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
> + return -EACCES;
> +
> + return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
> }
>
> -static inline bool perf_paranoid_kernel(void)
> +static inline int perf_allow_cpu(struct perf_event_attr *attr)
> {
> - return sysctl_perf_event_paranoid > 1;
> + if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
> + return -EACCES;
> +
> + return security_perf_event_open(attr, PERF_SECURITY_CPU);
> +}
> +
> +static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
> +{
> + if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
> + return -EPERM;
> +
> + return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
> }
>
> extern void perf_event_init(void);
> diff --git a/include/linux/security.h b/include/linux/security.h
> index a8d59d612d27..273e11c66ed7 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -1894,5 +1894,42 @@ static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
> #endif /* CONFIG_SECURITY */
> #endif /* CONFIG_BPF_SYSCALL */
>
> -#endif /* ! __LINUX_SECURITY_H */
> +#ifdef CONFIG_PERF_EVENTS
> +struct perf_event_attr;
> +
> +#ifdef CONFIG_SECURITY
> +extern int security_perf_event_open(struct perf_event_attr *attr, int type);
> +extern int security_perf_event_alloc(struct perf_event *event);
> +extern void security_perf_event_free(struct perf_event *event);
> +extern int security_perf_event_read(struct perf_event *event);
> +extern int security_perf_event_write(struct perf_event *event);
> +#else
> +static inline int security_perf_event_open(struct perf_event_attr *attr,
> + int type)
> +{
> + return 0;
> +}
>
> +static inline int security_perf_event_alloc(struct perf_event *event)
> +{
> + return 0;
> +}
> +
> +static inline void security_perf_event_free(struct perf_event *event)
> +{
> + return 0;
> +}
> +
> +static inline int security_perf_event_read(struct perf_event *event)
> +{
> + return 0;
> +}
> +
> +static inline int security_perf_event_write(struct perf_event *event)
> +{
> + return 0;
> +}
> +#endif /* CONFIG_SECURITY */
> +#endif /* CONFIG_PERF_EVENTS */
> +
> +#endif /* ! __LINUX_SECURITY_H */
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index bb7b271397a6..2af95f937a5b 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -427,6 +427,15 @@ struct perf_event_attr {
> __u16 __reserved_2; /* align to __u64 */
> };
>
> +
> +/* Access to perf_event_open(2) syscall. */
> +#define PERF_SECURITY_OPEN 0
> +
> +/* Finer grained perf_event_open(2) access control. */
> +#define PERF_SECURITY_CPU 1
> +#define PERF_SECURITY_KERNEL 2
> +#define PERF_SECURITY_TRACEPOINT 3
> +
Why are these definitions part of the uapi header and not private to the
kernel? You map them to SELinux permissions in the kernel, and the
SELinux permissions are mapped to policy values at policy load time (so
the values need not be identical). We also have mechanisms in SELinux
to permit evolution of the permissions over time in a
backward-compatible manner.
> /*
> * Structure used by below PERF_EVENT_IOC_QUERY_BPF command
> * to query bpf programs attached to the same perf tracepoint
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 4655adbbae10..8ad597fae5f4 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4217,8 +4217,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
>
> if (!task) {
> /* Must be root to operate on a CPU event: */
> - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
> - return ERR_PTR(-EACCES);
> + err = perf_allow_cpu(&event->attr);
> + if (err)
> + return ERR_PTR(err);
>
> cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
> ctx = &cpuctx->ctx;
> @@ -4527,6 +4528,8 @@ static void _free_event(struct perf_event *event)
>
> unaccount_event(event);
>
> + security_perf_event_free(event);
> +
> if (event->rb) {
> /*
> * Can happen when we close an event with re-directed output.
> @@ -4980,6 +4983,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
> struct perf_event_context *ctx;
> int ret;
>
> + ret = security_perf_event_read(event);
> + if (ret)
> + return ret;
> +
> ctx = perf_event_ctx_lock(event);
> ret = __perf_read(event, buf, count);
> perf_event_ctx_unlock(event, ctx);
> @@ -5244,6 +5251,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
> struct perf_event_context *ctx;
> long ret;
>
> + /* Treat ioctl like writes as it is likely a mutating operation. */
> + ret = security_perf_event_write(event);
> + if (ret)
> + return ret;
> +
> ctx = perf_event_ctx_lock(event);
> ret = _perf_ioctl(event, cmd, arg);
> perf_event_ctx_unlock(event, ctx);
> @@ -5706,6 +5718,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> if (!(vma->vm_flags & VM_SHARED))
> return -EINVAL;
>
> + ret = security_perf_event_read(event);
> + if (ret)
> + return ret;
> +
> vma_size = vma->vm_end - vma->vm_start;
>
> if (vma->vm_pgoff == 0) {
> @@ -5819,7 +5835,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
> lock_limit >>= PAGE_SHIFT;
> locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
>
> - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
> + if ((locked > lock_limit) && perf_is_paranoid() &&
> !capable(CAP_IPC_LOCK)) {
> ret = -EPERM;
> goto unlock;
> @@ -10553,11 +10569,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> }
> }
>
> + err = security_perf_event_alloc(event);
> + if (err)
> + goto err_callchain_buffer;
> +
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> return event;
>
> +err_callchain_buffer:
> + if (!event->parent) {
> + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> + put_callchain_buffers();
> + }
> err_addr_filters:
> kfree(event->addr_filter_ranges);
>
> @@ -10675,9 +10700,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
> attr->branch_sample_type = mask;
> }
> /* privileged levels capture (kernel, hv): check permissions */
> - if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
> - && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
> + ret = perf_allow_kernel(attr);
> + if (ret)
> + return ret;
> + }
> }
>
> if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
> @@ -10890,13 +10917,19 @@ SYSCALL_DEFINE5(perf_event_open,
> if (flags & ~PERF_FLAG_ALL)
> return -EINVAL;
>
> + /* Do we allow access to perf_event_open(2) ? */
> + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
> + if (err)
> + return err;
> +
> err = perf_copy_attr(attr_uptr, &attr);
> if (err)
> return err;
>
> if (!attr.exclude_kernel) {
> - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + err = perf_allow_kernel(&attr);
> + if (err)
> + return err;
> }
>
> if (attr.namespaces) {
> @@ -10913,9 +10946,11 @@ SYSCALL_DEFINE5(perf_event_open,
> }
>
> /* Only privileged users can get physical addresses */
> - if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
> - perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
> - return -EACCES;
> + if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
> + err = perf_allow_kernel(&attr);
> + if (err)
> + return err;
> + }
>
> err = security_locked_down(LOCKDOWN_PERF);
> if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
> diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
> index 0892e38ed6fb..0917fee6ee7c 100644
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -8,6 +8,7 @@
>
> #include <linux/module.h>
> #include <linux/kprobes.h>
> +#include <linux/security.h>
> #include "trace.h"
> #include "trace_probe.h"
>
> @@ -26,8 +27,10 @@ static int total_ref_count;
> static int perf_trace_event_perm(struct trace_event_call *tp_event,
> struct perf_event *p_event)
> {
> + int ret;
> +
> if (tp_event->perf_perm) {
> - int ret = tp_event->perf_perm(tp_event, p_event);
> + ret = tp_event->perf_perm(tp_event, p_event);
> if (ret)
> return ret;
> }
> @@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
>
> /* The ftrace function trace is allowed only for root. */
> if (ftrace_event_is_function(tp_event)) {
> - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
> - return -EPERM;
> + ret = perf_allow_tracepoint(&p_event->attr);
> + if (ret)
> + return ret;
>
> if (!is_sampling_event(p_event))
> return 0;
> @@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
> * ...otherwise raw tracepoint data can be a severe data leak,
> * only allow root to have these.
> */
> - if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
> - return -EPERM;
> + ret = perf_allow_tracepoint(&p_event->attr);
> + if (ret)
> + return ret;
>
> return 0;
> }
> diff --git a/security/security.c b/security/security.c
> index 1bc000f834e2..cd2d18d2d279 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -2404,3 +2404,30 @@ int security_locked_down(enum lockdown_reason what)
> return call_int_hook(locked_down, 0, what);
> }
> EXPORT_SYMBOL(security_locked_down);
> +
> +#ifdef CONFIG_PERF_EVENTS
> +int security_perf_event_open(struct perf_event_attr *attr, int type)
> +{
> + return call_int_hook(perf_event_open, 0, attr, type);
> +}
> +
> +int security_perf_event_alloc(struct perf_event *event)
> +{
> + return call_int_hook(perf_event_alloc, 0, event);
> +}
> +
> +void security_perf_event_free(struct perf_event *event)
> +{
> + call_void_hook(perf_event_free, event);
> +}
> +
> +int security_perf_event_read(struct perf_event *event)
> +{
> + return call_int_hook(perf_event_read, 0, event);
> +}
> +
> +int security_perf_event_write(struct perf_event *event)
> +{
> + return call_int_hook(perf_event_write, 0, event);
> +}
> +#endif /* CONFIG_PERF_EVENTS */
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 9625b99e677f..28eb05490d59 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -6795,6 +6795,67 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
> .lbs_msg_msg = sizeof(struct msg_security_struct),
> };
>
> +#ifdef CONFIG_PERF_EVENTS
> +static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
> +{
> + u32 requested, sid = current_sid();
> +
> + if (type == PERF_SECURITY_OPEN)
> + requested = PERF_EVENT__OPEN;
> + else if (type == PERF_SECURITY_CPU)
> + requested = PERF_EVENT__CPU;
> + else if (type == PERF_SECURITY_KERNEL)
> + requested = PERF_EVENT__KERNEL;
> + else if (type == PERF_SECURITY_TRACEPOINT)
> + requested = PERF_EVENT__TRACEPOINT;
> + else
> + return -EINVAL;
> +
> + return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
> + requested, NULL);
> +}
> +
> +static int selinux_perf_event_alloc(struct perf_event *event)
> +{
> + struct perf_event_security_struct *perfsec;
> +
> + perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
> + if (!perfsec)
> + return -ENOMEM;
> +
> + perfsec->sid = current_sid();
> + event->security = perfsec;
> +
> + return 0;
> +}
> +
> +static void selinux_perf_event_free(struct perf_event *event)
> +{
> + struct perf_event_security_struct *perfsec = event->security;
> +
> + event->security = NULL;
> + kfree(perfsec);
> +}
> +
> +static int selinux_perf_event_read(struct perf_event *event)
> +{
> + struct perf_event_security_struct *perfsec = event->security;
> + u32 sid = current_sid();
> +
> + return avc_has_perm(&selinux_state, sid, perfsec->sid,
> + SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
> +}
> +
> +static int selinux_perf_event_write(struct perf_event *event)
> +{
> + struct perf_event_security_struct *perfsec = event->security;
> + u32 sid = current_sid();
> +
> + return avc_has_perm(&selinux_state, sid, perfsec->sid,
> + SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
> +}
> +#endif
> +
> static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
> LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
> LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
> @@ -7030,6 +7091,14 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
> LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
> LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
> #endif
> +
> +#ifdef CONFIG_PERF_EVENTS
> + LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
> + LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
> + LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
> + LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
> + LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
> +#endif
> };
>
> static __init int selinux_init(void)
> diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
> index 32e9b03be3dd..7db24855e12d 100644
> --- a/security/selinux/include/classmap.h
> +++ b/security/selinux/include/classmap.h
> @@ -244,6 +244,8 @@ struct security_class_mapping secclass_map[] = {
> {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
> { "xdp_socket",
> { COMMON_SOCK_PERMS, NULL } },
> + { "perf_event",
> + {"open", "cpu", "kernel", "tracepoint", "read", "write"} },
> { NULL }
> };
>
> diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
> index 586b7abd0aa7..a4a86cbcfb0a 100644
> --- a/security/selinux/include/objsec.h
> +++ b/security/selinux/include/objsec.h
> @@ -141,7 +141,11 @@ struct pkey_security_struct {
> };
>
> struct bpf_security_struct {
> - u32 sid; /*SID of bpf obj creater*/
> + u32 sid; /* SID of bpf obj creator */
> +};
> +
> +struct perf_event_security_struct {
> + u32 sid; /* SID of perf_event obj creator */
> };
>
> extern struct lsm_blob_sizes selinux_blob_sizes;
>
^ permalink raw reply
* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Sumit Garg @ 2019-10-15 8:32 UTC (permalink / raw)
To: Jarkko Sakkinen
Cc: dhowells, peterhuewe, keyrings, linux-integrity,
open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
linux-security-module, Herbert Xu, davem, jgg, Arnd Bergmann,
Greg Kroah-Hartman, jejb, Mimi Zohar, James Morris,
Serge E. Hallyn, Linux Kernel Mailing List, Daniel Thompson
In-Reply-To: <20191014201604.GN15552@linux.intel.com>
On Tue, 15 Oct 2019 at 01:46, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Fri, Oct 11, 2019 at 02:05:17PM -0700, Jerry Snitselaar wrote:
> > On Fri Oct 11 19, Jarkko Sakkinen wrote:
> > > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > > This patch-set does restructuring of trusted keys code to create and
> > > > consolidate trusted keys subsystem.
> > > >
> > > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > > >
> > > > Changes in v7:
> > > > 1. Rebased to top of tpmdd/master
> > > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > > > tpm_transmit_cmd() which is an internal function.
> > > >
> > > > Changes in v6:
> > > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > > > changes required patches #1 and #2 update, so I have dropped review
> > > > tags from those patches.
> > > > 2. Incorporated miscellaneous comments from Jarkko.
> > > >
> > > > Changes in v5:
> > > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > > 2. Add Reviewed-by tag for patch #2.
> > > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > > > "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > > 4. Misc changes to rename files.
> > > >
> > > > Changes in v4:
> > > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > > 3. Keep module name as trusted.ko only
> > > >
> > > > Changes in v3:
> > > >
> > > > Move TPM2 trusted keys code to trusted keys subsystem.
> > > >
> > > > Changes in v2:
> > > >
> > > > Split trusted keys abstraction patch for ease of review.
> > > >
> > > > Sumit Garg (4):
> > > > tpm: Move tpm_buf code to include/linux/
> > > > KEYS: Use common tpm_buf for trusted and asymmetric keys
> > > > KEYS: trusted: Create trusted keys subsystem
> > > > KEYS: trusted: Move TPM2 trusted keys code
> > > >
> > > > crypto/asymmetric_keys/asym_tpm.c | 101 +++----
> > > > drivers/char/tpm/tpm-interface.c | 56 ----
> > > > drivers/char/tpm/tpm.h | 226 ---------------
> > > > drivers/char/tpm/tpm2-cmd.c | 307 --------------------
> > > > include/Kbuild | 1 -
> > > > include/keys/{trusted.h => trusted_tpm.h} | 49 +---
> > > > include/linux/tpm.h | 251 ++++++++++++++--
> > > > security/keys/Makefile | 2 +-
> > > > security/keys/trusted-keys/Makefile | 8 +
> > > > .../{trusted.c => trusted-keys/trusted_tpm1.c} | 96 +++----
> > > > security/keys/trusted-keys/trusted_tpm2.c | 314 +++++++++++++++++++++
> > > > 11 files changed, 652 insertions(+), 759 deletions(-)
> > > > rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > > > create mode 100644 security/keys/trusted-keys/Makefile
> > > > rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > > > create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > > >
> > > > --
> > > > 2.7.4
> > > >
> > >
> > > I fixed a merge conflict caused by James' commit. Already pushed.
> > > Compiling test kernel ATM i.e. tested-by's will follow later.
> > >
> > > /Jarkko
> >
> > Are you missing patch 4 on master?
>
> Already removed the patch set given the sparse issues.
The sparse issues weren't due to this patch-set but they already
existed in "security/keys/trusted.c" and this patch-set only did a
rename for that file. So I think we should have a separate patch to
fix sparse issues.
-Sumit
> Read this email
> after doing that. Thanks anyway for pointing that out.
>
> /Jarkko
^ permalink raw reply
* Re: [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Peter Zijlstra @ 2019-10-15 8:30 UTC (permalink / raw)
To: Joel Fernandes (Google)
Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
James Morris, Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
Daniel Borkmann, Ingo Molnar, Jiri Olsa, Kees Cook,
linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
Yonghong Song
In-Reply-To: <20191014170308.70668-1-joel@joelfernandes.org>
On Mon, Oct 14, 2019 at 01:03:08PM -0400, Joel Fernandes (Google) wrote:
> In current mainline, the degree of access to perf_event_open(2) system
> call depends on the perf_event_paranoid sysctl. This has a number of
> limitations:
>
> 1. The sysctl is only a single value. Many types of accesses are controlled
> based on the single value thus making the control very limited and
> coarse grained.
> 2. The sysctl is global, so if the sysctl is changed, then that means
> all processes get access to perf_event_open(2) opening the door to
> security issues.
>
> This patch adds LSM and SELinux access checking which will be used in
> Android to access perf_event_open(2) for the purposes of attaching BPF
> programs to tracepoints, perf profiling and other operations from
> userspace. These operations are intended for production systems.
>
> 5 new LSM hooks are added:
> 1. perf_event_open: This controls access during the perf_event_open(2)
> syscall itself. The hook is called from all the places that the
> perf_event_paranoid sysctl is checked to keep it consistent with the
> systctl. The hook gets passed a 'type' argument which controls CPU,
> kernel and tracepoint accesses (in this context, CPU, kernel and
> tracepoint have the same semantics as the perf_event_paranoid sysctl).
> Additionally, I added an 'open' type which is similar to
> perf_event_paranoid sysctl == 3 patch carried in Android and several other
> distros but was rejected in mainline [1] in 2016.
>
> 2. perf_event_alloc: This allocates a new security object for the event
> which stores the current SID within the event. It will be useful when
> the perf event's FD is passed through IPC to another process which may
> try to read the FD. Appropriate security checks will limit access.
>
> 3. perf_event_free: Called when the event is closed.
>
> 4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
>
> 5. perf_event_write: Called from the ioctl(2) syscalls for the event.
>
> [1] https://lwn.net/Articles/696240/
>
> Since Peter had suggest LSM hooks in 2016 [1], I am adding his
> Suggested-by tag below.
Thanks, I've queued the patch!
> To use this patch, we set the perf_event_paranoid sysctl to -1 and then
> apply selinux checking as appropriate (default deny everything, and then
> add policy rules to give access to domains that need it). In the future
> we can remove the perf_event_paranoid sysctl altogether.
This I'm not sure about; the sysctl is only redundant when you actually
use a security thingy, not everyone is. I always find them things to be
mightily unfriendly.
^ permalink raw reply
* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Sumit Garg @ 2019-10-15 8:28 UTC (permalink / raw)
To: Jarkko Sakkinen
Cc: dhowells, peterhuewe, keyrings, linux-integrity,
open list:HARDWARE RANDOM NUMBER GENERATOR CORE,
linux-security-module, Herbert Xu, davem, jgg, Arnd Bergmann,
Greg Kroah-Hartman, jejb, Mimi Zohar, James Morris,
Serge E. Hallyn, Jerry Snitselaar, Linux Kernel Mailing List,
Daniel Thompson
In-Reply-To: <20191014193350.GG15552@linux.intel.com>
On Tue, 15 Oct 2019 at 01:04, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> On Fri, Oct 11, 2019 at 03:37:57PM +0300, Jarkko Sakkinen wrote:
> > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > This patch-set does restructuring of trusted keys code to create and
> > > consolidate trusted keys subsystem.
> > >
> > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > >
> > > Changes in v7:
> > > 1. Rebased to top of tpmdd/master
> > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > > tpm_transmit_cmd() which is an internal function.
> > >
> > > Changes in v6:
> > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > > changes required patches #1 and #2 update, so I have dropped review
> > > tags from those patches.
> > > 2. Incorporated miscellaneous comments from Jarkko.
> > >
> > > Changes in v5:
> > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > 2. Add Reviewed-by tag for patch #2.
> > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > > "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > 4. Misc changes to rename files.
> > >
> > > Changes in v4:
> > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > 3. Keep module name as trusted.ko only
> > >
> > > Changes in v3:
> > >
> > > Move TPM2 trusted keys code to trusted keys subsystem.
> > >
> > > Changes in v2:
> > >
> > > Split trusted keys abstraction patch for ease of review.
> > >
> > > Sumit Garg (4):
> > > tpm: Move tpm_buf code to include/linux/
> > > KEYS: Use common tpm_buf for trusted and asymmetric keys
> > > KEYS: trusted: Create trusted keys subsystem
> > > KEYS: trusted: Move TPM2 trusted keys code
> > >
> > > crypto/asymmetric_keys/asym_tpm.c | 101 +++----
> > > drivers/char/tpm/tpm-interface.c | 56 ----
> > > drivers/char/tpm/tpm.h | 226 ---------------
> > > drivers/char/tpm/tpm2-cmd.c | 307 --------------------
> > > include/Kbuild | 1 -
> > > include/keys/{trusted.h => trusted_tpm.h} | 49 +---
> > > include/linux/tpm.h | 251 ++++++++++++++--
> > > security/keys/Makefile | 2 +-
> > > security/keys/trusted-keys/Makefile | 8 +
> > > .../{trusted.c => trusted-keys/trusted_tpm1.c} | 96 +++----
> > > security/keys/trusted-keys/trusted_tpm2.c | 314 +++++++++++++++++++++
> > > 11 files changed, 652 insertions(+), 759 deletions(-)
> > > rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > > create mode 100644 security/keys/trusted-keys/Makefile
> > > rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > > create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > >
> > > --
> > > 2.7.4
> > >
> >
> > I fixed a merge conflict caused by James' commit. Already pushed.
> > Compiling test kernel ATM i.e. tested-by's will follow later.
>
> Update to my latest master for v8 (otherwise there won't be a clean
> merge).
>
Okay, I will send v8 to rebase to your latest master.
-Sumit
> /Jarkko
^ permalink raw reply
* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Jarkko Sakkinen @ 2019-10-14 20:16 UTC (permalink / raw)
To: Sumit Garg, dhowells, peterhuewe, keyrings, linux-integrity,
linux-crypto, linux-security-module, herbert, davem, jgg, arnd,
gregkh, jejb, zohar, jmorris, serge, linux-kernel,
daniel.thompson
In-Reply-To: <20191011210517.qxjemugqczsvscu6@cantor>
On Fri, Oct 11, 2019 at 02:05:17PM -0700, Jerry Snitselaar wrote:
> On Fri Oct 11 19, Jarkko Sakkinen wrote:
> > On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > > This patch-set does restructuring of trusted keys code to create and
> > > consolidate trusted keys subsystem.
> > >
> > > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> > >
> > > Changes in v7:
> > > 1. Rebased to top of tpmdd/master
> > > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > > tpm_transmit_cmd() which is an internal function.
> > >
> > > Changes in v6:
> > > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > > changes required patches #1 and #2 update, so I have dropped review
> > > tags from those patches.
> > > 2. Incorporated miscellaneous comments from Jarkko.
> > >
> > > Changes in v5:
> > > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > > 2. Add Reviewed-by tag for patch #2.
> > > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > > "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > > 4. Misc changes to rename files.
> > >
> > > Changes in v4:
> > > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > > 3. Keep module name as trusted.ko only
> > >
> > > Changes in v3:
> > >
> > > Move TPM2 trusted keys code to trusted keys subsystem.
> > >
> > > Changes in v2:
> > >
> > > Split trusted keys abstraction patch for ease of review.
> > >
> > > Sumit Garg (4):
> > > tpm: Move tpm_buf code to include/linux/
> > > KEYS: Use common tpm_buf for trusted and asymmetric keys
> > > KEYS: trusted: Create trusted keys subsystem
> > > KEYS: trusted: Move TPM2 trusted keys code
> > >
> > > crypto/asymmetric_keys/asym_tpm.c | 101 +++----
> > > drivers/char/tpm/tpm-interface.c | 56 ----
> > > drivers/char/tpm/tpm.h | 226 ---------------
> > > drivers/char/tpm/tpm2-cmd.c | 307 --------------------
> > > include/Kbuild | 1 -
> > > include/keys/{trusted.h => trusted_tpm.h} | 49 +---
> > > include/linux/tpm.h | 251 ++++++++++++++--
> > > security/keys/Makefile | 2 +-
> > > security/keys/trusted-keys/Makefile | 8 +
> > > .../{trusted.c => trusted-keys/trusted_tpm1.c} | 96 +++----
> > > security/keys/trusted-keys/trusted_tpm2.c | 314 +++++++++++++++++++++
> > > 11 files changed, 652 insertions(+), 759 deletions(-)
> > > rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > > create mode 100644 security/keys/trusted-keys/Makefile
> > > rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > > create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> > >
> > > --
> > > 2.7.4
> > >
> >
> > I fixed a merge conflict caused by James' commit. Already pushed.
> > Compiling test kernel ATM i.e. tested-by's will follow later.
> >
> > /Jarkko
>
> Are you missing patch 4 on master?
Already removed the patch set given the sparse issues. Read this email
after doing that. Thanks anyway for pointing that out.
/Jarkko
^ permalink raw reply
* Re: [Patch v7 0/4] Create and consolidate trusted keys subsystem
From: Jarkko Sakkinen @ 2019-10-14 19:33 UTC (permalink / raw)
To: Sumit Garg
Cc: dhowells, peterhuewe, keyrings, linux-integrity, linux-crypto,
linux-security-module, herbert, davem, jgg, arnd, gregkh, jejb,
zohar, jmorris, serge, jsnitsel, linux-kernel, daniel.thompson
In-Reply-To: <20191011123757.GD3129@linux.intel.com>
On Fri, Oct 11, 2019 at 03:37:57PM +0300, Jarkko Sakkinen wrote:
> On Mon, Oct 07, 2019 at 10:55:31AM +0530, Sumit Garg wrote:
> > This patch-set does restructuring of trusted keys code to create and
> > consolidate trusted keys subsystem.
> >
> > Also, patch #2 replaces tpm1_buf code used in security/keys/trusted.c and
> > crypto/asymmertic_keys/asym_tpm.c files to use the common tpm_buf code.
> >
> > Changes in v7:
> > 1. Rebased to top of tpmdd/master
> > 2. Patch #4: update tpm2 trusted keys code to use tpm_send() instead of
> > tpm_transmit_cmd() which is an internal function.
> >
> > Changes in v6:
> > 1. Switch TPM asymmetric code also to use common tpm_buf code. These
> > changes required patches #1 and #2 update, so I have dropped review
> > tags from those patches.
> > 2. Incorporated miscellaneous comments from Jarkko.
> >
> > Changes in v5:
> > 1. Drop 5/5 patch as its more relavant along with TEE patch-set.
> > 2. Add Reviewed-by tag for patch #2.
> > 3. Fix build failure when "CONFIG_HEADER_TEST" and
> > "CONFIG_KERNEL_HEADER_TEST" config options are enabled.
> > 4. Misc changes to rename files.
> >
> > Changes in v4:
> > 1. Separate patch for export of tpm_buf code to include/linux/tpm.h
> > 2. Change TPM1.x trusted keys code to use common tpm_buf
> > 3. Keep module name as trusted.ko only
> >
> > Changes in v3:
> >
> > Move TPM2 trusted keys code to trusted keys subsystem.
> >
> > Changes in v2:
> >
> > Split trusted keys abstraction patch for ease of review.
> >
> > Sumit Garg (4):
> > tpm: Move tpm_buf code to include/linux/
> > KEYS: Use common tpm_buf for trusted and asymmetric keys
> > KEYS: trusted: Create trusted keys subsystem
> > KEYS: trusted: Move TPM2 trusted keys code
> >
> > crypto/asymmetric_keys/asym_tpm.c | 101 +++----
> > drivers/char/tpm/tpm-interface.c | 56 ----
> > drivers/char/tpm/tpm.h | 226 ---------------
> > drivers/char/tpm/tpm2-cmd.c | 307 --------------------
> > include/Kbuild | 1 -
> > include/keys/{trusted.h => trusted_tpm.h} | 49 +---
> > include/linux/tpm.h | 251 ++++++++++++++--
> > security/keys/Makefile | 2 +-
> > security/keys/trusted-keys/Makefile | 8 +
> > .../{trusted.c => trusted-keys/trusted_tpm1.c} | 96 +++----
> > security/keys/trusted-keys/trusted_tpm2.c | 314 +++++++++++++++++++++
> > 11 files changed, 652 insertions(+), 759 deletions(-)
> > rename include/keys/{trusted.h => trusted_tpm.h} (77%)
> > create mode 100644 security/keys/trusted-keys/Makefile
> > rename security/keys/{trusted.c => trusted-keys/trusted_tpm1.c} (94%)
> > create mode 100644 security/keys/trusted-keys/trusted_tpm2.c
> >
> > --
> > 2.7.4
> >
>
> I fixed a merge conflict caused by James' commit. Already pushed.
> Compiling test kernel ATM i.e. tested-by's will follow later.
Update to my latest master for v8 (otherwise there won't be a clean
merge).
/Jarkko
^ permalink raw reply
* [PATCH v2] perf_event: Add support for LSM and SELinux checks
From: Joel Fernandes (Google) @ 2019-10-14 17:03 UTC (permalink / raw)
To: linux-kernel
Cc: Joel Fernandes (Google), Peter Zijlstra, rostedt, primiano,
rsavitski, jeffv, kernel-team, James Morris, Alexei Starovoitov,
Arnaldo Carvalho de Melo, bpf, Daniel Borkmann, Ingo Molnar,
Jiri Olsa, Kees Cook, linux-security-module, Matthew Garrett,
Namhyung Kim, selinux, Song Liu,
maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Yonghong Song
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl. This has a number of
limitations:
1. The sysctl is only a single value. Many types of accesses are controlled
based on the single value thus making the control very limited and
coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
all processes get access to perf_event_open(2) opening the door to
security issues.
This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.
5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
syscall itself. The hook is called from all the places that the
perf_event_paranoid sysctl is checked to keep it consistent with the
systctl. The hook gets passed a 'type' argument which controls CPU,
kernel and tracepoint accesses (in this context, CPU, kernel and
tracepoint have the same semantics as the perf_event_paranoid sysctl).
Additionally, I added an 'open' type which is similar to
perf_event_paranoid sysctl == 3 patch carried in Android and several other
distros but was rejected in mainline [1] in 2016.
2. perf_event_alloc: This allocates a new security object for the event
which stores the current SID within the event. It will be useful when
the perf event's FD is passed through IPC to another process which may
try to read the FD. Appropriate security checks will limit access.
3. perf_event_free: Called when the event is closed.
4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.
5. perf_event_write: Called from the ioctl(2) syscalls for the event.
[1] https://lwn.net/Articles/696240/
Since Peter had suggest LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.
To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: rostedt@goodmis.org
Cc: primiano@google.com
Cc: rsavitski@google.com
Cc: jeffv@google.com
Cc: kernel-team@android.com
Acked-by: James Morris <jmorris@namei.org>
Co-developed-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
---
Changes since v1:
o Fixes from Peter Ziljstra.
o Added Ack from James Morris and Co-developed-by tag for Peter.
Changes since RFC:
o Small nits, style changes (James Morris).
o Consolidation of code (Peter Zijlstra).
arch/x86/events/intel/bts.c | 8 ++--
arch/x86/events/intel/core.c | 5 ++-
arch/x86/events/intel/p4.c | 5 ++-
include/linux/lsm_hooks.h | 15 +++++++
include/linux/perf_event.h | 28 +++++++++---
include/linux/security.h | 39 +++++++++++++++-
include/uapi/linux/perf_event.h | 9 ++++
kernel/events/core.c | 57 +++++++++++++++++++-----
kernel/trace/trace_event_perf.c | 15 ++++---
security/security.c | 27 +++++++++++
security/selinux/hooks.c | 69 +++++++++++++++++++++++++++++
security/selinux/include/classmap.h | 2 +
security/selinux/include/objsec.h | 6 ++-
13 files changed, 255 insertions(+), 30 deletions(-)
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..38de4a7f6752 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
* Note that the default paranoia setting permits unprivileged
* users to profile the kernel.
*/
- if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
- !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (event->attr.exclude_kernel) {
+ ret = perf_allow_kernel(&event->attr);
+ if (ret)
+ return ret;
+ }
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..32967a9e9962 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (x86_pmu.version < 3)
return -EINVAL;
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ ret = perf_allow_cpu(&event->attr);
+ if (ret)
+ return ret;
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..a4cc66005ce8 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event)
* the user needs special permissions to be able to use it
*/
if (p4_ht_active() && p4_event_bind_map[v].shared) {
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ v = perf_allow_cpu(&event->attr);
+ if (v)
+ return v;
}
/* ESCR EventMask bits may be invalid */
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a3763247547c..20d8cf194fb7 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1818,6 +1818,14 @@ union security_list_options {
void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
#endif /* CONFIG_BPF_SYSCALL */
int (*locked_down)(enum lockdown_reason what);
+#ifdef CONFIG_PERF_EVENTS
+ int (*perf_event_open)(struct perf_event_attr *attr, int type);
+ int (*perf_event_alloc)(struct perf_event *event);
+ void (*perf_event_free)(struct perf_event *event);
+ int (*perf_event_read)(struct perf_event *event);
+ int (*perf_event_write)(struct perf_event *event);
+
+#endif
};
struct security_hook_heads {
@@ -2060,6 +2068,13 @@ struct security_hook_heads {
struct hlist_head bpf_prog_free_security;
#endif /* CONFIG_BPF_SYSCALL */
struct hlist_head locked_down;
+#ifdef CONFIG_PERF_EVENTS
+ struct hlist_head perf_event_open;
+ struct hlist_head perf_event_alloc;
+ struct hlist_head perf_event_free;
+ struct hlist_head perf_event_read;
+ struct hlist_head perf_event_write;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61448c19a132..664bb7f99c46 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -56,6 +56,7 @@ struct perf_guest_info_callbacks {
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
+#include <linux/security.h>
#include <asm/local.h>
struct perf_callchain_entry {
@@ -721,6 +722,9 @@ struct perf_event {
struct perf_cgroup *cgrp; /* cgroup event is attach to */
#endif
+#ifdef CONFIG_SECURITY
+ void *security;
+#endif
struct list_head sb_list;
#endif /* CONFIG_PERF_EVENTS */
};
@@ -1241,19 +1245,33 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
int perf_event_max_stack_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-static inline bool perf_paranoid_tracepoint_raw(void)
+static inline int perf_is_paranoid(void)
{
return sysctl_perf_event_paranoid > -1;
}
-static inline bool perf_paranoid_cpu(void)
+static inline int perf_allow_kernel(struct perf_event_attr *attr)
{
- return sysctl_perf_event_paranoid > 0;
+ if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
}
-static inline bool perf_paranoid_kernel(void)
+static inline int perf_allow_cpu(struct perf_event_attr *attr)
{
- return sysctl_perf_event_paranoid > 1;
+ if (sysctl_perf_event_paranoid > 0 && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ return security_perf_event_open(attr, PERF_SECURITY_CPU);
+}
+
+static inline int perf_allow_tracepoint(struct perf_event_attr *attr)
+{
+ if (sysctl_perf_event_paranoid > -1 && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT);
}
extern void perf_event_init(void);
diff --git a/include/linux/security.h b/include/linux/security.h
index a8d59d612d27..273e11c66ed7 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1894,5 +1894,42 @@ static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
#endif /* CONFIG_SECURITY */
#endif /* CONFIG_BPF_SYSCALL */
-#endif /* ! __LINUX_SECURITY_H */
+#ifdef CONFIG_PERF_EVENTS
+struct perf_event_attr;
+
+#ifdef CONFIG_SECURITY
+extern int security_perf_event_open(struct perf_event_attr *attr, int type);
+extern int security_perf_event_alloc(struct perf_event *event);
+extern void security_perf_event_free(struct perf_event *event);
+extern int security_perf_event_read(struct perf_event *event);
+extern int security_perf_event_write(struct perf_event *event);
+#else
+static inline int security_perf_event_open(struct perf_event_attr *attr,
+ int type)
+{
+ return 0;
+}
+static inline int security_perf_event_alloc(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void security_perf_event_free(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline int security_perf_event_read(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline int security_perf_event_write(struct perf_event *event)
+{
+ return 0;
+}
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_PERF_EVENTS */
+
+#endif /* ! __LINUX_SECURITY_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index bb7b271397a6..2af95f937a5b 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -427,6 +427,15 @@ struct perf_event_attr {
__u16 __reserved_2; /* align to __u64 */
};
+
+/* Access to perf_event_open(2) syscall. */
+#define PERF_SECURITY_OPEN 0
+
+/* Finer grained perf_event_open(2) access control. */
+#define PERF_SECURITY_CPU 1
+#define PERF_SECURITY_KERNEL 2
+#define PERF_SECURITY_TRACEPOINT 3
+
/*
* Structure used by below PERF_EVENT_IOC_QUERY_BPF command
* to query bpf programs attached to the same perf tracepoint
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4655adbbae10..8ad597fae5f4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4217,8 +4217,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EACCES);
+ err = perf_allow_cpu(&event->attr);
+ if (err)
+ return ERR_PTR(err);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -4527,6 +4528,8 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4980,6 +4983,10 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct perf_event_context *ctx;
int ret;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
@@ -5244,6 +5251,11 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct perf_event_context *ctx;
long ret;
+ /* Treat ioctl like writes as it is likely a mutating operation. */
+ ret = security_perf_event_write(event);
+ if (ret)
+ return ret;
+
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
@@ -5706,6 +5718,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
+ ret = security_perf_event_read(event);
+ if (ret)
+ return ret;
+
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
@@ -5819,7 +5835,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
- if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
@@ -10553,11 +10569,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
+ err = security_perf_event_alloc(event);
+ if (err)
+ goto err_callchain_buffer;
+
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
@@ -10675,9 +10700,11 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
- if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
- && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
+ ret = perf_allow_kernel(attr);
+ if (ret)
+ return ret;
+ }
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10890,13 +10917,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ /* Do we allow access to perf_event_open(2) ? */
+ err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ if (err)
+ return err;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
- if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
}
if (attr.namespaces) {
@@ -10913,9 +10946,11 @@ SYSCALL_DEFINE5(perf_event_open,
}
/* Only privileged users can get physical addresses */
- if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
+ err = perf_allow_kernel(&attr);
+ if (err)
+ return err;
+ }
err = security_locked_down(LOCKDOWN_PERF);
if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0892e38ed6fb..0917fee6ee7c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -8,6 +8,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
+#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"
@@ -26,8 +27,10 @@ static int total_ref_count;
static int perf_trace_event_perm(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
+ int ret;
+
if (tp_event->perf_perm) {
- int ret = tp_event->perf_perm(tp_event, p_event);
+ ret = tp_event->perf_perm(tp_event, p_event);
if (ret)
return ret;
}
@@ -46,8 +49,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
if (!is_sampling_event(p_event))
return 0;
@@ -82,8 +86,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ ret = perf_allow_tracepoint(&p_event->attr);
+ if (ret)
+ return ret;
return 0;
}
diff --git a/security/security.c b/security/security.c
index 1bc000f834e2..cd2d18d2d279 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2404,3 +2404,30 @@ int security_locked_down(enum lockdown_reason what)
return call_int_hook(locked_down, 0, what);
}
EXPORT_SYMBOL(security_locked_down);
+
+#ifdef CONFIG_PERF_EVENTS
+int security_perf_event_open(struct perf_event_attr *attr, int type)
+{
+ return call_int_hook(perf_event_open, 0, attr, type);
+}
+
+int security_perf_event_alloc(struct perf_event *event)
+{
+ return call_int_hook(perf_event_alloc, 0, event);
+}
+
+void security_perf_event_free(struct perf_event *event)
+{
+ call_void_hook(perf_event_free, event);
+}
+
+int security_perf_event_read(struct perf_event *event)
+{
+ return call_int_hook(perf_event_read, 0, event);
+}
+
+int security_perf_event_write(struct perf_event *event)
+{
+ return call_int_hook(perf_event_write, 0, event);
+}
+#endif /* CONFIG_PERF_EVENTS */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9625b99e677f..28eb05490d59 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6795,6 +6795,67 @@ struct lsm_blob_sizes selinux_blob_sizes __lsm_ro_after_init = {
.lbs_msg_msg = sizeof(struct msg_security_struct),
};
+#ifdef CONFIG_PERF_EVENTS
+static int selinux_perf_event_open(struct perf_event_attr *attr, int type)
+{
+ u32 requested, sid = current_sid();
+
+ if (type == PERF_SECURITY_OPEN)
+ requested = PERF_EVENT__OPEN;
+ else if (type == PERF_SECURITY_CPU)
+ requested = PERF_EVENT__CPU;
+ else if (type == PERF_SECURITY_KERNEL)
+ requested = PERF_EVENT__KERNEL;
+ else if (type == PERF_SECURITY_TRACEPOINT)
+ requested = PERF_EVENT__TRACEPOINT;
+ else
+ return -EINVAL;
+
+ return avc_has_perm(&selinux_state, sid, sid, SECCLASS_PERF_EVENT,
+ requested, NULL);
+}
+
+static int selinux_perf_event_alloc(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec;
+
+ perfsec = kzalloc(sizeof(*perfsec), GFP_KERNEL);
+ if (!perfsec)
+ return -ENOMEM;
+
+ perfsec->sid = current_sid();
+ event->security = perfsec;
+
+ return 0;
+}
+
+static void selinux_perf_event_free(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+
+ event->security = NULL;
+ kfree(perfsec);
+}
+
+static int selinux_perf_event_read(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+ u32 sid = current_sid();
+
+ return avc_has_perm(&selinux_state, sid, perfsec->sid,
+ SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
+}
+
+static int selinux_perf_event_write(struct perf_event *event)
+{
+ struct perf_event_security_struct *perfsec = event->security;
+ u32 sid = current_sid();
+
+ return avc_has_perm(&selinux_state, sid, perfsec->sid,
+ SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
+}
+#endif
+
static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -7030,6 +7091,14 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
#endif
+
+#ifdef CONFIG_PERF_EVENTS
+ LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
+ LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
+ LSM_HOOK_INIT(perf_event_free, selinux_perf_event_free),
+ LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
+ LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
+#endif
};
static __init int selinux_init(void)
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 32e9b03be3dd..7db24855e12d 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -244,6 +244,8 @@ struct security_class_mapping secclass_map[] = {
{"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
{ "xdp_socket",
{ COMMON_SOCK_PERMS, NULL } },
+ { "perf_event",
+ {"open", "cpu", "kernel", "tracepoint", "read", "write"} },
{ NULL }
};
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 586b7abd0aa7..a4a86cbcfb0a 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -141,7 +141,11 @@ struct pkey_security_struct {
};
struct bpf_security_struct {
- u32 sid; /*SID of bpf obj creater*/
+ u32 sid; /* SID of bpf obj creator */
+};
+
+struct perf_event_security_struct {
+ u32 sid; /* SID of perf_event obj creator */
};
extern struct lsm_blob_sizes selinux_blob_sizes;
--
2.23.0.700.g56cf767bdb-goog
^ permalink raw reply related
* Re: [PATCH] perf_event: Add support for LSM and SELinux checks
From: Joel Fernandes @ 2019-10-14 16:54 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
Daniel Borkmann, Ingo Molnar, James Morris, Jiri Olsa, Kees Cook,
linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
Yonghong Song
In-Reply-To: <20191014093544.GB2328@hirez.programming.kicks-ass.net>
On Mon, Oct 14, 2019 at 11:35:44AM +0200, Peter Zijlstra wrote:
> On Fri, Oct 11, 2019 at 12:03:30PM -0400, Joel Fernandes (Google) wrote:
>
> > @@ -4761,6 +4762,7 @@ int perf_event_release_kernel(struct perf_event *event)
> > }
> >
> > no_ctx:
> > + security_perf_event_free(event);
> > put_event(event); /* Must be the 'last' reference */
> > return 0;
> > }
>
> > @@ -10553,11 +10568,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> > }
> > }
> >
> > + err = security_perf_event_alloc(event);
> > + if (err)
> > + goto err_security;
> > +
> > /* symmetric to unaccount_event() in _free_event() */
> > account_event(event);
> >
> > return event;
> >
> > +err_security:
> > err_addr_filters:
> > kfree(event->addr_filter_ranges);
> >
>
> There's a bunch of problems here I think:
>
> - err_security is named wrong; the naming scheme is to name the label
> after the last thing that succeeded / first thing that needs to be
> undone.
>
> - per that, you're forgetting to undo 'get_callchain_buffers()'
Yes, you're right. Tested your fix below. Sorry to miss this.
> - perf_event_release_kernel() is not a full match to
> perf_event_alloc(), inherited events get created by
> perf_event_alloc() but never pass through
> perf_event_release_kernel().
Oh, through inherit_event(). Thanks for pointing this semantic out, did not
know that.
> I'm thinking the below patch on top should ammend these issues; please
> verify.
Yes, applied your diff below and verified that the events are getting freed
as they were in my initial set of tests. The diff also looks good to me.
I squashed your diff below and will resend as v3. Since you modified this
patch a lot, I will add your Co-developed-by tag as well.
thanks, Peter!
- Joel
> ---
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4540,6 +4540,8 @@ static void _free_event(struct perf_even
>
> unaccount_event(event);
>
> + security_perf_event_free(event);
> +
> if (event->rb) {
> /*
> * Can happen when we close an event with re-directed output.
> @@ -4774,7 +4776,6 @@ int perf_event_release_kernel(struct per
> }
>
> no_ctx:
> - security_perf_event_free(event);
> put_event(event); /* Must be the 'last' reference */
> return 0;
> }
> @@ -10595,14 +10596,18 @@ perf_event_alloc(struct perf_event_attr
>
> err = security_perf_event_alloc(event);
> if (err)
> - goto err_security;
> + goto err_callchain_buffer;
>
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> return event;
>
> -err_security:
> +err_callchain_buffer:
> + if (!event->parent) {
> + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
> + put_callchain_buffers();
> + }
> err_addr_filters:
> kfree(event->addr_filter_ranges);
>
^ permalink raw reply
* Re: [PATCH] perf_event: Add support for LSM and SELinux checks
From: Peter Zijlstra @ 2019-10-14 9:35 UTC (permalink / raw)
To: Joel Fernandes (Google)
Cc: linux-kernel, rostedt, primiano, rsavitski, jeffv, kernel-team,
Alexei Starovoitov, Arnaldo Carvalho de Melo, bpf,
Daniel Borkmann, Ingo Molnar, James Morris, Jiri Olsa, Kees Cook,
linux-security-module, Matthew Garrett, Namhyung Kim, selinux,
Song Liu, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
Yonghong Song
In-Reply-To: <20191011160330.199604-1-joel@joelfernandes.org>
On Fri, Oct 11, 2019 at 12:03:30PM -0400, Joel Fernandes (Google) wrote:
> @@ -4761,6 +4762,7 @@ int perf_event_release_kernel(struct perf_event *event)
> }
>
> no_ctx:
> + security_perf_event_free(event);
> put_event(event); /* Must be the 'last' reference */
> return 0;
> }
> @@ -10553,11 +10568,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
> }
> }
>
> + err = security_perf_event_alloc(event);
> + if (err)
> + goto err_security;
> +
> /* symmetric to unaccount_event() in _free_event() */
> account_event(event);
>
> return event;
>
> +err_security:
> err_addr_filters:
> kfree(event->addr_filter_ranges);
>
There's a bunch of problems here I think:
- err_security is named wrong; the naming scheme is to name the label
after the last thing that succeeded / first thing that needs to be
undone.
- per that, you're forgetting to undo 'get_callchain_buffers()'
- perf_event_release_kernel() is not a full match to
perf_event_alloc(), inherited events get created by
perf_event_alloc() but never pass through
perf_event_release_kernel().
I'm thinking the below patch on top should ammend these issues; please
verify.
---
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4540,6 +4540,8 @@ static void _free_event(struct perf_even
unaccount_event(event);
+ security_perf_event_free(event);
+
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
@@ -4774,7 +4776,6 @@ int perf_event_release_kernel(struct per
}
no_ctx:
- security_perf_event_free(event);
put_event(event); /* Must be the 'last' reference */
return 0;
}
@@ -10595,14 +10596,18 @@ perf_event_alloc(struct perf_event_attr
err = security_perf_event_alloc(event);
if (err)
- goto err_security;
+ goto err_callchain_buffer;
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
-err_security:
+err_callchain_buffer:
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
err_addr_filters:
kfree(event->addr_filter_ranges);
^ permalink raw reply
* kernel panic while using get_random_bytes
From: Temp Sha @ 2019-10-14 8:21 UTC (permalink / raw)
To: linux-security-module
hi,
i use get_random_bytes() function for my randomness requirement in
kernel version 4.14.142
however is gives panic as soon as I call get_random_bytes() in my module.
Oct 10 07:20:18 BUG: unable to handle kernel paging request at 00007f5563ced000
IP: chacha20_block+0x24d/0x280
PGD 800000010f7f8067 P4D 800000010f7f8067 PUD 161316067 PMD 1015a8067 PTE 0
Oops: 0002 [#1] PREEMPT SMP PTI
Modules linked in: ipi_hsl(PO) mymod(PO) e1000 ipv6 ftdi_sio usbserial
xt_tcpudp xt_mark iptable_nat nf_nat_ipv4 nf_conntrack_ipv4
nf_defrag_ipv4 nf_nat xt_connlimit nf_conntrack iptable_filter
ip_tables x_tables
CPU: 0 PID: 1841 Comm: hexdump Tainted: P O 4.14.142-ws-symbol #1
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
task: ffff8881611da000 task.stack: ffffc900504c4000
RIP: 0010:chacha20_block+0x24d/0x280
RSP: 0018:ffffc900504c7c70 EFLAGS: 00010886
RAX: 0000000000000000 RBX: 00000000a88c95b0 RCX: 00007f5563ced000
RDX: ffff88810f79da00 RSI: 0000000015c4332e RDI: 000000007613f298
RBP: ffffc900504c7d00 R08: 000000009d39d68d R09: 00000000bfbdb51f
R10: 00000000ed798a26 R11: 0000000083c184dc R12: 0000000036fc61e0
R13: 00000000f9004639 R14: 0000000042c0d351 R15: 000000008a6cef0f
FS: 00007f5563cef700(0000) GS:ffff888167e00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5563ced000 CR3: 000000016117c000 CR4: 00000000000006b0
Call Trace:
_extract_crng+0x6d/0xc0
extract_crng+0x3a/0x40
_get_random_bytes+0x56/0x1c0
? vprintk_func+0x3f/0xd0
? printk+0x3e/0x46
get_random_bytes+0x2f/0x40
xyz_packets+0x1110/0x11e0 [mymod]
proc_reg_read+0x3d/0x60
__vfs_read+0x23/0x120
? vm_mmap_pgoff+0x9d/0xd0
vfs_read+0x8e/0x110
SyS_read+0x48/0xc0
do_syscall_64+0x5c/0x260
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x7f55631f
what could be the problem?
thanks
^ permalink raw reply
* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-13 0:39 UTC (permalink / raw)
To: Linus Torvalds
Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
Matthew Garrett, James Morris James Morris, LSM List, Linux API,
Ben Hutchings, Al Viro
In-Reply-To: <20191012203502.065258d2@gandalf.local.home>
On Sat, 12 Oct 2019 20:35:02 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> On Sat, 12 Oct 2019 15:56:15 -0700
> Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> > On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> > >
> > >
> > > I bisected this down to the addition of the proxy_ops into tracefs for
> > > lockdown. It appears that the allocation of the proxy_ops and then freeing
> > > it in the destroy_inode callback, is causing havoc with the memory system.
> > > Reading the documentation about destroy_inode and talking with Linus about
> > > this, this is buggy and wrong.
> >
> > Can you still add the explanation about the inode memory leak to this message?
> >
> > Right now it just says "it's buggy and wrong". True. But doesn't
> > explain _why_ it is buggy and wrong.
> >
>
> Sure. The patches just finished my testing (along with other fixes that
> I need to send you). I have to make a few other updates in the change
> log though, so I'll be rebasing them (but not touching the code), to
> clean up the change logs.
>
I updated this change log to state:
"I bisected this down to the addition of the proxy_ops into tracefs for
lockdown. It appears that the allocation of the proxy_ops and then freeing
it in the destroy_inode callback, is causing havoc with the memory system.
Reading the documentation about destroy_inode and talking with Linus about
this, this is buggy and wrong. When defining the destroy_inode() method, it
is expected that the destroy_inode() will also free the inode, and not just
the extra allocations done in the creation of the inode. The faulty commit
causes a memory leak of the inode data structure when they are deleted."
-- Steve
^ permalink raw reply
* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-13 0:35 UTC (permalink / raw)
To: Linus Torvalds
Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
Matthew Garrett, James Morris James Morris, LSM List, Linux API,
Ben Hutchings, Al Viro
In-Reply-To: <CAHk-=whE7GjKz9LtEVNw=zEgWr65N1mU7t2rA4MLiia8Zit6DQ@mail.gmail.com>
On Sat, 12 Oct 2019 15:56:15 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:
> >
> >
> > I bisected this down to the addition of the proxy_ops into tracefs for
> > lockdown. It appears that the allocation of the proxy_ops and then freeing
> > it in the destroy_inode callback, is causing havoc with the memory system.
> > Reading the documentation about destroy_inode and talking with Linus about
> > this, this is buggy and wrong.
>
> Can you still add the explanation about the inode memory leak to this message?
>
> Right now it just says "it's buggy and wrong". True. But doesn't
> explain _why_ it is buggy and wrong.
>
Sure. The patches just finished my testing (along with other fixes that
I need to send you). I have to make a few other updates in the change
log though, so I'll be rebasing them (but not touching the code), to
clean up the change logs.
-- Steve
^ permalink raw reply
* Re: [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Linus Torvalds @ 2019-10-12 22:56 UTC (permalink / raw)
To: Steven Rostedt
Cc: Linux Kernel Mailing List, Ingo Molnar, Andrew Morton,
Matthew Garrett, James Morris James Morris, LSM List, Linux API,
Ben Hutchings, Al Viro
In-Reply-To: <20191012005920.630331484@goodmis.org>
On Fri, Oct 11, 2019 at 5:59 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
>
> I bisected this down to the addition of the proxy_ops into tracefs for
> lockdown. It appears that the allocation of the proxy_ops and then freeing
> it in the destroy_inode callback, is causing havoc with the memory system.
> Reading the documentation about destroy_inode and talking with Linus about
> this, this is buggy and wrong.
Can you still add the explanation about the inode memory leak to this message?
Right now it just says "it's buggy and wrong". True. But doesn't
explain _why_ it is buggy and wrong.
Linus
^ permalink raw reply
* Re: [PATCH 4/7 v2] tracing: Have trace events system open call tracing_open_generic_tr()
From: Steven Rostedt @ 2019-10-12 2:09 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro
In-Reply-To: <20191012005921.091872328@goodmis.org>
On Fri, 11 Oct 2019 20:57:51 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -1391,9 +1391,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
> struct trace_array *tr;
> int ret;
>
> - if (tracing_is_disabled())
> - return -ENODEV;
> -
> /* Make sure the system still exists */
> mutex_lock(&event_mutex);
> mutex_lock(&trace_types_lock);
> @@ -1420,16 +1417,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
> WARN_ON(!dir);
>
> /* Still need to increment the ref count of the system */
> - if (trace_array_get(tr) < 0) {
> - put_system(dir);
> - return -ENODEV;
> - }
> -
> - ret = tracing_open_generic(inode, filp);
> - if (ret < 0) {
> - trace_array_put(tr);
> + ret = tracing_open_generic_tr(inode, filp);
> + if (ret < 0)
> put_system(dir);
> - }
>
> return ret;
> }
I got a bit too aggressive on this patch. The subsystem_open() gets the
tr from a search, not from the inode. Thus it can't use the
tracing_open_generic_tr() call. It needs to do the trace_array_get()
directly.
V2 of this patch:
-- Steve
From b69083301044f587afefd5a4ac754a8c43ba0f0d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 11 Oct 2019 19:12:21 -0400
Subject: [PATCH] tracing: Have trace events system open call
tracing_open_generic_tr()
Instead of having the trace events system open calls open code the taking of
the trace_array descriptor (with trace_array_get()) and then calling
trace_open_generic(), have it use the tracing_open_generic_tr() that does
the combination of the two. This requires making tracing_open_generic_tr()
global.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 2 +-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 17 +++--------------
3 files changed, 5 insertions(+), 15 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa7d813b04c6..94f1b9124939 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void)
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
*/
-static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f801d154ff6a..854dbf4050f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b89cdfe20bc1..9613a757c902 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1440,28 +1440,17 @@ static int system_tr_open(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
/* Make a temporary dir that has no system but points to tr */
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
- if (!dir) {
- trace_array_put(tr);
+ if (!dir)
return -ENOMEM;
- }
-
- dir->tr = tr;
- ret = tracing_open_generic(inode, filp);
+ ret = tracing_open_generic_tr(inode, filp);
if (ret < 0) {
- trace_array_put(tr);
kfree(dir);
return ret;
}
-
+ dir->tr = tr;
filp->private_data = dir;
return 0;
--
2.20.1
^ permalink raw reply related
* [PATCH 3/7 v2] tracing: Get trace_array reference for available_tracers files
From: Steven Rostedt @ 2019-10-12 0:57 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro, stable
In-Reply-To: <20191012005747.210722465@goodmis.org>
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
As instances may have different tracers available, we need to look at the
trace_array descriptor that shows lists the available tracers for the
instance. But there's a race between opening the file and the admin from
deleting the instance. The trace_array_get() needs to be called before
accessing the trace_array.
Cc: stable@vger.kernel.org
Fixes: 607e2ea167e56 ("tracing: Set up infrastructure to allow tracers for instances")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 252f79c435f8..fa7d813b04c6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4355,9 +4355,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
if (tracing_disabled)
return -ENODEV;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
ret = seq_open(file, &show_traces_seq_ops);
- if (ret)
+ if (ret) {
+ trace_array_put(tr);
return ret;
+ }
m = file->private_data;
m->private = tr;
@@ -4365,6 +4370,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
}
+static int show_traces_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+ return seq_release(inode, file);
+}
+
static ssize_t
tracing_write_stub(struct file *filp, const char __user *ubuf,
size_t count, loff_t *ppos)
@@ -4395,8 +4408,8 @@ static const struct file_operations tracing_fops = {
static const struct file_operations show_traces_fops = {
.open = show_traces_open,
.read = seq_read,
- .release = seq_release,
.llseek = seq_lseek,
+ .release = show_traces_release,
};
static ssize_t
--
2.23.0
^ permalink raw reply related
* [PATCH 0/7 v2] tracing: Fix tracefs lockdown and various clean ups
From: Steven Rostedt @ 2019-10-12 0:57 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro
It appears that using destroy_inode() to clean up the proxy_ops that was
used by the lockdown code to have all open calls to the tracefs directory
was totally broken. It caused the inodes to not be cleaned up as the
destroy_inode() method is expected to clean up the inode, and not just what
it allocated as extra.
Linus suggested to get rid of the proxy_ops in tracefs, and just put the
checks in the open functions themselves. This also gives us a bit more fine
grain control to what exactly can be accessed.
Currently, I left the event format files (as they may need to be used by
something other than tracing), and enabled_functions, which shows what
functions are currently being traced. Not sure it is wise to not display
that information.
They can always be locked down later if need be.
Steven Rostedt (VMware) (7):
tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
ftrace: Get a reference counter for the trace_array on filter files
tracing: Get trace_array reference for available_tracers files
tracing: Have trace events system open call tracing_open_generic_tr()
tracing: Add tracing_check_open_get_tr()
tracing: Add some more locked_down checks
tracing: Do not create tracefs files if tracefs lockdown is in effect
----
fs/tracefs/inode.c | 46 ++----------
kernel/trace/ftrace.c | 55 ++++++++++----
kernel/trace/trace.c | 138 ++++++++++++++++++++++--------------
kernel/trace/trace.h | 2 +
kernel/trace/trace_dynevent.c | 4 ++
kernel/trace/trace_events.c | 49 +++++--------
kernel/trace/trace_events_hist.c | 13 +++-
kernel/trace/trace_events_trigger.c | 8 ++-
kernel/trace/trace_kprobe.c | 12 +++-
kernel/trace/trace_printk.c | 7 ++
kernel/trace/trace_stack.c | 8 +++
kernel/trace/trace_stat.c | 6 +-
kernel/trace/trace_uprobe.c | 11 +++
13 files changed, 220 insertions(+), 139 deletions(-)
^ permalink raw reply
* [PATCH 4/7 v2] tracing: Have trace events system open call tracing_open_generic_tr()
From: Steven Rostedt @ 2019-10-12 0:57 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Instead of having the trace events system open calls open code the taking of
the trace_array descriptor (with trace_array_get()) and then calling
trace_open_generic(), have it use the tracing_open_generic_tr() that does
the combination of the two. This requires making tracing_open_generic_tr()
global.
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
kernel/trace/trace.c | 2 +-
kernel/trace/trace.h | 1 +
kernel/trace/trace_events.c | 31 +++++--------------------------
3 files changed, 7 insertions(+), 27 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fa7d813b04c6..94f1b9124939 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4156,7 +4156,7 @@ bool tracing_is_disabled(void)
* Open and update trace_array ref count.
* Must have the current trace_array passed to it.
*/
-static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f801d154ff6a..854dbf4050f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -681,6 +681,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
+int tracing_open_generic_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b89cdfe20bc1..0d29f2f13477 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1391,9 +1391,6 @@ static int subsystem_open(struct inode *inode, struct file *filp)
struct trace_array *tr;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
/* Make sure the system still exists */
mutex_lock(&event_mutex);
mutex_lock(&trace_types_lock);
@@ -1420,16 +1417,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)
WARN_ON(!dir);
/* Still need to increment the ref count of the system */
- if (trace_array_get(tr) < 0) {
- put_system(dir);
- return -ENODEV;
- }
-
- ret = tracing_open_generic(inode, filp);
- if (ret < 0) {
- trace_array_put(tr);
+ ret = tracing_open_generic_tr(inode, filp);
+ if (ret < 0)
put_system(dir);
- }
return ret;
}
@@ -1440,28 +1430,17 @@ static int system_tr_open(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
int ret;
- if (tracing_is_disabled())
- return -ENODEV;
-
- if (trace_array_get(tr) < 0)
- return -ENODEV;
-
/* Make a temporary dir that has no system but points to tr */
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
- if (!dir) {
- trace_array_put(tr);
+ if (!dir)
return -ENOMEM;
- }
- dir->tr = tr;
-
- ret = tracing_open_generic(inode, filp);
+ ret = tracing_open_generic_tr(inode, filp);
if (ret < 0) {
- trace_array_put(tr);
kfree(dir);
return ret;
}
-
+ dir->tr = tr;
filp->private_data = dir;
return 0;
--
2.23.0
^ permalink raw reply related
* [PATCH 7/7 v2] tracing: Do not create tracefs files if tracefs lockdown is in effect
From: Steven Rostedt @ 2019-10-12 0:57 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
If on boot up, lockdown is activated for tracefs, don't even bother creating
the files. This can also prevent instances from being created if lockdown is
in effect.
Link: http://lkml.kernel.org/r/CAHk-=whC6Ji=fWnjh2+eS4b15TnbsS4VPVtvBOwCy1jjEG_JHQ@mail.gmail.com
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
fs/tracefs/inode.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index eeeae0475da9..0caa151cae4e 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/tracefs.h>
#include <linux/fsnotify.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/magic.h>
@@ -390,6 +391,9 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *dentry;
struct inode *inode;
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
--
2.23.0
^ permalink raw reply related
* [PATCH 1/7 v2] tracefs: Revert ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
From: Steven Rostedt @ 2019-10-12 0:57 UTC (permalink / raw)
To: linux-kernel
Cc: Linus Torvalds, Ingo Molnar, Andrew Morton, Matthew Garrett,
James Morris James Morris, LSM List, Linux API, Ben Hutchings,
Al Viro
In-Reply-To: <20191012005747.210722465@goodmis.org>
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Running the latest kernel through my "make instances" stress tests, I
triggered the following bug (with KASAN and kmemleak enabled):
mkdir invoked oom-killer:
gfp_mask=0x40cd0(GFP_KERNEL|__GFP_COMP|__GFP_RECLAIMABLE), order=0,
oom_score_adj=0
CPU: 1 PID: 2229 Comm: mkdir Not tainted 5.4.0-rc2-test #325
Hardware name: MSI MS-7823/CSM-H87M-G43 (MS-7823), BIOS V1.6 02/22/2014
Call Trace:
dump_stack+0x64/0x8c
dump_header+0x43/0x3b7
? trace_hardirqs_on+0x48/0x4a
oom_kill_process+0x68/0x2d5
out_of_memory+0x2aa/0x2d0
__alloc_pages_nodemask+0x96d/0xb67
__alloc_pages_node+0x19/0x1e
alloc_slab_page+0x17/0x45
new_slab+0xd0/0x234
___slab_alloc.constprop.86+0x18f/0x336
? alloc_inode+0x2c/0x74
? irq_trace+0x12/0x1e
? tracer_hardirqs_off+0x1d/0xd7
? __slab_alloc.constprop.85+0x21/0x53
__slab_alloc.constprop.85+0x31/0x53
? __slab_alloc.constprop.85+0x31/0x53
? alloc_inode+0x2c/0x74
kmem_cache_alloc+0x50/0x179
? alloc_inode+0x2c/0x74
alloc_inode+0x2c/0x74
new_inode_pseudo+0xf/0x48
new_inode+0x15/0x25
tracefs_get_inode+0x23/0x7c
? lookup_one_len+0x54/0x6c
tracefs_create_file+0x53/0x11d
trace_create_file+0x15/0x33
event_create_dir+0x2a3/0x34b
__trace_add_new_event+0x1c/0x26
event_trace_add_tracer+0x56/0x86
trace_array_create+0x13e/0x1e1
instance_mkdir+0x8/0x17
tracefs_syscall_mkdir+0x39/0x50
? get_dname+0x31/0x31
vfs_mkdir+0x78/0xa3
do_mkdirat+0x71/0xb0
sys_mkdir+0x19/0x1b
do_fast_syscall_32+0xb0/0xed
I bisected this down to the addition of the proxy_ops into tracefs for
lockdown. It appears that the allocation of the proxy_ops and then freeing
it in the destroy_inode callback, is causing havoc with the memory system.
Reading the documentation about destroy_inode and talking with Linus about
this, this is buggy and wrong.
Instead of allocating the proxy_ops (and then having to free it) the checks
should be done by the open functions themselves, and not hack into the
tracefs directory. First revert the tracefs updates for locked_down and then
later we can add the locked_down checks in the kernel/trace files.
Link: http://lkml.kernel.org/r/20191011135458.7399da44@gandalf.local.home
Fixes: ccbd54ff54e8 ("tracefs: Restrict tracefs when the kernel is locked down")
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
fs/tracefs/inode.c | 42 +-----------------------------------------
1 file changed, 1 insertion(+), 41 deletions(-)
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 9fc14e38927f..eeeae0475da9 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -20,7 +20,6 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
-#include <linux/security.h>
#define TRACEFS_DEFAULT_MODE 0700
@@ -28,25 +27,6 @@ static struct vfsmount *tracefs_mount;
static int tracefs_mount_count;
static bool tracefs_registered;
-static int default_open_file(struct inode *inode, struct file *filp)
-{
- struct dentry *dentry = filp->f_path.dentry;
- struct file_operations *real_fops;
- int ret;
-
- if (!dentry)
- return -EINVAL;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- real_fops = dentry->d_fsdata;
- if (!real_fops->open)
- return 0;
- return real_fops->open(inode, filp);
-}
-
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -241,12 +221,6 @@ static int tracefs_apply_options(struct super_block *sb)
return 0;
}
-static void tracefs_destroy_inode(struct inode *inode)
-{
- if (S_ISREG(inode->i_mode))
- kfree(inode->i_fop);
-}
-
static int tracefs_remount(struct super_block *sb, int *flags, char *data)
{
int err;
@@ -283,7 +257,6 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations tracefs_super_operations = {
.statfs = simple_statfs,
.remount_fs = tracefs_remount,
- .destroy_inode = tracefs_destroy_inode,
.show_options = tracefs_show_options,
};
@@ -414,7 +387,6 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- struct file_operations *proxy_fops;
struct dentry *dentry;
struct inode *inode;
@@ -430,20 +402,8 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return failed_creating(dentry);
- proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL);
- if (unlikely(!proxy_fops)) {
- iput(inode);
- return failed_creating(dentry);
- }
-
- if (!fops)
- fops = &tracefs_file_operations;
-
- dentry->d_fsdata = (void *)fops;
- memcpy(proxy_fops, fops, sizeof(*proxy_fops));
- proxy_fops->open = default_open_file;
inode->i_mode = mode;
- inode->i_fop = proxy_fops;
+ inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
d_instantiate(dentry, inode);
fsnotify_create(dentry->d_parent->d_inode, dentry);
--
2.23.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox