Linux userland API discussions

Linux userland API discussions
 help / color / mirror / Atom feed

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Ilya Dryomov @ 2019-10-30 16:19 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Rasmus Villemoes, Greg Kroah-Hartman,
	Peter Zijlstra, nicolas.dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, linux-security-module,
	linux-fsdevel, linux-api, LKML
In-Reply-To: <157186186167.3995.7568100174393739543.stgit@warthog.procyon.org.uk>

On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>
> Convert pipes to use head and tail pointers for the buffer ring rather than
> pointer and length as the latter requires two atomic ops to update (or a
> combined op) whereas the former only requires one.
>
>  (1) The head pointer is the point at which production occurs and points to
>      the slot in which the next buffer will be placed.  This is equivalent
>      to pipe->curbuf + pipe->nrbufs.
>
>      The head pointer belongs to the write-side.
>
>  (2) The tail pointer is the point at which consumption occurs.  It points
>      to the next slot to be consumed.  This is equivalent to pipe->curbuf.
>
>      The tail pointer belongs to the read-side.
>
>  (3) head and tail are allowed to run to UINT_MAX and wrap naturally.  They
>      are only masked off when the array is being accessed, e.g.:
>
>         pipe->bufs[head & mask]
>
>      This means that it is not necessary to have a dead slot in the ring as
>      head == tail isn't ambiguous.
>
>  (4) The ring is empty if "head == tail".
>
>      A helper, pipe_empty(), is provided for this.
>
>  (5) The occupancy of the ring is "head - tail".
>
>      A helper, pipe_occupancy(), is provided for this.
>
>  (6) The number of free slots in the ring is "pipe->ring_size - occupancy".
>
>      A helper, pipe_space_for_user() is provided to indicate how many slots
>      userspace may use.
>
>  (7) The ring is full if "head - tail >= pipe->ring_size".
>
>      A helper, pipe_full(), is provided for this.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  fs/fuse/dev.c             |   31 +++--
>  fs/pipe.c                 |  169 ++++++++++++++++-------------
>  fs/splice.c               |  188 ++++++++++++++++++++------------
>  include/linux/pipe_fs_i.h |   86 ++++++++++++++-
>  include/linux/uio.h       |    4 -
>  lib/iov_iter.c            |  266 +++++++++++++++++++++++++--------------------
>  6 files changed, 464 insertions(+), 280 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index dadd617d826c..1e4bc27573cc 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
>                         cs->pipebufs++;
>                         cs->nr_segs--;
>                 } else {
> -                       if (cs->nr_segs == cs->pipe->buffers)
> +                       if (cs->nr_segs >= cs->pipe->ring_size)
>                                 return -EIO;
>
>                         page = alloc_page(GFP_HIGHUSER);
> @@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
>         struct pipe_buffer *buf;
>         int err;
>
> -       if (cs->nr_segs == cs->pipe->buffers)
> +       if (cs->nr_segs >= cs->pipe->ring_size)
>                 return -EIO;
>
>         err = unlock_request(cs->req);
> @@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (!fud)
>                 return -EPERM;
>
> -       bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
> +       bufs = kvmalloc_array(pipe->ring_size, sizeof(struct pipe_buffer),
>                               GFP_KERNEL);
>         if (!bufs)
>                 return -ENOMEM;
> @@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (ret < 0)
>                 goto out;
>
> -       if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
> +       if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->ring_size) {
>                 ret = -EIO;
>                 goto out;
>         }
> @@ -1935,6 +1935,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                                      struct file *out, loff_t *ppos,
>                                      size_t len, unsigned int flags)
>  {
> +       unsigned int head, tail, mask, count;
>         unsigned nbuf;
>         unsigned idx;
>         struct pipe_buffer *bufs;
> @@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         pipe_lock(pipe);
>
> -       bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
> -                             GFP_KERNEL);
> +       head = pipe->head;
> +       tail = pipe->tail;
> +       mask = pipe->ring_size - 1;
> +       count = head - tail;
> +
> +       bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
>         if (!bufs) {
>                 pipe_unlock(pipe);
>                 return -ENOMEM;
> @@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         nbuf = 0;
>         rem = 0;
> -       for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
> -               rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
> +       for (idx = tail; idx < head && rem < len; idx++)
> +               rem += pipe->bufs[idx & mask].len;
>
>         ret = -EINVAL;
>         if (rem < len)
> @@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                 struct pipe_buffer *ibuf;
>                 struct pipe_buffer *obuf;
>
> -               BUG_ON(nbuf >= pipe->buffers);
> -               BUG_ON(!pipe->nrbufs);
> -               ibuf = &pipe->bufs[pipe->curbuf];
> +               BUG_ON(nbuf >= pipe->ring_size);
> +               BUG_ON(tail == head);
> +               ibuf = &pipe->bufs[tail & mask];
>                 obuf = &bufs[nbuf];
>
>                 if (rem >= ibuf->len) {
>                         *obuf = *ibuf;
>                         ibuf->ops = NULL;
> -                       pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
> -                       pipe->nrbufs--;
> +                       tail++;
> +                       pipe_commit_read(pipe, tail);
>                 } else {
>                         if (!pipe_buf_get(pipe, ibuf))
>                                 goto out_free;
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 8a2ab2f974bd..8a0806fe12d3 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -43,10 +43,11 @@ unsigned long pipe_user_pages_hard;
>  unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
>
>  /*
> - * We use a start+len construction, which provides full use of the
> - * allocated memory.
> - * -- Florian Coosmann (FGC)
> - *
> + * We use head and tail indices that aren't masked off, except at the point of
> + * dereference, but rather they're allowed to wrap naturally.  This means there
> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
> + * -- David Howells 2019-09-23.

Hi David,

Is "ring size < INT_MAX" constraint correct?

I've never had to implement this free running indices scheme, but
the way I've always visualized it is that the top bit of the index is
used as a lap (as in a race) indicator, leaving 31 bits to work with
(in case of unsigned ints).  Should that be

  ring size <= 2^31

or more precisely

  ring size is a power of two <= 2^31

or am I missing something?

Thanks,

                Ilya

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Alexei Starovoitov @ 2019-10-30 15:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Edgecombe, Rick P, adobriyan@gmail.com,
	linux-kernel@vger.kernel.org, rppt@kernel.org,
	rostedt@goodmis.org, jejb@linux.ibm.com, tglx@linutronix.de,
	linux-mm@kvack.org, dave.hansen@linux.intel.com,
	linux-api@vger.kernel.org, x86@kernel.org,
	akpm@linux-foundation.org, hpa@zytor.com, mingo@redhat.com,
	luto@kernel.org, kirill@shutemov.name, bp@alien8.de
In-Reply-To: <20191030100418.GV4097@hirez.programming.kicks-ass.net>

On Wed, Oct 30, 2019 at 3:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Tue, Oct 29, 2019 at 05:27:43PM +0000, Edgecombe, Rick P wrote:
> > On Mon, 2019-10-28 at 22:00 +0100, Peter Zijlstra wrote:
>
> > > That should be limited to the module range. Random data maps could
> > > shatter the world.
> >
> > BPF has one vmalloc space allocation for the byte code and one for the module
> > space allocation for the JIT. Both get RO also set on the direct map alias of
> > the pages, and reset RW when freed.
>
> Argh, I didn't know they mapped the bytecode RO; why does it do that? It
> can throw out the bytecode once it's JIT'ed.

because of endless security "concerns" that some folks had.
Like what if something can exploit another bug in the kernel
and modify bytecode that was already verified
then interpreter will execute that modified bytecode.
Sort of similar reasoning why .text is read-only.
I think it's not a realistic attack, but I didn't bother to argue back then.
The mere presence of interpreter itself is a real security concern.
People that care about speculation attacks should
have CONFIG_BPF_JIT_ALWAYS_ON=y,
so modifying bytecode via another exploit will be pointless.
Getting rid of RO for bytecode will save a ton of memory too,
since we won't need to allocate full page for each small programs.

^ permalink raw reply

* Re: [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Mickaël Salaün @ 2019-10-30 14:03 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Shuah Khan, Stephen Smalley,
	Tejun Heo
In-Reply-To: <20191030025621.GA27626@mail.hallyn.com>


On 30/10/2019 03:56, Serge E. Hallyn wrote:
> On Tue, Oct 29, 2019 at 06:15:00PM +0100, Mickaël Salaün wrote:
>> A Landlock domain is a set of eBPF programs.  There is a list for each
>> different program types that can be run on a specific Landlock hook
>> (e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
>> Landlock program should not try (nor be able) to infer which subject is
>> currently enforced, but to have a unique security policy for all
>> subjects tied to the same domain.  This make the reasoning much easier
>> and help avoid pitfalls.
>>
>> The next commits tie a domain to a task's credentials thanks to
>> seccomp(2), but we could use cgroups or a security file-system to
>> enforce a sysadmin-defined policy .
>>
>> Signed-off-by: Mickaël Salaün <mic@digikod.net>
>> Cc: Alexei Starovoitov <ast@kernel.org>
>> Cc: Andy Lutomirski <luto@amacapital.net>
>> Cc: Daniel Borkmann <daniel@iogearbox.net>
>> Cc: James Morris <jmorris@namei.org>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: Serge E. Hallyn <serge@hallyn.com>
>> Cc: Will Drewry <wad@chromium.org>
>> ---
>>
>> Changes since v10:
>> * rename files and names to clearly define a domain
>> * create a standalone patch to ease review
>> ---
> 
> [...]
> 
>> +/**
>> + * store_landlock_prog - prepend and deduplicate a Landlock prog_list
>> + *
>> + * Prepend @prog to @init_domain while ignoring @prog if they are already in
>> + * @ref_domain.  Whatever is the result of this function call, you can call
>> + * bpf_prog_put(@prog) after.
>> + *
>> + * @init_domain: empty domain to prepend to
>> + * @ref_domain: domain to check for duplicate programs
>> + * @prog: program to prepend
>> + *
>> + * Return -errno on error or 0 if @prog was successfully stored.
>> + */
>> +static int store_landlock_prog(struct landlock_domain *init_domain,
>> +		const struct landlock_domain *ref_domain,
>> +		struct bpf_prog *prog)
>> +{
>> +	struct landlock_prog_list *tmp_list = NULL;
>> +	int err;
>> +	size_t hook;
>> +	enum landlock_hook_type last_type;
>> +	struct bpf_prog *new = prog;
>> +
>> +	/* allocate all the memory we need */
>> +	struct landlock_prog_list *new_list;
>> +
>> +	last_type = get_hook_type(new);
>> +
>> +	/* ignore duplicate programs */
> 
> This comment should be "don't allow" rather than "ignore", right?

Exactly, fixed.

> 
>> +	if (ref_domain) {
>> +		struct landlock_prog_list *ref;
>> +
>> +		hook = get_hook_index(get_hook_type(new));
>> +		for (ref = ref_domain->programs[hook]; ref;
>> +				ref = ref->prev) {
>> +			if (ref->prog == new)
>> +				return -EINVAL;
>> +		}
>> +	}
>> +
>> +	new = bpf_prog_inc(new);
>> +	if (IS_ERR(new)) {
>> +		err = PTR_ERR(new);
>> +		goto put_tmp_list;
>> +	}
>> +	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
>> +	if (!new_list) {
>> +		bpf_prog_put(new);
>> +		err = -ENOMEM;
>> +		goto put_tmp_list;
>> +	}
>> +	/* ignore Landlock types in this tmp_list */
>> +	new_list->prog = new;
>> +	new_list->prev = tmp_list;
>> +	refcount_set(&new_list->usage, 1);
>> +	tmp_list = new_list;
>> +
>> +	if (!tmp_list)
>> +		/* inform user space that this program was already added */
> 
> I'm not following this.  You just kzalloc'd new_list, pointed
> tmp_list to new_list, so how could tmp_list be NULL?  Was there
> a bad code reorg here, or am i being dense?

Indeed, this was introduce with a code refactoring (while removing a
program "chaining" concept) where this code snippet was in a loop, hence
the weird use of tmp_list. I'm cleaning this up (and simplifying this
whole code), and replacing the -EINVAL for the duplicate program check
with the -EEXIST.

Thanks!


> 
>> +		return -EEXIST;
>> +
>> +	/* properly store the list (without error cases) */
>> +	while (tmp_list) {
>> +		struct landlock_prog_list *new_list;
>> +
>> +		new_list = tmp_list;
>> +		tmp_list = tmp_list->prev;
>> +		/* do not increment the previous prog list usage */
>> +		hook = get_hook_index(get_hook_type(new_list->prog));
>> +		new_list->prev = init_domain->programs[hook];
>> +		/* no need to add from the last program to the first because
>> +		 * each of them are a different Landlock type */
>> +		smp_store_release(&init_domain->programs[hook], new_list);
>> +	}
>> +	return 0;
>> +
>> +put_tmp_list:
>> +	put_landlock_prog_list(tmp_list);
>> +	return err;
>> +}
>> +
>> +/* limit Landlock programs set to 256KB */
>> +#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
>> +
>> +/**
>> + * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
>> + *
>> + * Whatever is the result of this function call, you can call
>> + * bpf_prog_put(@prog) after.
>> + *
>> + * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
>> + *                  to prevent a concurrent put/free. This pointer must not be
>> + *                  freed after the call.
>> + * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
>> + *        be owned by landlock_prepend_prog() and freed if an error happened.
>> + *
>> + * Return @current_domain or a new pointer when OK. Return a pointer error
>> + * otherwise.
>> + */
>> +struct landlock_domain *landlock_prepend_prog(
>> +		struct landlock_domain *current_domain,
>> +		struct bpf_prog *prog)
>> +{
>> +	struct landlock_domain *new_domain = current_domain;
>> +	unsigned long pages;
>> +	int err;
>> +	size_t i;
>> +	struct landlock_domain tmp_domain = {};
>> +
>> +	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	/* validate memory size allocation */
>> +	pages = prog->pages;
>> +	if (current_domain) {
>> +		size_t i;
>> +
>> +		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
>> +			struct landlock_prog_list *walker_p;
>> +
>> +			for (walker_p = current_domain->programs[i];
>> +					walker_p; walker_p = walker_p->prev)
>> +				pages += walker_p->prog->pages;
>> +		}
>> +		/* count a struct landlock_domain if we need to allocate one */
>> +		if (refcount_read(&current_domain->usage) != 1)
>> +			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
>> +				/ PAGE_SIZE;
>> +	}
>> +	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
>> +		return ERR_PTR(-E2BIG);
>> +
>> +	/* ensure early that we can allocate enough memory for the new
>> +	 * prog_lists */
>> +	err = store_landlock_prog(&tmp_domain, current_domain, prog);
>> +	if (err)
>> +		return ERR_PTR(err);
>> +
>> +	/*
>> +	 * Each task_struct points to an array of prog list pointers.  These
>> +	 * tables are duplicated when additions are made (which means each
>> +	 * table needs to be refcounted for the processes using it). When a new
>> +	 * table is created, all the refcounters on the prog_list are bumped
>> +	 * (to track each table that references the prog). When a new prog is
>> +	 * added, it's just prepended to the list for the new table to point
>> +	 * at.
>> +	 *
>> +	 * Manage all the possible errors before this step to not uselessly
>> +	 * duplicate current_domain and avoid a rollback.
>> +	 */
>> +	if (!new_domain) {
>> +		/*
>> +		 * If there is no Landlock domain used by the current task,
>> +		 * then create a new one.
>> +		 */
>> +		new_domain = new_landlock_domain();
>> +		if (IS_ERR(new_domain))
>> +			goto put_tmp_lists;
>> +	} else if (refcount_read(&current_domain->usage) > 1) {
>> +		/*
>> +		 * If the current task is not the sole user of its Landlock
>> +		 * domain, then duplicate it.
>> +		 */
>> +		new_domain = new_landlock_domain();
>> +		if (IS_ERR(new_domain))
>> +			goto put_tmp_lists;
>> +		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
>> +			new_domain->programs[i] =
>> +				READ_ONCE(current_domain->programs[i]);
>> +			if (new_domain->programs[i])
>> +				refcount_inc(&new_domain->programs[i]->usage);
>> +		}
>> +
>> +		/*
>> +		 * Landlock domain from the current task will not be freed here
>> +		 * because the usage is strictly greater than 1. It is only
>> +		 * prevented to be freed by another task thanks to the caller
>> +		 * of landlock_prepend_prog() which should be locked if needed.
>> +		 */
>> +		landlock_put_domain(current_domain);
>> +	}
>> +
>> +	/* prepend tmp_domain to new_domain */
>> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
>> +		/* get the last new list */
>> +		struct landlock_prog_list *last_list =
>> +			tmp_domain.programs[i];
>> +
>> +		if (last_list) {
>> +			while (last_list->prev)
>> +				last_list = last_list->prev;
>> +			/* no need to increment usage (pointer replacement) */
>> +			last_list->prev = new_domain->programs[i];
>> +			new_domain->programs[i] = tmp_domain.programs[i];
>> +		}
>> +	}
>> +	return new_domain;
>> +
>> +put_tmp_lists:
>> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
>> +		put_landlock_prog_list(tmp_domain.programs[i]);
>> +	return new_domain;
>> +}
>> diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
>> new file mode 100644
>> index 000000000000..5b5b49f6e3e8
>> --- /dev/null
>> +++ b/security/landlock/domain_manage.h
>> @@ -0,0 +1,23 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Landlock LSM - domain management headers
>> + *
>> + * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
>> + * Copyright © 2018-2019 ANSSI
>> + */
>> +
>> +#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
>> +#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
>> +
>> +#include <linux/filter.h>
>> +
>> +#include "common.h"
>> +
>> +void landlock_get_domain(struct landlock_domain *dom);
>> +void landlock_put_domain(struct landlock_domain *dom);
>> +
>> +struct landlock_domain *landlock_prepend_prog(
>> +		struct landlock_domain *current_domain,
>> +		struct bpf_prog *prog);
>> +
>> +#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
>> -- 
>> 2.23.0
> 

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Christopher Lameter @ 2019-10-30 12:09 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Kirill A. Shutemov, linux-kernel, Alexey Dobriyan, Andrew Morton,
	Andy Lutomirski, Arnd Bergmann, Borislav Petkov, Dave Hansen,
	James Bottomley, Peter Zijlstra, Steven Rostedt, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, linux-api, linux-mm, x86,
	Mike Rapoport
In-Reply-To: <20191030071136.GA20624@rapoport-lnx>


On Wed, 30 Oct 2019, Mike Rapoport wrote:

> > /dev/securemem or so?
>
> A device driver will need to remove the secure area from the direct map and
> then we back to square one.

We have avoided the need for modifications to kernel core code. And its a
natural thing to treat this like special memory provided by a device
driver.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Peter Zijlstra @ 2019-10-30 10:04 UTC (permalink / raw)
  To: Edgecombe, Rick P
  Cc: adobriyan@gmail.com, linux-kernel@vger.kernel.org,
	rppt@kernel.org, rostedt@goodmis.org, jejb@linux.ibm.com,
	tglx@linutronix.de, linux-mm@kvack.org,
	dave.hansen@linux.intel.com, linux-api@vger.kernel.org,
	x86@kernel.org, akpm@linux-foundation.org, hpa@zytor.com,
	mingo@redhat.com, luto@kernel.org, kirill@shutemov.name,
	bp@alien8.de, rppt@linux.ibm.com
In-Reply-To: <69c57f7fa9a1be145827673b37beff155a3adc3c.camel@intel.com>

On Tue, Oct 29, 2019 at 05:27:43PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2019-10-28 at 22:00 +0100, Peter Zijlstra wrote:

> > That should be limited to the module range. Random data maps could
> > shatter the world.
> 
> BPF has one vmalloc space allocation for the byte code and one for the module
> space allocation for the JIT. Both get RO also set on the direct map alias of
> the pages, and reset RW when freed.

Argh, I didn't know they mapped the bytecode RO; why does it do that? It
can throw out the bytecode once it's JIT'ed.

> You mean shatter performance?

Shatter (all) large pages.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Mike Rapoport @ 2019-10-30  8:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: LKML, Alexey Dobriyan, Andrew Morton, Arnd Bergmann,
	Borislav Petkov, Dave Hansen, James Bottomley, Peter Zijlstra,
	Steven Rostedt, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Linux API, Linux-MM, X86 ML, Mike Rapoport
In-Reply-To: <CALCETrUuuc4DS0cdMBtS550Wkp0x9ND3M3SgtaMgyRROnDR5Kg@mail.gmail.com>

On Tue, Oct 29, 2019 at 10:00:55AM -0700, Andy Lutomirski wrote:
> On Tue, Oct 29, 2019 at 2:33 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Mon, Oct 28, 2019 at 02:44:23PM -0600, Andy Lutomirski wrote:
> > >
> > > > On Oct 27, 2019, at 4:17 AM, Mike Rapoport <rppt@kernel.org> wrote:
> > > >
> > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > >
> > > > Hi,
> > > >
> > > > The patch below aims to allow applications to create mappins that have
> > > > pages visible only to the owning process. Such mappings could be used to
> > > > store secrets so that these secrets are not visible neither to other
> > > > processes nor to the kernel.
> > > >
> > > > I've only tested the basic functionality, the changes should be verified
> > > > against THP/migration/compaction. Yet, I'd appreciate early feedback.
> > >
> > > I’ve contemplated the concept a fair amount, and I think you should
> > > consider a change to the API. In particular, rather than having it be a
> > > MAP_ flag, make it a chardev.  You can, at least at first, allow only
> > > MAP_SHARED, and admins can decide who gets to use it.  It might also play
> > > better with the VM overall, and you won’t need a VM_ flag for it — you
> > > can just wire up .fault to do the right thing.
> >
> > I think mmap()/mprotect()/madvise() are the natural APIs for such
> > interface.
> 
> Then you have a whole bunch of questions to answer.  For example:
> 
> What happens if you mprotect() or similar when the mapping is already
> in use in a way that's incompatible with MAP_EXCLUSIVE?

Then we refuse to mprotect()? Like in any other case when vm_flags are not
compatible with required madvise()/mprotect() operation.

> Is it actually reasonable to malloc() some memory and then make it exclusive?
> 
> Are you permitted to map a file MAP_EXCLUSIVE?  What does it mean?

I'd limit MAP_EXCLUSIVE only to anonymous memory.

> What does MAP_PRIVATE | MAP_EXCLUSIVE do?

My preference is to have only mmap() and then the semantics is more clear:

MAP_PRIVATE | MAP_EXCLUSIVE creates a pre-populated region, marks it locked
and drops the pages in this region from the direct map.
The pages are returned back on munmap(). 
Then there is no way to change an existing area to be exclusive or vice
versa.

> How does one pass exclusive memory via SCM_RIGHTS?  (If it's a
> memfd-like or chardev interface, it's trivial.  mmap(), not so much.)

Why passing such memory via SCM_RIGHTS would be useful?
 
> And finally, there's my personal giant pet peeve: a major use of this
> will be for virtualization.  I suspect that a lot of people would like
> the majority of KVM guest memory to be unmapped from the host
> pagetables.  But people might also like for guest memory to be
> unmapped in *QEMU's* pagetables, and mmap() is a basically worthless
> interface for this.  Getting fd-backed memory into a guest will take
> some possibly major work in the kernel, but getting vma-backed memory
> into a guest without mapping it in the host user address space seems
> much, much worse.

Well, in my view, the MAP_EXCLUSIVE is intended to keep small secrets
rather than use it for the entire guest memory. I even considered adding a
limit for the mapping size, but then I decided that since RLIMIT_MEMLOCK is
anyway enforced there is no need for a new one.

I agree that getting fd-backed memory into a guest would be less pain that
VMA, but KVM can already use memory outside the control of the kernel via
/dev/map [1].

So unless I'm missing something here, there is no need to use MAP_EXCLUSIVE
for the guest memory.

[1] https://lwn.net/Articles/778240/

> > Switching to a chardev doesn't solve the major problem of direct
> > map fragmentation and defeats the ability to use exclusive memory mappings
> > with the existing allocators, while mprotect() and madvise() do not.
> >
> 
> Will people really want to do malloc() and then remap it exclusive?
> This sounds dubiously useful at best.

Again, my preference is to have mmap() only, but I see a value in this use
case as well. Application developers allocate memory and then sometimes
change its properties rather than go mmap() something. For such usage
mprotect() may be usefull.


-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: David Hildenbrand @ 2019-10-30  8:19 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-kernel, Alexey Dobriyan, Andrew Morton, Andy Lutomirski,
	Arnd Bergmann, Borislav Petkov, Dave Hansen, James Bottomley,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, linux-api, linux-mm, x86, Mike Rapoport
In-Reply-To: <20191030081529.GB20624@rapoport-lnx>

On 30.10.19 09:15, Mike Rapoport wrote:
> On Tue, Oct 29, 2019 at 12:02:34PM +0100, David Hildenbrand wrote:
>> On 27.10.19 11:17, Mike Rapoport wrote:
>>> From: Mike Rapoport <rppt@linux.ibm.com>
>>>
>>> The mappings created with MAP_EXCLUSIVE are visible only in the context of
>>> the owning process and can be used by applications to store secret
>>> information that will not be visible not only to other processes but to the
>>> kernel as well.
>>>
>>> The pages in these mappings are removed from the kernel direct map and
>>> marked with PG_user_exclusive flag. When the exclusive area is unmapped,
>>> the pages are mapped back into the direct map.
>>>
>>
>> Just a thought, the kernel is still able to indirectly read the contents of
>> these pages by doing a kdump from kexec environment, right?
> 
> Right.
> 
>> Also, I wonder
>> what would happen if you map such pages via /dev/mem into another user space
>> application and e.g., use them along with kvm [1].
> 
> Do you mean that one application creates MAP_EXCLUSIVE and another
> applications accesses the same physical pages via /dev/mem?

Exactly.

> 
> With /dev/mem all physical memory is visible...

Okay, so the statement "information that will not be visible not only to 
other processes but to the kernel as well" is not correct. There are 
easy ways to access that information if you really want to (might 
require root permissions, though).

-- 

Thanks,

David / dhildenb

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Mike Rapoport @ 2019-10-30  8:15 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: linux-kernel, Alexey Dobriyan, Andrew Morton, Andy Lutomirski,
	Arnd Bergmann, Borislav Petkov, Dave Hansen, James Bottomley,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, linux-api, linux-mm, x86, Mike Rapoport
In-Reply-To: <085ed07e-e646-f7a4-0370-06f33a2a4e4a@redhat.com>

On Tue, Oct 29, 2019 at 12:02:34PM +0100, David Hildenbrand wrote:
> On 27.10.19 11:17, Mike Rapoport wrote:
> >From: Mike Rapoport <rppt@linux.ibm.com>
> >
> >The mappings created with MAP_EXCLUSIVE are visible only in the context of
> >the owning process and can be used by applications to store secret
> >information that will not be visible not only to other processes but to the
> >kernel as well.
> >
> >The pages in these mappings are removed from the kernel direct map and
> >marked with PG_user_exclusive flag. When the exclusive area is unmapped,
> >the pages are mapped back into the direct map.
> >
> 
> Just a thought, the kernel is still able to indirectly read the contents of
> these pages by doing a kdump from kexec environment, right?

Right.

> Also, I wonder
> what would happen if you map such pages via /dev/mem into another user space
> application and e.g., use them along with kvm [1].

Do you mean that one application creates MAP_EXCLUSIVE and another
applications accesses the same physical pages via /dev/mem? 

With /dev/mem all physical memory is visible...
 
> [1] https://lwn.net/Articles/778240/
> 
> -- 
> 
> Thanks,
> 
> David / dhildenb
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Mike Rapoport @ 2019-10-30  7:11 UTC (permalink / raw)
  To: Christopher Lameter
  Cc: Kirill A. Shutemov, linux-kernel, Alexey Dobriyan, Andrew Morton,
	Andy Lutomirski, Arnd Bergmann, Borislav Petkov, Dave Hansen,
	James Bottomley, Peter Zijlstra, Steven Rostedt, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, linux-api, linux-mm, x86,
	Mike Rapoport
In-Reply-To: <alpine.DEB.2.21.1910291011090.5411@www.lameter.com>

On Tue, Oct 29, 2019 at 10:12:04AM +0000, Christopher Lameter wrote:
> 
> 
> On Tue, 29 Oct 2019, Mike Rapoport wrote:
> 
> > I've talked with Thomas yesterday and he suggested something similar:
> >
> > When the MAP_EXCLUSIVE request comes for the first time, we allocate a huge
> > page for it and then use this page as a pool of 4K pages for subsequent
> > requests. Once this huge page is full we allocate a new one and append it
> > to the pool. When all the 4K pages that comprise the huge page are freed
> > the huge page is collapsed.
> 
> Or write a device driver that allows you to mmap a secure area and avoid
> all core kernel modifications?
> 
> /dev/securemem or so?

A device driver will need to remove the secure area from the direct map and
then we back to square one.
 
> It may exist already.
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: mbind() breaks its API definition since v5.2 by commit d883544515aa (mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified)
From: Yang Shi @ 2019-10-30  4:32 UTC (permalink / raw)
  To: Li Xinhai, linux-mm@kvack.org, akpm, torvalds
  Cc: Vlastimil Babka, Linux API, Michal Hocko, Hugh Dickins,
	linux-kernel@vger.kernel.org, lixinhai_lxh
In-Reply-To: <2019103011122763779044@gmail.com>



On 10/29/19 8:12 PM, Li Xinhai wrote:
> On 2019-10-30 at 10:50 Yang Shi wrote:
>>
>> On 10/29/19 7:27 PM, Li Xinhai wrote:
>>> One change in do_mbind() of this commit has suspicious usage of return value of
>>> queue_pages_range(), excerpt as below:
>>>
>>> ---
>>> @@ -1243,10 +1265,15 @@ static long do_mbind(unsigned long start, unsigned long len,
>>>     	if (err)
>>>     	goto mpol_out;
>>>    
>>> -	err = queue_pages_range(mm, start, end, nmask,
>>> +	ret = queue_pages_range(mm, start, end, nmask,
>>>     	 flags | MPOL_MF_INVERT, &pagelist);
>>> -	if (!err)
>>> -	err = mbind_range(mm, start, end, new);
>>> +
>>> +	if (ret < 0) {      /////// convert to all possible 'ret' to '-EIO' <<<<
>>> +	err = -EIO;
>>> +	goto up_out;
>>> +	}
>>> +
>>> +	err = mbind_range(mm, start, end, new);
>>>    
>>>     	if (!err) {
>>>     	int nr_failed = 0;
>>> ---
>>>
>>> Note that inside queue_pages_range(), the call to walk_page_range() may return
>>> errors from 'test_walk' of 'struct mm_walk_ops', e.g. -EFAULT. Now, those error
>>> codes are no longer reported to user space application.
>>>
>>>    From user space, the mbind() call need to reported error, with EFAULT, as example:
>>> EFAULT
>>> Part or all of the memory range specified by nodemask and maxnode points
>>> outside your accessible address space. Or, there was an unmapped hole in the
>>> specified memory range specified by addr and len.
>> Thanks for catching this. That commit was aimed to correct the return
>> values for some corner cases in mbind(), but it should not alter the
>> errno for other failure cases, i.e. -EFAULT.
>>
>> Could you please try the below patch (build test only)?
>>
>> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>> index 4ae967b..99df43a 100644
>> --- a/mm/mempolicy.c
>> +++ b/mm/mempolicy.c
>> @@ -1286,7 +1286,7 @@ static long do_mbind(unsigned long start, unsigned
>> long len,
>>                            flags | MPOL_MF_INVERT, &pagelist);
>>
>>          if (ret < 0) {
>> -               err = -EIO;
>> +               err = ret;
>>                  goto up_out;
>>          }
>>
>>
> This seems do not work, because the 'pagelist' would have some pages queued
> into it, need to put back those pages instead of return quickly.
>
> So, we need to remove this page leak as well. <<<<<<
>
> In my understanding, revert the changes as I quoted above may solve it, but not sure
> the details about changes at end of do_mbind(), should keep them at there without
> further change?

Thanks for pointing this out. We don't have to revert this commit to 
handle the non-empty pagelist correctly. The simplest way is to just put 
those pages back and I'm supposed this is also the preferred way since 
mbind_range() is not called to really apply the policy so those pages 
should not be migrated.

The below patch should solve this:

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ae967b..d80025c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1286,7 +1286,10 @@ static long do_mbind(unsigned long start, 
unsigned long len,
                           flags | MPOL_MF_INVERT, &pagelist);

         if (ret < 0) {
-               err = -EIO;
+               if (!list_empty(&pagelist))
+                       putback_movable_pages(&pagelist);
+
+               err = ret;
                 goto up_out;
         }

>
> - Xinhai
>
>>> Please correct me if this is the intended change(and will have updated API
>>> definition), or something was misunderstood.
>>>
>>> -Xinhai
> >

^ permalink raw reply related

* Re: [PATCH bpf-next v11 2/7] landlock: Add the management of domains
From: Serge E. Hallyn @ 2019-10-30  2:56 UTC (permalink / raw)
  To: Mickaël Salaün
  Cc: linux-kernel, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley
In-Reply-To: <20191029171505.6650-3-mic@digikod.net>

On Tue, Oct 29, 2019 at 06:15:00PM +0100, Mickaël Salaün wrote:
> A Landlock domain is a set of eBPF programs.  There is a list for each
> different program types that can be run on a specific Landlock hook
> (e.g. ptrace).  A domain is tied to a set of subjects (i.e. tasks).  A
> Landlock program should not try (nor be able) to infer which subject is
> currently enforced, but to have a unique security policy for all
> subjects tied to the same domain.  This make the reasoning much easier
> and help avoid pitfalls.
> 
> The next commits tie a domain to a task's credentials thanks to
> seccomp(2), but we could use cgroups or a security file-system to
> enforce a sysadmin-defined policy .
> 
> Signed-off-by: Mickaël Salaün <mic@digikod.net>
> Cc: Alexei Starovoitov <ast@kernel.org>
> Cc: Andy Lutomirski <luto@amacapital.net>
> Cc: Daniel Borkmann <daniel@iogearbox.net>
> Cc: James Morris <jmorris@namei.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Serge E. Hallyn <serge@hallyn.com>
> Cc: Will Drewry <wad@chromium.org>
> ---
> 
> Changes since v10:
> * rename files and names to clearly define a domain
> * create a standalone patch to ease review
> ---

[...]

> +/**
> + * store_landlock_prog - prepend and deduplicate a Landlock prog_list
> + *
> + * Prepend @prog to @init_domain while ignoring @prog if they are already in
> + * @ref_domain.  Whatever is the result of this function call, you can call
> + * bpf_prog_put(@prog) after.
> + *
> + * @init_domain: empty domain to prepend to
> + * @ref_domain: domain to check for duplicate programs
> + * @prog: program to prepend
> + *
> + * Return -errno on error or 0 if @prog was successfully stored.
> + */
> +static int store_landlock_prog(struct landlock_domain *init_domain,
> +		const struct landlock_domain *ref_domain,
> +		struct bpf_prog *prog)
> +{
> +	struct landlock_prog_list *tmp_list = NULL;
> +	int err;
> +	size_t hook;
> +	enum landlock_hook_type last_type;
> +	struct bpf_prog *new = prog;
> +
> +	/* allocate all the memory we need */
> +	struct landlock_prog_list *new_list;
> +
> +	last_type = get_hook_type(new);
> +
> +	/* ignore duplicate programs */

This comment should be "don't allow" rather than "ignore", right?

> +	if (ref_domain) {
> +		struct landlock_prog_list *ref;
> +
> +		hook = get_hook_index(get_hook_type(new));
> +		for (ref = ref_domain->programs[hook]; ref;
> +				ref = ref->prev) {
> +			if (ref->prog == new)
> +				return -EINVAL;
> +		}
> +	}
> +
> +	new = bpf_prog_inc(new);
> +	if (IS_ERR(new)) {
> +		err = PTR_ERR(new);
> +		goto put_tmp_list;
> +	}
> +	new_list = kzalloc(sizeof(*new_list), GFP_KERNEL);
> +	if (!new_list) {
> +		bpf_prog_put(new);
> +		err = -ENOMEM;
> +		goto put_tmp_list;
> +	}
> +	/* ignore Landlock types in this tmp_list */
> +	new_list->prog = new;
> +	new_list->prev = tmp_list;
> +	refcount_set(&new_list->usage, 1);
> +	tmp_list = new_list;
> +
> +	if (!tmp_list)
> +		/* inform user space that this program was already added */

I'm not following this.  You just kzalloc'd new_list, pointed
tmp_list to new_list, so how could tmp_list be NULL?  Was there
a bad code reorg here, or am i being dense?

> +		return -EEXIST;
> +
> +	/* properly store the list (without error cases) */
> +	while (tmp_list) {
> +		struct landlock_prog_list *new_list;
> +
> +		new_list = tmp_list;
> +		tmp_list = tmp_list->prev;
> +		/* do not increment the previous prog list usage */
> +		hook = get_hook_index(get_hook_type(new_list->prog));
> +		new_list->prev = init_domain->programs[hook];
> +		/* no need to add from the last program to the first because
> +		 * each of them are a different Landlock type */
> +		smp_store_release(&init_domain->programs[hook], new_list);
> +	}
> +	return 0;
> +
> +put_tmp_list:
> +	put_landlock_prog_list(tmp_list);
> +	return err;
> +}
> +
> +/* limit Landlock programs set to 256KB */
> +#define LANDLOCK_PROGRAMS_MAX_PAGES (1 << 6)
> +
> +/**
> + * landlock_prepend_prog - attach a Landlock prog_list to @current_domain
> + *
> + * Whatever is the result of this function call, you can call
> + * bpf_prog_put(@prog) after.
> + *
> + * @current_domain: landlock_domain pointer, must be (RCU-)locked (if needed)
> + *                  to prevent a concurrent put/free. This pointer must not be
> + *                  freed after the call.
> + * @prog: non-NULL Landlock prog_list to prepend to @current_domain. @prog will
> + *        be owned by landlock_prepend_prog() and freed if an error happened.
> + *
> + * Return @current_domain or a new pointer when OK. Return a pointer error
> + * otherwise.
> + */
> +struct landlock_domain *landlock_prepend_prog(
> +		struct landlock_domain *current_domain,
> +		struct bpf_prog *prog)
> +{
> +	struct landlock_domain *new_domain = current_domain;
> +	unsigned long pages;
> +	int err;
> +	size_t i;
> +	struct landlock_domain tmp_domain = {};
> +
> +	if (prog->type != BPF_PROG_TYPE_LANDLOCK_HOOK)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* validate memory size allocation */
> +	pages = prog->pages;
> +	if (current_domain) {
> +		size_t i;
> +
> +		for (i = 0; i < ARRAY_SIZE(current_domain->programs); i++) {
> +			struct landlock_prog_list *walker_p;
> +
> +			for (walker_p = current_domain->programs[i];
> +					walker_p; walker_p = walker_p->prev)
> +				pages += walker_p->prog->pages;
> +		}
> +		/* count a struct landlock_domain if we need to allocate one */
> +		if (refcount_read(&current_domain->usage) != 1)
> +			pages += round_up(sizeof(*current_domain), PAGE_SIZE)
> +				/ PAGE_SIZE;
> +	}
> +	if (pages > LANDLOCK_PROGRAMS_MAX_PAGES)
> +		return ERR_PTR(-E2BIG);
> +
> +	/* ensure early that we can allocate enough memory for the new
> +	 * prog_lists */
> +	err = store_landlock_prog(&tmp_domain, current_domain, prog);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	/*
> +	 * Each task_struct points to an array of prog list pointers.  These
> +	 * tables are duplicated when additions are made (which means each
> +	 * table needs to be refcounted for the processes using it). When a new
> +	 * table is created, all the refcounters on the prog_list are bumped
> +	 * (to track each table that references the prog). When a new prog is
> +	 * added, it's just prepended to the list for the new table to point
> +	 * at.
> +	 *
> +	 * Manage all the possible errors before this step to not uselessly
> +	 * duplicate current_domain and avoid a rollback.
> +	 */
> +	if (!new_domain) {
> +		/*
> +		 * If there is no Landlock domain used by the current task,
> +		 * then create a new one.
> +		 */
> +		new_domain = new_landlock_domain();
> +		if (IS_ERR(new_domain))
> +			goto put_tmp_lists;
> +	} else if (refcount_read(&current_domain->usage) > 1) {
> +		/*
> +		 * If the current task is not the sole user of its Landlock
> +		 * domain, then duplicate it.
> +		 */
> +		new_domain = new_landlock_domain();
> +		if (IS_ERR(new_domain))
> +			goto put_tmp_lists;
> +		for (i = 0; i < ARRAY_SIZE(new_domain->programs); i++) {
> +			new_domain->programs[i] =
> +				READ_ONCE(current_domain->programs[i]);
> +			if (new_domain->programs[i])
> +				refcount_inc(&new_domain->programs[i]->usage);
> +		}
> +
> +		/*
> +		 * Landlock domain from the current task will not be freed here
> +		 * because the usage is strictly greater than 1. It is only
> +		 * prevented to be freed by another task thanks to the caller
> +		 * of landlock_prepend_prog() which should be locked if needed.
> +		 */
> +		landlock_put_domain(current_domain);
> +	}
> +
> +	/* prepend tmp_domain to new_domain */
> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++) {
> +		/* get the last new list */
> +		struct landlock_prog_list *last_list =
> +			tmp_domain.programs[i];
> +
> +		if (last_list) {
> +			while (last_list->prev)
> +				last_list = last_list->prev;
> +			/* no need to increment usage (pointer replacement) */
> +			last_list->prev = new_domain->programs[i];
> +			new_domain->programs[i] = tmp_domain.programs[i];
> +		}
> +	}
> +	return new_domain;
> +
> +put_tmp_lists:
> +	for (i = 0; i < ARRAY_SIZE(tmp_domain.programs); i++)
> +		put_landlock_prog_list(tmp_domain.programs[i]);
> +	return new_domain;
> +}
> diff --git a/security/landlock/domain_manage.h b/security/landlock/domain_manage.h
> new file mode 100644
> index 000000000000..5b5b49f6e3e8
> --- /dev/null
> +++ b/security/landlock/domain_manage.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Landlock LSM - domain management headers
> + *
> + * Copyright © 2016-2019 Mickaël Salaün <mic@digikod.net>
> + * Copyright © 2018-2019 ANSSI
> + */
> +
> +#ifndef _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
> +#define _SECURITY_LANDLOCK_DOMAIN_MANAGE_H
> +
> +#include <linux/filter.h>
> +
> +#include "common.h"
> +
> +void landlock_get_domain(struct landlock_domain *dom);
> +void landlock_put_domain(struct landlock_domain *dom);
> +
> +struct landlock_domain *landlock_prepend_prog(
> +		struct landlock_domain *current_domain,
> +		struct bpf_prog *prog);
> +
> +#endif /* _SECURITY_LANDLOCK_DOMAIN_MANAGE_H */
> -- 
> 2.23.0

^ permalink raw reply

* Re: mbind() breaks its API definition since v5.2 by commit d883544515aa (mm: mempolicy: make the behavior consistent when MPOL_MF_MOVE* and MPOL_MF_STRICT were specified)
From: Yang Shi @ 2019-10-30  2:50 UTC (permalink / raw)
  To: Li Xinhai, linux-mm, akpm, torvalds
  Cc: Vlastimil Babka, Linux API, Michal Hocko, Hugh Dickins,
	linux-kernel@vger.kernel.org, lixinhai_lxh
In-Reply-To: <2019103010274679257634@gmail.com>



On 10/29/19 7:27 PM, Li Xinhai wrote:
> One change in do_mbind() of this commit has suspicious usage of return value of
> queue_pages_range(), excerpt as below:
>
> ---
> @@ -1243,10 +1265,15 @@ static long do_mbind(unsigned long start, unsigned long len,
>   	if (err)
>   		goto mpol_out;
>   
> -	err = queue_pages_range(mm, start, end, nmask,
> +	ret = queue_pages_range(mm, start, end, nmask,
>   			  flags | MPOL_MF_INVERT, &pagelist);
> -	if (!err)
> -		err = mbind_range(mm, start, end, new);
> +
> +	if (ret < 0) {      /////// convert to all possible 'ret' to '-EIO' <<<<
> +		err = -EIO;
> +		goto up_out;
> +	}
> +
> +	err = mbind_range(mm, start, end, new);
>   
>   	if (!err) {
>   		int nr_failed = 0;
> ---
>
> Note that inside queue_pages_range(), the call to walk_page_range() may return
> errors from 'test_walk' of 'struct mm_walk_ops', e.g. -EFAULT. Now, those error
> codes are no longer reported to user space application.
>
>  From user space, the mbind() call need to reported error, with EFAULT, as example:
> EFAULT
> Part or all of the memory range specified by nodemask and maxnode points
> outside your accessible address space. Or, there was an unmapped hole in the
> specified memory range specified by addr and len.

Thanks for catching this. That commit was aimed to correct the return 
values for some corner cases in mbind(), but it should not alter the 
errno for other failure cases, i.e. -EFAULT.

Could you please try the below patch (build test only)?

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ae967b..99df43a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1286,7 +1286,7 @@ static long do_mbind(unsigned long start, unsigned 
long len,
                           flags | MPOL_MF_INVERT, &pagelist);

         if (ret < 0) {
-               err = -EIO;
+               err = ret;
                 goto up_out;
         }


>
> Please correct me if this is the intended change(and will have updated API
> definition), or something was misunderstood.
>
> -Xinhai

^ permalink raw reply related

* [PATCH 4/4] docs: fs-verity: mention statx() support
From: Eric Biggers @ 2019-10-29 20:41 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Ts'o, linux-api, linux-f2fs-devel, David Howells,
	linux-fsdevel, Jaegeuk Kim, linux-ext4, Victor Hsieh
In-Reply-To: <20191029204141.145309-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Document that the statx() system call can now be used to check whether a
file is a verity file.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 Documentation/filesystems/fsverity.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/filesystems/fsverity.rst b/Documentation/filesystems/fsverity.rst
index 42a0b6dd9e0b68..3355377a24398d 100644
--- a/Documentation/filesystems/fsverity.rst
+++ b/Documentation/filesystems/fsverity.rst
@@ -226,6 +226,14 @@ To do so, check for FS_VERITY_FL (0x00100000) in the returned flags.
 The verity flag is not settable via FS_IOC_SETFLAGS.  You must use
 FS_IOC_ENABLE_VERITY instead, since parameters must be provided.
 
+statx
+-----
+
+Since Linux v5.5, the statx() system call sets STATX_ATTR_VERITY if
+the file has fs-verity enabled.  This can perform better than
+FS_IOC_GETFLAGS and FS_IOC_MEASURE_VERITY because it doesn't require
+opening the file, and opening verity files can be expensive.
+
 Accessing verity files
 ======================
 
-- 
2.24.0.rc1.363.gb1bccd3e3d-goog

^ permalink raw reply related

* [PATCH 3/4] f2fs: support STATX_ATTR_VERITY
From: Eric Biggers @ 2019-10-29 20:41 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Ts'o, linux-api, linux-f2fs-devel, David Howells,
	linux-fsdevel, Jaegeuk Kim, linux-ext4, Victor Hsieh
In-Reply-To: <20191029204141.145309-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Set the STATX_ATTR_VERITY bit when the statx() system call is used on a
verity file on f2fs.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/f2fs/file.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 29bc0a542759a2..6a2e5b7d8fc74c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -726,11 +726,14 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 		stat->attributes |= STATX_ATTR_IMMUTABLE;
 	if (flags & F2FS_NODUMP_FL)
 		stat->attributes |= STATX_ATTR_NODUMP;
+	if (IS_VERITY(inode))
+		stat->attributes |= STATX_ATTR_VERITY;
 
 	stat->attributes_mask |= (STATX_ATTR_APPEND |
 				  STATX_ATTR_ENCRYPTED |
 				  STATX_ATTR_IMMUTABLE |
-				  STATX_ATTR_NODUMP);
+				  STATX_ATTR_NODUMP |
+				  STATX_ATTR_VERITY);
 
 	generic_fillattr(inode, stat);
 
-- 
2.24.0.rc1.363.gb1bccd3e3d-goog

^ permalink raw reply related

* [PATCH 2/4] ext4: support STATX_ATTR_VERITY
From: Eric Biggers @ 2019-10-29 20:41 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Ts'o, linux-api, linux-f2fs-devel, David Howells,
	linux-fsdevel, Jaegeuk Kim, linux-ext4, Victor Hsieh
In-Reply-To: <20191029204141.145309-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Set the STATX_ATTR_VERITY bit when the statx() system call is used on a
verity file on ext4.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 fs/ext4/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 516faa280ceda8..a7ca6517798008 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5717,12 +5717,15 @@ int ext4_getattr(const struct path *path, struct kstat *stat,
 		stat->attributes |= STATX_ATTR_IMMUTABLE;
 	if (flags & EXT4_NODUMP_FL)
 		stat->attributes |= STATX_ATTR_NODUMP;
+	if (flags & EXT4_VERITY_FL)
+		stat->attributes |= STATX_ATTR_VERITY;
 
 	stat->attributes_mask |= (STATX_ATTR_APPEND |
 				  STATX_ATTR_COMPRESSED |
 				  STATX_ATTR_ENCRYPTED |
 				  STATX_ATTR_IMMUTABLE |
-				  STATX_ATTR_NODUMP);
+				  STATX_ATTR_NODUMP |
+				  STATX_ATTR_VERITY);
 
 	generic_fillattr(inode, stat);
 	return 0;
-- 
2.24.0.rc1.363.gb1bccd3e3d-goog

^ permalink raw reply related

* [PATCH 1/4] statx: define STATX_ATTR_VERITY
From: Eric Biggers @ 2019-10-29 20:41 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Ts'o, linux-api, linux-f2fs-devel, David Howells,
	linux-fsdevel, Jaegeuk Kim, linux-ext4, Victor Hsieh
In-Reply-To: <20191029204141.145309-1-ebiggers@kernel.org>

From: Eric Biggers <ebiggers@google.com>

Add a statx attribute bit STATX_ATTR_VERITY which will be set if the
file has fs-verity enabled.  This is the statx() equivalent of
FS_VERITY_FL which is returned by FS_IOC_GETFLAGS.

This is useful because it allows applications to check whether a file is
a verity file without opening it.  Opening a verity file can be
expensive because the fsverity_info is set up on open, which involves
parsing metadata and optionally verifying a cryptographic signature.

This is analogous to how various other bits are exposed through both
FS_IOC_GETFLAGS and statx(), e.g. the encrypt bit.

Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 include/linux/stat.h      | 3 ++-
 include/uapi/linux/stat.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/stat.h b/include/linux/stat.h
index 765573dc17d659..528c4baad09146 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -33,7 +33,8 @@ struct kstat {
 	 STATX_ATTR_IMMUTABLE |				\
 	 STATX_ATTR_APPEND |				\
 	 STATX_ATTR_NODUMP |				\
-	 STATX_ATTR_ENCRYPTED				\
+	 STATX_ATTR_ENCRYPTED |				\
+	 STATX_ATTR_VERITY				\
 	 )/* Attrs corresponding to FS_*_FL flags */
 	u64		ino;
 	dev_t		dev;
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 7b35e98d3c58b1..ad80a5c885d598 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -167,8 +167,8 @@ struct statx {
 #define STATX_ATTR_APPEND		0x00000020 /* [I] File is append-only */
 #define STATX_ATTR_NODUMP		0x00000040 /* [I] File is not to be dumped */
 #define STATX_ATTR_ENCRYPTED		0x00000800 /* [I] File requires key to decrypt in fs */
-
 #define STATX_ATTR_AUTOMOUNT		0x00001000 /* Dir: Automount trigger */
+#define STATX_ATTR_VERITY		0x00100000 /* [I] Verity protected file */
 
 
 #endif /* _UAPI_LINUX_STAT_H */
-- 
2.24.0.rc1.363.gb1bccd3e3d-goog

^ permalink raw reply related

* [PATCH 0/4] statx: expose the fs-verity bit
From: Eric Biggers @ 2019-10-29 20:41 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: Theodore Ts'o, linux-api, linux-f2fs-devel, David Howells,
	linux-fsdevel, Jaegeuk Kim, linux-ext4, Victor Hsieh

This patchset exposes the verity bit (a.k.a. FS_VERITY_FL) via statx().

This is useful because it allows applications to check whether a file is
a verity file without opening it.  Opening a verity file can be
expensive because the fsverity_info is set up on open, which involves
parsing metadata and optionally verifying a cryptographic signature.

This is analogous to how various other bits are exposed through both
FS_IOC_GETFLAGS and statx(), e.g. the encrypt bit.

This patchset applies to v5.4-rc5.

Eric Biggers (4):
  statx: define STATX_ATTR_VERITY
  ext4: support STATX_ATTR_VERITY
  f2fs: support STATX_ATTR_VERITY
  docs: fs-verity: mention statx() support

 Documentation/filesystems/fsverity.rst | 8 ++++++++
 fs/ext4/inode.c                        | 5 ++++-
 fs/f2fs/file.c                         | 5 ++++-
 include/linux/stat.h                   | 3 ++-
 include/uapi/linux/stat.h              | 2 +-
 5 files changed, 19 insertions(+), 4 deletions(-)

-- 
2.24.0.rc1.363.gb1bccd3e3d-goog

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Dave Hansen @ 2019-10-29 20:07 UTC (permalink / raw)
  To: Dan Williams, Kirill A. Shutemov
  Cc: Mike Rapoport, Linux Kernel Mailing List, Alexey Dobriyan,
	Andrew Morton, Andy Lutomirski, Arnd Bergmann, Borislav Petkov,
	Dave Hansen, James Bottomley, Peter Zijlstra, Steven Rostedt,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Linux API, linux-mm,
	the arch/x86 maintainers, Mike Rapoport
In-Reply-To: <CAPcyv4hDPdmHxhMF753Jt5Dk6V9bTAkGqzkyYHCiG6xowT4Ncg@mail.gmail.com>

On 10/29/19 12:43 PM, Dan Williams wrote:
>> But some CPUs don't like to have two TLB entries for the same memory with
>> different sizes at the same time. See for instance AMD erratum 383.
> That basic description would seem to defeat most (all?) interesting
> huge page use cases. For example dax makes no attempt to make sure
> aliased mappings of pmem are the same size between the direct map that
> the driver uses, and userspace dax mappings. So I assume there are
> more details than "all aliased mappings must be the same size".

These are about when large and small TLB entries could be held in the
TLB at the same time for the same virtual address in the same process.
It doesn't matter that two *different* mappings are using different page
size.

Imagine you were *just* changing the page size.  Without these errata,
you could just skip flushing the TLB.  You might use the old hardware
page size for a while, but it will be functionally OK.  With these
errata, we need to ensure in software that the old TLB entries for the
old page size are flushed before the new page size is established.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Dan Williams @ 2019-10-29 19:43 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Mike Rapoport, Linux Kernel Mailing List, Alexey Dobriyan,
	Andrew Morton, Andy Lutomirski, Arnd Bergmann, Borislav Petkov,
	Dave Hansen, James Bottomley, Peter Zijlstra, Steven Rostedt,
	Thomas Gleixner, Ingo Molnar, H. Peter Anvin, Linux API, linux-mm,
	the arch/x86 maintainers, Mike Rapoport
In-Reply-To: <20191029064318.s4n4gidlfjun3d47@box>

On Mon, Oct 28, 2019 at 11:43 PM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> On Mon, Oct 28, 2019 at 10:43:51PM -0700, Dan Williams wrote:
> > On Mon, Oct 28, 2019 at 6:16 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > >
> > > On Mon, Oct 28, 2019 at 02:00:19PM +0100, Mike Rapoport wrote:
> > > > On Mon, Oct 28, 2019 at 03:31:24PM +0300, Kirill A. Shutemov wrote:
> > > > > On Sun, Oct 27, 2019 at 12:17:32PM +0200, Mike Rapoport wrote:
> > > > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > > >
> > > > > > The mappings created with MAP_EXCLUSIVE are visible only in the context of
> > > > > > the owning process and can be used by applications to store secret
> > > > > > information that will not be visible not only to other processes but to the
> > > > > > kernel as well.
> > > > > >
> > > > > > The pages in these mappings are removed from the kernel direct map and
> > > > > > marked with PG_user_exclusive flag. When the exclusive area is unmapped,
> > > > > > the pages are mapped back into the direct map.
> > > > >
> > > > > I probably blind, but I don't see where you manipulate direct map...
> > > >
> > > > __get_user_pages() calls __set_page_user_exclusive() which in turn calls
> > > > set_direct_map_invalid_noflush() that makes the page not present.
> > >
> > > Ah. okay.
> > >
> > > I think active use of this feature will lead to performance degradation of
> > > the system with time.
> > >
> > > Setting a single 4k page non-present in the direct mapping will require
> > > splitting 2M or 1G page we usually map direct mapping with. And it's one
> > > way road. We don't have any mechanism to map the memory with huge page
> > > again after the application has freed the page.
> > >
> > > It might be okay if all these pages cluster together, but I don't think we
> > > have a way to achieve it easily.
> >
> > Still, it would be worth exploring what that would look like if not
> > for MAP_EXCLUSIVE then set_mce_nospec() that wants to punch out poison
> > pages from the direct map. In the case of pmem, where those pages are
> > able to be repaired, it would be nice to also repair the mapping
> > granularity of the direct map.
>
> The solution has to consist of two parts: finding a range to collapse and
> actually collapsing the range into a huge page.
>
> Finding the collapsible range will likely require background scanning of
> the direct mapping as we do for THP with khugepaged. It should not too
> hard, but likely require long and tedious tuning to be effective, but not
> too disturbing for the system.
>
> Alternatively, after any changes to the direct mapping, we can initiate
> checking if the range is collapsible. Up to 1G around the changed 4k.
> It might be more taxing than scanning if direct mapping changes often.
>
> Collapsing itself appears to be simple: re-check if the range is
> collapsible under the lock, replace the page table with the huge page and
> flush the TLB.
>
> But some CPUs don't like to have two TLB entries for the same memory with
> different sizes at the same time. See for instance AMD erratum 383.

That basic description would seem to defeat most (all?) interesting
huge page use cases. For example dax makes no attempt to make sure
aliased mappings of pmem are the same size between the direct map that
the driver uses, and userspace dax mappings. So I assume there are
more details than "all aliased mappings must be the same size".

> Getting it right would require making the range not present, flush TLB and
> only then install huge page. That's what we do for userspace.
>
> It will not fly for the direct mapping. There is no reasonable way to
> exclude other CPU from accessing the range while it's not present (call
> stop_machine()? :P). Moreover, the range may contain the code that doing
> the collapse or data required for it...

At least for pmem all the access points can be controlled. pmem is
never used for kernel text at least in the dax mode where it is
accessed via file-backed shared mappings, or the pmem driver. So when
I say "direct-map repair" I mean the incidental direct-map that pmem
uses since it maps pmem with arch_add_memory(), not the typical DRAM
direct-map that may house kernel text. Poison consumed from the kernel
DRAM direct-map is fatal, poison consumed from dax mappings and the
pmem driver path is recoverable and repairable.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Andy Lutomirski @ 2019-10-29 18:10 UTC (permalink / raw)
  To: James Bottomley
  Cc: Andy Lutomirski, Reshetova, Elena, Mike Rapoport,
	linux-kernel@vger.kernel.org, Alexey Dobriyan, Andrew Morton,
	Arnd Bergmann, Borislav Petkov, Dave Hansen, Peter Zijlstra,
	Steven Rostedt, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	linux-api@vger.kernel.org, linux-mm@kvack.org, x86@kernel.org,
	Mike Rapoport, Tycho Andersen
In-Reply-To: <1572371012.4812.19.camel@linux.ibm.com>

On Tue, Oct 29, 2019 at 10:44 AM James Bottomley <jejb@linux.ibm.com> wrote:
>
> On Tue, 2019-10-29 at 10:03 -0700, Andy Lutomirski wrote:
> > On Tue, Oct 29, 2019 at 4:25 AM Reshetova, Elena
> > <elena.reshetova@intel.com> wrote:
> > >
> > > > The patch below aims to allow applications to create mappins that
> > > > have
> > > > pages visible only to the owning process. Such mappings could be
> > > > used to
> > > > store secrets so that these secrets are not visible neither to
> > > > other
> > > > processes nor to the kernel.
> > >
> > > Hi Mike,
> > >
> > > I have actually been looking into the closely related problem for
> > > the past
> > > couple of weeks (on and off). What is common here is the need for
> > > userspace
> > > to indicate to kernel that some pages contain secrets. And then
> > > there are
> > > actually a number of things that kernel can do to try to protect
> > > these secrets
> > > better. Unmap from direct map is one of them. Another thing is to
> > > map such
> > > pages as non-cached, which can help us to prevent or considerably
> > > restrict
> > > speculation on such pages. The initial proof of concept for marking
> > > pages as
> > > "UNCACHED" that I got from Dave Hansen was actually based on
> > > mlock2()
> > > and a new flag for it for this purpose. Since then I have been
> > > thinking on what
> > > interface suits the use case better and actually selected going
> > > with new madvise()
> > > flag instead because of all possible implications for fragmentation
> > > and performance.
> >
> > Doing all of this with MAP_SECRET seems bad to me.  If user code
> > wants UC memory, it should ask for UC memory -- having the kernel
> > involved in the decision to use UC memory is a bad idea, because the
> > performance impact of using UC memory where user code wasn't
> > expecting it wil be so bad that the system might as well not work at
> > all.  (For kicks, I once added a sysctl to turn off caching in
> > CR0.  I enabled it in gnome-shell.  The system slowed down to such an
> > extent that I was unable to enter the three or so keystrokes to turn
> > it back off.)
> >
> > EXCLUSIVE makes sense.  Saying "don't ptrace this" makes sense.  UC
> > makes sense.  But having one flag to rule them all does not make
> > sense to me.
>
> So this is a usability problem.  We have a memory flag that can be used
> for "secrecy" for some userspace value of the word and we have a load
> of internal properties depending on how the hardware works, including
> potentially some hardware additions like SEV or TME, that can be used
> to implement the property.  If we expose our hardware vagaries, the
> user is really not going to know what to do ... and we have a limited
> number of flags to express this, so it stands to reason that we need to
> define "secrecy" for the user and then implement it using whatever
> flags we have.  So I think no ptrace and no direct map make sense for
> pretty much any value of "secrecy".  The UC bit seems to be an attempt
> to prevent exfiltration via L1TF or other cache side channels, so it
> looks like it should only be applied if the side channel mitigations
> aren't active ... which would tend to indicate it's a kernel decision
> as well.

I just don't think this will work in practice.  Someone will say "hey,
let's keep this giant buffer we do crypto from, or maybe even the
entire data area of some critical service, secret".  It will work
*fine* at first.  But then some kernel config changes and we can't do
DMA, and now it breaks on some configs.  Someone else will say "hey, I
don't have L1TF or whatever mitigation, let's turn on UC", and
everything goes to hell.

IMO the kernel should attempt to keep *all memory* secret.  Specific
applications that want greater levels of secrecy should opt in to more
expensive things.  Here's what's already on the table:

Exclusive / XPFO / XPO: allocation might be extremely expensive.
Overuse might hurt performance due to huge page fragmentation  DMA may
not work.  Otherwise it's peachy.

SEV: Works only in some contexts.  The current kernel implementation
is, IMO, unacceptable to the extent that I wish I could go back in
time and NAK it.

TME: it's on or it's off.  There's no room for a MAP_ flag here.

MKTME: of highly dubious value here.  The only useful thing here I can
thing it would be a MAP_NOTSECRET to opt *out* of encryption for a
specific range.  Other than that, it has all the same performance
implications that EXCLUSIVE has.

UC: Performance hit is extreme.  *Also* has the perf implications of
exclusive.  I can't imagine this making any sense except were the user
application is written in the expectation that UC might be used so
that the access patterns would be reasonable.

WC: Same issues as UC plus memory ordering issues such that
unsuspecting applications will corrupt data.

Trying to bundle these together with kernel- or admin-only config
seems like a lost cause.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: James Bottomley @ 2019-10-29 17:43 UTC (permalink / raw)
  To: Andy Lutomirski, Reshetova, Elena
  Cc: Mike Rapoport, linux-kernel@vger.kernel.org, Alexey Dobriyan,
	Andrew Morton, Arnd Bergmann, Borislav Petkov, Dave Hansen,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, linux-api@vger.kernel.org, linux-mm@kvack.org,
	x86@kernel.org, Mike Rapoport, Tycho Andersen, Alan Cox
In-Reply-To: <CALCETrWN9kc+10tf7YoBp9ixqkO_KZ=b1E_cFBr_Uogxhu68PQ@mail.gmail.com>

On Tue, 2019-10-29 at 10:03 -0700, Andy Lutomirski wrote:
> On Tue, Oct 29, 2019 at 4:25 AM Reshetova, Elena
> <elena.reshetova@intel.com> wrote:
> > 
> > > The patch below aims to allow applications to create mappins that
> > > have
> > > pages visible only to the owning process. Such mappings could be
> > > used to
> > > store secrets so that these secrets are not visible neither to
> > > other
> > > processes nor to the kernel.
> > 
> > Hi Mike,
> > 
> > I have actually been looking into the closely related problem for
> > the past
> > couple of weeks (on and off). What is common here is the need for
> > userspace
> > to indicate to kernel that some pages contain secrets. And then
> > there are
> > actually a number of things that kernel can do to try to protect
> > these secrets
> > better. Unmap from direct map is one of them. Another thing is to
> > map such
> > pages as non-cached, which can help us to prevent or considerably
> > restrict
> > speculation on such pages. The initial proof of concept for marking
> > pages as
> > "UNCACHED" that I got from Dave Hansen was actually based on
> > mlock2()
> > and a new flag for it for this purpose. Since then I have been
> > thinking on what
> > interface suits the use case better and actually selected going
> > with new madvise()
> > flag instead because of all possible implications for fragmentation
> > and performance.
> 
> Doing all of this with MAP_SECRET seems bad to me.  If user code
> wants UC memory, it should ask for UC memory -- having the kernel
> involved in the decision to use UC memory is a bad idea, because the
> performance impact of using UC memory where user code wasn't
> expecting it wil be so bad that the system might as well not work at
> all.  (For kicks, I once added a sysctl to turn off caching in
> CR0.  I enabled it in gnome-shell.  The system slowed down to such an
> extent that I was unable to enter the three or so keystrokes to turn
> it back off.)
> 
> EXCLUSIVE makes sense.  Saying "don't ptrace this" makes sense.  UC
> makes sense.  But having one flag to rule them all does not make
> sense to me.

So this is a usability problem.  We have a memory flag that can be used
for "secrecy" for some userspace value of the word and we have a load
of internal properties depending on how the hardware works, including
potentially some hardware additions like SEV or TME, that can be used
to implement the property.  If we expose our hardware vagaries, the
user is really not going to know what to do ... and we have a limited
number of flags to express this, so it stands to reason that we need to
define "secrecy" for the user and then implement it using whatever
flags we have.  So I think no ptrace and no direct map make sense for
pretty much any value of "secrecy".  The UC bit seems to be an attempt
to prevent exfiltration via L1TF or other cache side channels, so it
looks like it should only be applied if the side channel mitigations
aren't active ... which would tend to indicate it's a kernel decision
as well.

In the use case in my head, I'd like MAP_EXCLUSIVE to mean the data in
the user memory is difficult to exfiltrate for another tenant in a
virtual system, even if they break containment, so effectively I want
it protected against kernel exploitation and root in the host ... and I
suppose I need to acknowledge that "protected" means best effort
available on the platform, not no attacker can ever extract this.

James

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Alan Cox @ 2019-10-29 17:37 UTC (permalink / raw)
  To: Andy Lutomirski, Reshetova, Elena
  Cc: Mike Rapoport, linux-kernel@vger.kernel.org, Alexey Dobriyan,
	Andrew Morton, Arnd Bergmann, Borislav Petkov, Dave Hansen,
	James Bottomley, Peter Zijlstra, Steven Rostedt, Thomas Gleixner,
	Ingo Molnar, H. Peter Anvin, linux-api@vger.kernel.org,
	linux-mm@kvack.org, x86@kernel.org, Mike Rapoport, Tycho Andersen
In-Reply-To: <CALCETrWN9kc+10tf7YoBp9ixqkO_KZ=b1E_cFBr_Uogxhu68PQ@mail.gmail.com>

> Doing all of this with MAP_SECRET seems bad to me.  If user code
> wants
> UC memory, it should ask for UC memory -- having the kernel involved
> in the decision to use UC memory is a bad idea, because the
> performance impact of using UC memory where user code wasn't
> expecting

The user has no idea that they want UC memory. It varies by platform
what this means. There are some systems (eg in order uclinux devices,
M68K, old atoms) for which it probably means 'no-op', there are those
where UC helps, those it hinders, there are those where WC is probably
sufficient. There are platforms where 'secret' memory might best be
implemented by using on die memory pools or cache locking. It might
even mean 'put me in a non HT cgroup'.

Secret might also mean 'not accessible by thunderbolt', or 'do not swap
unless swap is encrypted' and other things.

IMHO the question is what is the actual semantic here. What are you
asking for ? Does it mean "at any cost", what does it guarantee (100%
or statistically), what level of guarantee is acceptable, what level is
-EOPNOTSUPP or similar ?

I'm also wary of the focus always being on keys. If you decrypt a file
I'm probably just as interested in the contents so can I mmap a file
this way and if so what happens on the unmap. Yes key theft lets me do
all sorts of theoretical long term bad stuff, but frequently data theft
is sufficient to do lots of practical short term bad stuff. Also as an
attacker I'm probably a script, and I don't want to be exposing my
master long term because they want the footprints gone.

> in gnome-shell.  The system slowed down to such an extent that I was
> unable to enter the three or so keystrokes to turn it back off.)

Yes - and any uncached pages also need to be kept away from anything
that the kernel touches under locks, or use in atomic user operations
stuff. Copy on write of an uncached page for example is suddenly really
slow and there are so many other cases we'd have to find and deal with.

> EXCLUSIVE makes sense.  Saying "don't ptrace this" makes sense.  UC
> makes sense.  But having one flag to rule them all does not make
> sense
> to me.

We already support not ptracing, and if I can ptrace any of the code I
can access all of its code/data so that one isn't hard and the LSM
interfaces can do it. That one is easy - minus the fact that malware
writers are big fans of anything that stops tracing...

Alan

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Edgecombe, Rick P @ 2019-10-29 17:27 UTC (permalink / raw)
  To: peterz@infradead.org
  Cc: adobriyan@gmail.com, linux-kernel@vger.kernel.org,
	rppt@kernel.org, rostedt@goodmis.org, jejb@linux.ibm.com,
	tglx@linutronix.de, linux-mm@kvack.org,
	dave.hansen@linux.intel.com, linux-api@vger.kernel.org,
	x86@kernel.org, akpm@linux-foundation.org, hpa@zytor.com,
	mingo@redhat.com, luto@kernel.org, kirill@shutemov.name,
	bp@alien8.de, rppt@linux.ibm.com
In-Reply-To: <20191028210052.GM4643@worktop.programming.kicks-ass.net>

On Mon, 2019-10-28 at 22:00 +0100, Peter Zijlstra wrote:
> On Mon, Oct 28, 2019 at 07:59:25PM +0000, Edgecombe, Rick P wrote:
> > On Mon, 2019-10-28 at 14:55 +0100, Peter Zijlstra wrote:
> > > On Mon, Oct 28, 2019 at 04:16:23PM +0300, Kirill A. Shutemov wrote:
> > > 
> > > > I think active use of this feature will lead to performance degradation
> > > > of
> > > > the system with time.
> > > > 
> > > > Setting a single 4k page non-present in the direct mapping will require
> > > > splitting 2M or 1G page we usually map direct mapping with. And it's one
> > > > way road. We don't have any mechanism to map the memory with huge page
> > > > again after the application has freed the page.
> > > 
> > > Right, we recently had a 'bug' where ftrace triggered something like
> > > this and facebook ran into it as a performance regression. So yes, this
> > > is a real concern.
> > 
> > Don't e/cBPF filters also break the direct map down to 4k pages when calling
> > set_memory_ro() on the filter for 64 bit x86 and arm?
> > 
> > I've been wondering if the page allocator should make some effort to find a
> > broken down page for anything that can be known will have direct map
> > permissions
> > changed (or if it already groups them somehow). But also, why any potential
> > slowdown of 4k pages on the direct map hasn't been noticed for apps that do
> > a
> > lot of insertions and removals of BPF filters, if this is indeed the case.
> 
> That should be limited to the module range. Random data maps could
> shatter the world.

BPF has one vmalloc space allocation for the byte code and one for the module
space allocation for the JIT. Both get RO also set on the direct map alias of
the pages, and reset RW when freed.

You mean shatter performance?

^ permalink raw reply

* [PATCH bpf-next v11 7/7] landlock: Add user and kernel documentation for Landlock
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

This documentation can be built with the Sphinx framework.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* replace the filesystem hooks with the ptrace one
* remove the triggers
* update example
* add documenation for Landlock domains and seccomp interaction
* reference more kernel documenation (e.g. LSM hooks)

Changes since v9:
* update with expected attach type and expected attach triggers

Changes since v8:
* remove documentation related to chaining and tagging according to this
  patch series

Changes since v7:
* update documentation according to the Landlock revamp

Changes since v6:
* add a check for ctx->event
* rename BPF_PROG_TYPE_LANDLOCK to BPF_PROG_TYPE_LANDLOCK_RULE
* rename Landlock version to ABI to better reflect its purpose and add a
  dedicated changelog section
* update tables
* relax no_new_privs recommendations
* remove ABILITY_WRITE related functions
* reword rule "appending" to "prepending" and explain it
* cosmetic fixes

Changes since v5:
* update the rule hierarchy inheritance explanation
* briefly explain ctx->arg2
* add ptrace restrictions
* explain EPERM
* update example (subtype)
* use ":manpage:"
---
 Documentation/security/index.rst           |   1 +
 Documentation/security/landlock/index.rst  |  22 ++++
 Documentation/security/landlock/kernel.rst | 139 ++++++++++++++++++++
 Documentation/security/landlock/user.rst   | 142 +++++++++++++++++++++
 4 files changed, 304 insertions(+)
 create mode 100644 Documentation/security/landlock/index.rst
 create mode 100644 Documentation/security/landlock/kernel.rst
 create mode 100644 Documentation/security/landlock/user.rst

diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index fc503dd689a7..4d213e76ddf4 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -15,3 +15,4 @@ Security Documentation
    self-protection
    siphash
    tpm/index
+   landlock/index
diff --git a/Documentation/security/landlock/index.rst b/Documentation/security/landlock/index.rst
new file mode 100644
index 000000000000..1eced757b05d
--- /dev/null
+++ b/Documentation/security/landlock/index.rst
@@ -0,0 +1,22 @@
+=========================================
+Landlock LSM: programmatic access control
+=========================================
+
+:Author: Mickaël Salaün
+
+Landlock is a stackable Linux Security Module (LSM) that makes it possible to
+create security sandboxes, programmable access-controls or safe endpoint
+security agents.  This kind of sandbox is expected to help mitigate the
+security impact of bugs or unexpected/malicious behaviors in user-space
+applications.  The current version allows only a process with the global
+CAP_SYS_ADMIN capability to create such sandboxes but the ultimate goal of
+Landlock is to empower any process, including unprivileged ones, to securely
+restrict themselves.  Landlock is inspired by seccomp-bpf but instead of
+filtering syscalls and their raw arguments, a Landlock rule can inspect the use
+of kernel objects like processes and hence make a decision according to the
+kernel semantic.
+
+.. toctree::
+
+    user
+    kernel
diff --git a/Documentation/security/landlock/kernel.rst b/Documentation/security/landlock/kernel.rst
new file mode 100644
index 000000000000..0be906f92c3e
--- /dev/null
+++ b/Documentation/security/landlock/kernel.rst
@@ -0,0 +1,139 @@
+==============================
+Landlock: kernel documentation
+==============================
+
+eBPF properties
+===============
+
+To get an expressive language while still being safe and small, Landlock is
+based on eBPF. Landlock should be usable by untrusted processes and must
+therefore expose a minimal attack surface. The eBPF bytecode is minimal,
+powerful, widely used and designed to be used by untrusted applications. Thus,
+reusing the eBPF support in the kernel enables a generic approach while
+minimizing new code.
+
+An eBPF program has access to an eBPF context containing some fields used to
+inspect the current object. These arguments may be used directly (e.g. raw
+value) or passed to helper functions according to their types (e.g. pointer).
+It is then possible to do complex access checks without race conditions or
+inconsistent evaluation (i.e.  `incorrect mirroring of the OS code and state
+<https://www.ndss-symposium.org/ndss2003/traps-and-pitfalls-practical-problems-system-call-interposition-based-security-tools/>`_).
+
+A Landlock hook describes a particular access type.  For now, there is one hook
+dedicated to ptrace related operations: BPF_LANDLOCK_PTRACE.  A Landlock
+program is tied to one hook.  This makes it possible to statically check
+context accesses, potentially performed by such program, and hence prevents
+kernel address leaks and ensure the right use of hook arguments with eBPF
+functions.  Any user can add multiple Landlock programs per Landlock hook.
+They are stacked and evaluated one after the other, starting from the most
+recent program, as seccomp-bpf does with its filters.  Underneath, a hook is an
+abstraction over a set of LSM hooks.
+
+
+Guiding principles
+==================
+
+Unprivileged use
+----------------
+
+* Landlock helpers and context should be usable by any unprivileged and
+  untrusted program while following the system security policy enforced by
+  other access control mechanisms (e.g. DAC, LSM), even if a global
+  CAP_SYS_ADMIN is currently required.
+
+
+Landlock hook and context
+-------------------------
+
+* A Landlock hook shall be focused on access control on kernel objects instead
+  of syscall filtering (i.e. syscall arguments), which is the purpose of
+  seccomp-bpf.
+* A Landlock context provided by a hook shall express the minimal and more
+  generic interface to control an access for a kernel object.
+* A hook shall guaranty that all the BPF function calls from a program are
+  safe.  Thus, the related Landlock context arguments shall always be of the
+  same type for a particular hook.  For example, a network hook could share
+  helpers with a file hook because of UNIX socket.  However, the same helpers
+  may not be compatible for a file system handle and a net handle.
+* Multiple hooks may use the same context interface.
+
+
+Landlock helpers
+----------------
+
+* Landlock helpers shall be as generic as possible while at the same time being
+  as simple as possible and following the syscall creation principles (cf.
+  *Documentation/adding-syscalls.txt*).
+* The only behavior change allowed on a helper is to fix a (logical) bug to
+  match the initial semantic.
+* Helpers shall be reentrant, i.e. only take inputs from arguments (e.g. from
+  the BPF context), to enable a hook to use a cache.  Future program options
+  might change this cache behavior.
+* It is quite easy to add new helpers to extend Landlock.  The main concern
+  should be about the possibility to leak information from the kernel that may
+  not be accessible otherwise (i.e. side-channel attack).
+
+
+Landlock domain
+===============
+
+A Landlock domain is a set of eBPF programs.  There is a list for each
+different program types that can be run on a specific Landlock hook (e.g.
+ptrace).  A domain is tied to a set of subjects (i.e. tasks).
+
+A Landlock program should not try (nor be able) to infer which subject is
+currently enforced, but to have a unique security policy for all subjects tied
+to the same domain.  This make the reasoning much easier and help avoid
+pitfalls.
+
+.. kernel-doc:: security/landlock/common.h
+    :functions: landlock_domain
+
+.. kernel-doc:: security/landlock/domain_manage.c
+    :functions: landlock_prepend_prog
+
+
+Adding a Landlock program with seccomp
+--------------------------------------
+
+The :manpage:`seccomp(2)` syscall can be used with the
+`SECCOMP_PREPEND_LANDLOCK_PROG` operation to prepend a Landlock program to the
+current task's domain.
+
+.. kernel-doc:: security/landlock/domain_syscall.c
+    :functions: landlock_seccomp_prepend_prog
+
+
+Running a list of Landlock programs
+-----------------------------------
+
+.. kernel-doc:: security/landlock/bpf_run.c
+    :functions: landlock_access_denied
+
+
+LSM hooks
+=========
+
+.. kernel-doc:: security/landlock/hooks_ptrace.c
+    :functions: hook_ptrace_access_check
+
+.. kernel-doc:: security/landlock/hooks_ptrace.c
+    :functions: hook_ptrace_traceme
+
+
+Questions and answers
+=====================
+
+Why a program does not return an errno or a kill code?
+------------------------------------------------------
+
+seccomp filters can return multiple kind of code, including an errno value or a
+kill signal, which may be convenient for access control.  Those return codes
+are hardwired in the userland ABI.  Instead, Landlock's approach is to return a
+bitmask to allow or deny an action, which is much simpler and more generic.
+Moreover, we do not really have a choice because, unlike to seccomp, Landlock
+programs are not enforced at the syscall entry point but may be executed at any
+point in the kernel (through LSM hooks) where an errno return code may not make
+sense.  However, with this simple ABI and with the ability to call helpers,
+Landlock may gain features similar to seccomp-bpf in the future while being
+compatible with previous programs.
diff --git a/Documentation/security/landlock/user.rst b/Documentation/security/landlock/user.rst
new file mode 100644
index 000000000000..e7aa9a260a86
--- /dev/null
+++ b/Documentation/security/landlock/user.rst
@@ -0,0 +1,142 @@
+================================
+Landlock: userland documentation
+================================
+
+Landlock programs
+=================
+
+eBPF programs are used to create security programs.  They are contained and can
+call only a whitelist of dedicated functions. Moreover, they can only loop
+under strict conditions, which protects from denial of service.  More
+information on BPF can be found in *Documentation/networking/filter.txt*.
+
+
+Writing a program
+-----------------
+
+To enforce a security policy, a thread first needs to create a Landlock
+program.  The easiest way to write an eBPF program depicting a security program
+is to write it in the C language.  As described in *samples/bpf/README.rst*,
+LLVM can compile such programs.  A simple eBPF program can also be written by
+hand has done in *tools/testing/selftests/landlock/*.
+
+Once the eBPF program is created, the next step is to create the metadata
+describing the Landlock program.  This metadata includes an expected attach
+type which contains the hook type to which the program is tied.
+
+A hook is a policy decision point which exposes the same context type for
+each program evaluation.
+
+A Landlock hook describes the kind of kernel object for which a program will be
+triggered to allow or deny an action.  For example, the hook
+BPF_LANDLOCK_PTRACE can be triggered every time a landlocked thread performs a
+set of action related to debugging (cf. :manpage:`ptrace(2)`) or if the kernel
+needs to know if a process manipulation requested by something else is
+legitimate.
+
+The next step is to fill a :c:type:`struct bpf_load_program_attr
+<bpf_load_program_attr>` with BPF_PROG_TYPE_LANDLOCK_HOOK, the expected attach
+type and other BPF program metadata.  This bpf_attr must then be passed to the
+:manpage:`bpf(2)` syscall alongside the BPF_PROG_LOAD command.  If everything
+is deemed correct by the kernel, the thread gets a file descriptor referring to
+this program.
+
+In the following code, the *insn* variable is an array of BPF instructions
+which can be extracted from an ELF file as is done in bpf_load_file() from
+*samples/bpf/bpf_load.c*.
+
+.. code-block:: c
+
+    int prog_fd;
+    struct bpf_load_program_attr load_attr;
+
+    memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
+    load_attr.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK;
+    load_attr.expected_attach_type = BPF_LANDLOCK_PTRACE;
+    load_attr.insns = insns;
+    load_attr.insns_cnt = sizeof(insn) / sizeof(struct bpf_insn);
+    load_attr.license = "GPL";
+
+    prog_fd = bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
+    if (prog_fd == -1)
+        exit(1);
+
+
+Enforcing a program
+-------------------
+
+Once the Landlock program has been created or received (e.g. through a UNIX
+socket), the thread willing to sandbox itself (and its future children) should
+perform the following two steps.
+
+The thread should first request to never be allowed to get new privileges with a
+call to :manpage:`prctl(2)` and the PR_SET_NO_NEW_PRIVS option.  More
+information can be found in *Documentation/prctl/no_new_privs.txt*.
+
+.. code-block:: c
+
+    if (prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0))
+        exit(1);
+
+A thread can apply a program to itself by using the :manpage:`seccomp(2)` syscall.
+The operation is SECCOMP_PREPEND_LANDLOCK_PROG, the flags must be empty and the
+*args* argument must point to a valid Landlock program file descriptor.
+
+.. code-block:: c
+
+    if (seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, &fd))
+        exit(1);
+
+If the syscall succeeds, the program is now enforced on the calling thread and
+will be enforced on all its subsequently created children of the thread as
+well.  Once a thread is landlocked, there is no way to remove this security
+policy, only stacking more restrictions is allowed.  The program evaluation is
+performed from the newest to the oldest.
+
+When a syscall ask for an action on a kernel object, if this action is denied,
+then an EACCES errno code is returned through the syscall.
+
+
+.. _inherited_programs:
+
+Inherited programs
+------------------
+
+Every new thread resulting from a :manpage:`clone(2)` inherits Landlock program
+restrictions from its parent.  This is similar to the seccomp inheritance as
+described in *Documentation/prctl/seccomp_filter.txt* or any other LSM dealing
+with task's :manpage:`credentials(7)`.
+
+
+Ptrace restrictions
+-------------------
+
+A sandboxed process has less privileges than a non-sandboxed process and must
+then be subject to additional restrictions when manipulating another process.
+To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
+process, a sandboxed process should have a subset of the target process
+programs.  This security policy can easily be implemented like in
+*tools/testing/selftests/landlock/test_ptrace.c*.
+
+
+Landlock structures and constants
+=================================
+
+Contexts
+--------
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :functions: landlock_context_ptrace
+
+
+Return types
+------------
+
+.. kernel-doc:: include/uapi/linux/landlock.h
+    :functions: landlock_ret
+
+
+Additional documentation
+========================
+
+See https://landlock.io
-- 
2.23.0

^ permalink raw reply related

* [PATCH bpf-next v11 6/7] bpf,landlock: Add tests for the Landlock ptrace program type
From: Mickaël Salaün @ 2019-10-29 17:15 UTC (permalink / raw)
  To: linux-kernel
  Cc: Mickaël Salaün, Alexei Starovoitov, Andy Lutomirski,
	Casey Schaufler, Daniel Borkmann, David Drysdale, Florent Revest,
	James Morris, Jann Horn, John Johansen, Jonathan Corbet,
	Kees Cook, KP Singh, Michael Kerrisk, Mickaël Salaün,
	Paul Moore, Sargun Dhillon, Serge E . Hallyn, Shuah Khan,
	Stephen Smalley
In-Reply-To: <20191029171505.6650-1-mic@digikod.net>

Test eBPF program context access and ptrace hooks semantic.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Morris <jmorris@namei.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Serge E. Hallyn <serge@hallyn.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Will Drewry <wad@chromium.org>
---

Changes since v10:
* rework tests with new Landlock ptrace programs which restrict ptrace
  thanks to the task_landlock_ptrace_ancestor() helper
* simplify ptrace tests (make expect_ptrace implicit)
* add tests:
  * check a child process tracing its parent
  * check Landlock domain without ptrace enforcement (e.g. useful for
    audit/signaling purpose)
  * check inherited-only domains
  * check task pointer arithmetic
* fix flaky test for multi-core
* increase log size
* cosmetic renames
* update and improve the Makefile

Changes since v9:
* replace subtype with expected_attach_type and expected_attach_triggers
* rename inode_map_lookup() into inode_map_lookup_elem()
* check for inode map entry without value (which is now possible thanks
  to the pointer null check)
* use read-only inode map for Landlock programs

Changes since v8:
* update eBPF include path for macros
* use TEST_GEN_PROGS and use the generic "clean" target
* add more verbose errors
* update the bpf/verifier files
* remove chain tests (from landlock and bpf/verifier)
* replace the whitelist tests with blacklist tests (because of stateless
  Landlock programs): remove "dotdot" tests and other depth tests
* sync the landlock Makefile with its bpf sibling directory and use
  bpf_load_program_xattr()

Changes since v7:
* update tests and add new ones for filesystem hierarchy and Landlock
  chains.

Changes since v6:
* use the new kselftest_harness.h
* use const variables
* replace ASSERT_STEP with ASSERT_*
* rename BPF_PROG_TYPE_LANDLOCK to BPF_PROG_TYPE_LANDLOCK_RULE
* force sample library rebuild
* fix install target

Changes since v5:
* add subtype test
* add ptrace tests
* split and rename files
* cleanup and rebase
---
 scripts/bpf_helpers_doc.py                    |   1 +
 tools/include/uapi/linux/bpf.h                |  23 +-
 tools/include/uapi/linux/landlock.h           |  22 ++
 tools/lib/bpf/libbpf_probes.c                 |   3 +
 tools/testing/selftests/bpf/config            |   3 +
 tools/testing/selftests/bpf/test_verifier.c   |   1 +
 .../testing/selftests/bpf/verifier/landlock.c |  56 +++++
 tools/testing/selftests/landlock/.gitignore   |   5 +
 tools/testing/selftests/landlock/Makefile     |  27 +++
 tools/testing/selftests/landlock/config       |   5 +
 tools/testing/selftests/landlock/test.h       |  48 ++++
 tools/testing/selftests/landlock/test_base.c  |  24 ++
 .../testing/selftests/landlock/test_ptrace.c  | 210 ++++++++++++++++++
 13 files changed, 427 insertions(+), 1 deletion(-)
 create mode 100644 tools/include/uapi/linux/landlock.h
 create mode 100644 tools/testing/selftests/bpf/verifier/landlock.c
 create mode 100644 tools/testing/selftests/landlock/.gitignore
 create mode 100644 tools/testing/selftests/landlock/Makefile
 create mode 100644 tools/testing/selftests/landlock/config
 create mode 100644 tools/testing/selftests/landlock/test.h
 create mode 100644 tools/testing/selftests/landlock/test_base.c
 create mode 100644 tools/testing/selftests/landlock/test_ptrace.c

diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 7548569e8076..8e4c0fe75663 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -466,6 +466,7 @@ class PrinterHelpers(Printer):
             'const struct sk_buff': 'const struct __sk_buff',
             'struct sk_msg_buff': 'struct sk_msg_md',
             'struct xdp_buff': 'struct xdp_md',
+            'struct task_struct': 'void',
     }
 
     def print_header(self):
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4af8b0819a32..c88436b97163 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -173,6 +173,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_CGROUP_SYSCTL,
 	BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
 	BPF_PROG_TYPE_CGROUP_SOCKOPT,
+	BPF_PROG_TYPE_LANDLOCK_HOOK,
 };
 
 enum bpf_attach_type {
@@ -199,6 +200,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UDP6_RECVMSG,
 	BPF_CGROUP_GETSOCKOPT,
 	BPF_CGROUP_SETSOCKOPT,
+	BPF_LANDLOCK_PTRACE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -2775,6 +2777,24 @@ union bpf_attr {
  * 		restricted to raw_tracepoint bpf programs.
  * 	Return
  * 		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_task_landlock_ptrace_ancestor(struct task_struct *parent, struct task_struct *child)
+ *	Description
+ *		Check the relation of a potentially parent task with a child
+ *		one, according to their Landlock ptrace hook programs.
+ *	Return
+ *		**-EINVAL** if the child's ptrace programs are not comparable
+ *		to the parent ones, i.e. one of them is an empty set.
+ *
+ *		**-ENOENT** if the parent's ptrace programs are either in a
+ *		separate hierarchy of the child ones, or if the parent's ptrace
+ *		programs are a superset of the child ones.
+ *
+ *		0 if the parent's ptrace programs are the same as the child
+ *		ones.
+ *
+ *		1 if the parent's ptrace programs are indeed a subset of the
+ *		child ones.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2888,7 +2908,8 @@ union bpf_attr {
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
 	FN(tcp_gen_syncookie),		\
-	FN(skb_output),
+	FN(skb_output),			\
+	FN(task_landlock_ptrace_ancestor),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/include/uapi/linux/landlock.h b/tools/include/uapi/linux/landlock.h
new file mode 100644
index 000000000000..3db2d190c4e7
--- /dev/null
+++ b/tools/include/uapi/linux/landlock.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Landlock - UAPI headers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2018-2019 ANSSI
+ */
+
+#ifndef _UAPI__LINUX_LANDLOCK_H__
+#define _UAPI__LINUX_LANDLOCK_H__
+
+#include <linux/types.h>
+
+#define LANDLOCK_RET_ALLOW	0
+#define LANDLOCK_RET_DENY	1
+
+struct landlock_context_ptrace {
+	__u64 tracer;
+	__u64 tracee;
+};
+
+#endif /* _UAPI__LINUX_LANDLOCK_H__ */
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 4b0b0364f5fc..1e0d6346a7c7 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -78,6 +78,9 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_KPROBE:
 		xattr.kern_version = get_kernel_version();
 		break;
+	case BPF_PROG_TYPE_LANDLOCK_HOOK:
+		xattr.expected_attach_type = BPF_LANDLOCK_PTRACE;
+		break;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 5dc109f4c097..3161a88a6059 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -35,3 +35,6 @@ CONFIG_MPLS_ROUTING=m
 CONFIG_MPLS_IPTUNNEL=m
 CONFIG_IPV6_SIT=m
 CONFIG_BPF_JIT=y
+CONFIG_SECCOMP_FILTER=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_LANDLOCK=y
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d27fd929abb9..74f249dafc0b 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -30,6 +30,7 @@
 #include <linux/bpf.h>
 #include <linux/if_ether.h>
 #include <linux/btf.h>
+#include <linux/landlock.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
diff --git a/tools/testing/selftests/bpf/verifier/landlock.c b/tools/testing/selftests/bpf/verifier/landlock.c
new file mode 100644
index 000000000000..59cd333745dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/landlock.c
@@ -0,0 +1,56 @@
+{
+	"landlock/ptrace: always accept",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+},
+{
+	"landlock/ptrace: forbid arbitrary return value",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV32_IMM(BPF_REG_0, 2),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "At program exit the register R0 has value (0x2; 0x0) should have been in (0x0; 0x1)",
+},
+{
+	"landlock/ptrace: read context and call dedicated helper",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_task_landlock_ptrace_ancestor),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+},
+{
+	"landlock/ptrace: forbid pointer arithmetic",
+	.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK,
+	.expected_attach_type = BPF_LANDLOCK_PTRACE,
+	.insns = {
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracee)),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	},
+	.result = REJECT,
+	.errstr = "R1 pointer arithmetic on task prohibited",
+},
diff --git a/tools/testing/selftests/landlock/.gitignore b/tools/testing/selftests/landlock/.gitignore
new file mode 100644
index 000000000000..4c5c01d23fe0
--- /dev/null
+++ b/tools/testing/selftests/landlock/.gitignore
@@ -0,0 +1,5 @@
+/feature
+/fixdep
+/*libbpf*
+/test_base
+/test_ptrace
diff --git a/tools/testing/selftests/landlock/Makefile b/tools/testing/selftests/landlock/Makefile
new file mode 100644
index 000000000000..2da77c30e77f
--- /dev/null
+++ b/tools/testing/selftests/landlock/Makefile
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+
+LIBDIR := $(abspath ../../../lib)
+BPFDIR := $(LIBDIR)/bpf
+TOOLSDIR := $(abspath ../../../include)
+APIDIR := $(TOOLSDIR)/uapi
+
+CFLAGS += -g -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(TOOLSDIR)
+LDLIBS += -lelf
+
+test_src = $(wildcard test_*.c)
+
+TEST_GEN_PROGS := $(test_src:.c=)
+
+include ../lib.mk
+
+BPFOBJ := $(OUTPUT)/libbpf.a
+
+$(TEST_GEN_PROGS): $(BPFOBJ) ../kselftest_harness.h
+
+.PHONY: force
+
+# force a rebuild of BPFOBJ when its dependencies are updated
+force:
+
+$(BPFOBJ): force
+	$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
diff --git a/tools/testing/selftests/landlock/config b/tools/testing/selftests/landlock/config
new file mode 100644
index 000000000000..fa5081b840ad
--- /dev/null
+++ b/tools/testing/selftests/landlock/config
@@ -0,0 +1,5 @@
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_SECCOMP_FILTER=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_LANDLOCK=y
diff --git a/tools/testing/selftests/landlock/test.h b/tools/testing/selftests/landlock/test.h
new file mode 100644
index 000000000000..836df68b6bb8
--- /dev/null
+++ b/tools/testing/selftests/landlock/test.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Landlock helpers
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <linux/filter.h>
+#include <linux/landlock.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+
+#include "../kselftest_harness.h"
+#include "../../../../samples/bpf/bpf_load.h"
+
+#ifndef SECCOMP_PREPEND_LANDLOCK_PROG
+#define SECCOMP_PREPEND_LANDLOCK_PROG	4
+#endif
+
+#ifndef seccomp
+static int __attribute__((unused)) seccomp(unsigned int op, unsigned int flags,
+		void *args)
+{
+	errno = 0;
+	return syscall(__NR_seccomp, op, flags, args);
+}
+#endif
+
+static int __attribute__((unused)) ll_bpf_load_program(
+		const struct bpf_insn *bpf_insns, size_t insns_len,
+		char *log_buf, size_t log_buf_sz,
+		const enum bpf_attach_type attach_type)
+{
+	struct bpf_load_program_attr load_attr;
+
+	memset(&load_attr, 0, sizeof(struct bpf_load_program_attr));
+	load_attr.prog_type = BPF_PROG_TYPE_LANDLOCK_HOOK;
+	load_attr.expected_attach_type = attach_type;
+	load_attr.insns = bpf_insns;
+	load_attr.insns_cnt = insns_len / sizeof(struct bpf_insn);
+	load_attr.license = "GPL";
+
+	return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz);
+}
diff --git a/tools/testing/selftests/landlock/test_base.c b/tools/testing/selftests/landlock/test_base.c
new file mode 100644
index 000000000000..db46f39048cb
--- /dev/null
+++ b/tools/testing/selftests/landlock/test_base.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - base
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+
+#include "test.h"
+
+TEST(seccomp_landlock)
+{
+	int ret;
+
+	ret = seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, NULL);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EFAULT, errno) {
+		TH_LOG("Kernel does not support CONFIG_SECURITY_LANDLOCK");
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/landlock/test_ptrace.c b/tools/testing/selftests/landlock/test_ptrace.c
new file mode 100644
index 000000000000..f4ee67126394
--- /dev/null
+++ b/tools/testing/selftests/landlock/test_ptrace.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Landlock tests - ptrace
+ *
+ * Copyright © 2017-2019 Mickaël Salaün <mic@digikod.net>
+ * Copyright © 2019 ANSSI
+ */
+
+#define _GNU_SOURCE
+#include <signal.h> /* raise */
+#include <sys/ptrace.h>
+#include <sys/types.h> /* waitpid */
+#include <sys/wait.h> /* waitpid */
+#include <unistd.h> /* fork, pipe */
+
+#include "test.h"
+
+#define LOG_SIZE 512
+
+static void create_domain(struct __test_metadata *_metadata,
+		bool scoped_ptrace, bool inherited_only)
+{
+	const struct bpf_insn prog_void[] = {
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_ALLOW),
+		BPF_EXIT_INSN(),
+	};
+	const struct bpf_insn prog_check[] = {
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracer)),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6,
+			offsetof(struct landlock_context_ptrace, tracee)),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_task_landlock_ptrace_ancestor),
+		/* if @tracee is an ancestor or at the same level of @tracer,
+		 * then allow ptrace (warning: do not use BPF_JGE 0) */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, inherited_only ? 0 : 1, 2),
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_DENY),
+		BPF_EXIT_INSN(),
+		BPF_MOV32_IMM(BPF_REG_0, LANDLOCK_RET_ALLOW),
+		BPF_EXIT_INSN(),
+	};
+	int prog;
+	char log[LOG_SIZE] = "";
+
+	if (scoped_ptrace)
+		prog = ll_bpf_load_program(prog_check, sizeof(prog_check),
+				log, sizeof(log), BPF_LANDLOCK_PTRACE);
+	else
+		prog = ll_bpf_load_program(prog_void, sizeof(prog_void),
+				log, sizeof(log), BPF_LANDLOCK_PTRACE);
+	ASSERT_NE(-1, prog) {
+		TH_LOG("Failed to load the %s program: %s\n%s",
+				scoped_ptrace ? "check" : "void",
+				strerror(errno), log);
+	}
+	ASSERT_EQ(0, seccomp(SECCOMP_PREPEND_LANDLOCK_PROG, 0, &prog)) {
+		TH_LOG("Failed to create a Landlock domain: %s", strerror(errno));
+	}
+	EXPECT_EQ(0, close(prog));
+}
+
+/* test PTRACE_TRACEME and PTRACE_ATTACH for parent and child */
+static void _check_ptrace(struct __test_metadata *_metadata,
+		bool scoped_ptrace, bool domain_both,
+		bool domain_parent, bool domain_child)
+{
+	pid_t child, parent;
+	int status;
+	int pipe_child[2], pipe_parent[2];
+	char buf_parent;
+	const bool inherited_only = domain_both && !domain_parent && !domain_child;
+
+	parent = getpid();
+
+	ASSERT_EQ(0, pipe(pipe_child));
+	ASSERT_EQ(0, pipe(pipe_parent));
+	if (domain_both)
+		create_domain(_metadata, scoped_ptrace, inherited_only);
+
+	child = fork();
+	ASSERT_LE(0, child);
+	if (child == 0) {
+		char buf_child;
+
+		EXPECT_EQ(0, close(pipe_parent[1]));
+		EXPECT_EQ(0, close(pipe_child[0]));
+		if (domain_child)
+			create_domain(_metadata, scoped_ptrace, inherited_only);
+
+		/* sync #1 */
+		ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)) {
+			TH_LOG("Failed to read() sync #1 from parent");
+		}
+		ASSERT_EQ('.', buf_child);
+
+		/* test the parent protection */
+		ASSERT_EQ((domain_child && scoped_ptrace) ? -1 : 0,
+				ptrace(PTRACE_ATTACH, parent, NULL, 0));
+		if (domain_child && scoped_ptrace) {
+			ASSERT_EQ(EPERM, errno);
+		} else {
+			ASSERT_EQ(parent, waitpid(parent, &status, 0));
+			ASSERT_EQ(1, WIFSTOPPED(status));
+			ASSERT_EQ(0, ptrace(PTRACE_DETACH, parent, NULL, 0));
+		}
+
+		/* sync #2 */
+		ASSERT_EQ(1, write(pipe_child[1], ".", 1)) {
+			TH_LOG("Failed to write() sync #2 to parent");
+		}
+
+		/* test traceme */
+		ASSERT_EQ((domain_parent && scoped_ptrace) ? -1 : 0,
+				ptrace(PTRACE_TRACEME));
+		if (domain_parent && scoped_ptrace) {
+			ASSERT_EQ(EPERM, errno);
+		} else {
+			ASSERT_EQ(0, raise(SIGSTOP));
+		}
+
+		/* sync #3 */
+		ASSERT_EQ(1, read(pipe_parent[0], &buf_child, 1)) {
+			TH_LOG("Failed to read() sync #3 from parent");
+		}
+		ASSERT_EQ('.', buf_child);
+		_exit(_metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+	}
+
+	EXPECT_EQ(0, close(pipe_child[1]));
+	EXPECT_EQ(0, close(pipe_parent[0]));
+	if (domain_parent)
+		create_domain(_metadata, scoped_ptrace, inherited_only);
+
+	/* sync #1 */
+	ASSERT_EQ(1, write(pipe_parent[1], ".", 1)) {
+		TH_LOG("Failed to write() sync #1 to child");
+	}
+
+	/* test the parent protection */
+	/* sync #2 */
+	ASSERT_EQ(1, read(pipe_child[0], &buf_parent, 1)) {
+		TH_LOG("Failed to read() sync #2 from child");
+	}
+	ASSERT_EQ('.', buf_parent);
+
+	/* test traceme */
+	if (!(domain_parent && scoped_ptrace)) {
+		ASSERT_EQ(child, waitpid(child, &status, 0));
+		ASSERT_EQ(1, WIFSTOPPED(status));
+		ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+	}
+	/* test attach */
+	ASSERT_EQ((domain_parent && scoped_ptrace) ? -1 : 0,
+			ptrace(PTRACE_ATTACH, child, NULL, 0));
+	if (domain_parent && scoped_ptrace) {
+		ASSERT_EQ(EPERM, errno);
+	} else {
+		ASSERT_EQ(child, waitpid(child, &status, 0));
+		ASSERT_EQ(1, WIFSTOPPED(status));
+		ASSERT_EQ(0, ptrace(PTRACE_DETACH, child, NULL, 0));
+	}
+
+	/* sync #3 */
+	ASSERT_EQ(1, write(pipe_parent[1], ".", 1)) {
+		TH_LOG("Failed to write() sync #3 to child");
+	}
+	ASSERT_EQ(child, waitpid(child, &status, 0));
+	if (WIFSIGNALED(status) || WEXITSTATUS(status))
+		_metadata->passed = 0;
+}
+
+/* keep the *_scoped order to check program inheritance */
+#define CHECK_PTRACE(name, domain_both, domain_parent, domain_child) \
+	TEST(name ## _unscoped) { \
+		_check_ptrace(_metadata, false, domain_both, domain_parent, \
+				domain_child); \
+	} \
+	TEST(name ## _scoped) { \
+		_check_ptrace(_metadata, false, domain_both, domain_parent, \
+				domain_child); \
+		_check_ptrace(_metadata, true, domain_both, domain_parent, \
+				domain_child); \
+	}
+
+/* no domain */
+CHECK_PTRACE(allow_without_domain, false, false, false);
+
+/* child domain */
+CHECK_PTRACE(allow_with_one_domain, false, false, true);
+
+/* parent domain */
+CHECK_PTRACE(deny_with_parent_domain, false, true, false);
+
+/* parent and child domain */
+CHECK_PTRACE(deny_with_sibling_domain, false, true, true);
+
+/* inherited domain */
+CHECK_PTRACE(allow_sibling_domain, true, false, false);
+
+/* inherited and child domain */
+CHECK_PTRACE(allow_with_nested_domain, true, false, true);
+
+/* inherited and parent domain */
+CHECK_PTRACE(deny_with_nested_and_parent_domain, true, true, false);
+
+/* inherited, parent and child domain */
+CHECK_PTRACE(deny_with_forked_domain, true, true, true);
+
+TEST_HARNESS_MAIN
-- 
2.23.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox