Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Ilya Dryomov @ 2019-10-30 16:19 UTC (permalink / raw)
  To: David Howells
  Cc: Linus Torvalds, Rasmus Villemoes, Greg Kroah-Hartman,
	Peter Zijlstra, nicolas.dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, linux-security-module,
	linux-fsdevel, linux-api, LKML
In-Reply-To: <157186186167.3995.7568100174393739543.stgit@warthog.procyon.org.uk>

On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>
> Convert pipes to use head and tail pointers for the buffer ring rather than
> pointer and length as the latter requires two atomic ops to update (or a
> combined op) whereas the former only requires one.
>
>  (1) The head pointer is the point at which production occurs and points to
>      the slot in which the next buffer will be placed.  This is equivalent
>      to pipe->curbuf + pipe->nrbufs.
>
>      The head pointer belongs to the write-side.
>
>  (2) The tail pointer is the point at which consumption occurs.  It points
>      to the next slot to be consumed.  This is equivalent to pipe->curbuf.
>
>      The tail pointer belongs to the read-side.
>
>  (3) head and tail are allowed to run to UINT_MAX and wrap naturally.  They
>      are only masked off when the array is being accessed, e.g.:
>
>         pipe->bufs[head & mask]
>
>      This means that it is not necessary to have a dead slot in the ring as
>      head == tail isn't ambiguous.
>
>  (4) The ring is empty if "head == tail".
>
>      A helper, pipe_empty(), is provided for this.
>
>  (5) The occupancy of the ring is "head - tail".
>
>      A helper, pipe_occupancy(), is provided for this.
>
>  (6) The number of free slots in the ring is "pipe->ring_size - occupancy".
>
>      A helper, pipe_space_for_user() is provided to indicate how many slots
>      userspace may use.
>
>  (7) The ring is full if "head - tail >= pipe->ring_size".
>
>      A helper, pipe_full(), is provided for this.
>
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  fs/fuse/dev.c             |   31 +++--
>  fs/pipe.c                 |  169 ++++++++++++++++-------------
>  fs/splice.c               |  188 ++++++++++++++++++++------------
>  include/linux/pipe_fs_i.h |   86 ++++++++++++++-
>  include/linux/uio.h       |    4 -
>  lib/iov_iter.c            |  266 +++++++++++++++++++++++++--------------------
>  6 files changed, 464 insertions(+), 280 deletions(-)
>
> diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
> index dadd617d826c..1e4bc27573cc 100644
> --- a/fs/fuse/dev.c
> +++ b/fs/fuse/dev.c
> @@ -703,7 +703,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
>                         cs->pipebufs++;
>                         cs->nr_segs--;
>                 } else {
> -                       if (cs->nr_segs == cs->pipe->buffers)
> +                       if (cs->nr_segs >= cs->pipe->ring_size)
>                                 return -EIO;
>
>                         page = alloc_page(GFP_HIGHUSER);
> @@ -879,7 +879,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
>         struct pipe_buffer *buf;
>         int err;
>
> -       if (cs->nr_segs == cs->pipe->buffers)
> +       if (cs->nr_segs >= cs->pipe->ring_size)
>                 return -EIO;
>
>         err = unlock_request(cs->req);
> @@ -1341,7 +1341,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (!fud)
>                 return -EPERM;
>
> -       bufs = kvmalloc_array(pipe->buffers, sizeof(struct pipe_buffer),
> +       bufs = kvmalloc_array(pipe->ring_size, sizeof(struct pipe_buffer),
>                               GFP_KERNEL);
>         if (!bufs)
>                 return -ENOMEM;
> @@ -1353,7 +1353,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
>         if (ret < 0)
>                 goto out;
>
> -       if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
> +       if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->ring_size) {
>                 ret = -EIO;
>                 goto out;
>         }
> @@ -1935,6 +1935,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                                      struct file *out, loff_t *ppos,
>                                      size_t len, unsigned int flags)
>  {
> +       unsigned int head, tail, mask, count;
>         unsigned nbuf;
>         unsigned idx;
>         struct pipe_buffer *bufs;
> @@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         pipe_lock(pipe);
>
> -       bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
> -                             GFP_KERNEL);
> +       head = pipe->head;
> +       tail = pipe->tail;
> +       mask = pipe->ring_size - 1;
> +       count = head - tail;
> +
> +       bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
>         if (!bufs) {
>                 pipe_unlock(pipe);
>                 return -ENOMEM;
> @@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>
>         nbuf = 0;
>         rem = 0;
> -       for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
> -               rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
> +       for (idx = tail; idx < head && rem < len; idx++)
> +               rem += pipe->bufs[idx & mask].len;
>
>         ret = -EINVAL;
>         if (rem < len)
> @@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>                 struct pipe_buffer *ibuf;
>                 struct pipe_buffer *obuf;
>
> -               BUG_ON(nbuf >= pipe->buffers);
> -               BUG_ON(!pipe->nrbufs);
> -               ibuf = &pipe->bufs[pipe->curbuf];
> +               BUG_ON(nbuf >= pipe->ring_size);
> +               BUG_ON(tail == head);
> +               ibuf = &pipe->bufs[tail & mask];
>                 obuf = &bufs[nbuf];
>
>                 if (rem >= ibuf->len) {
>                         *obuf = *ibuf;
>                         ibuf->ops = NULL;
> -                       pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
> -                       pipe->nrbufs--;
> +                       tail++;
> +                       pipe_commit_read(pipe, tail);
>                 } else {
>                         if (!pipe_buf_get(pipe, ibuf))
>                                 goto out_free;
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 8a2ab2f974bd..8a0806fe12d3 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -43,10 +43,11 @@ unsigned long pipe_user_pages_hard;
>  unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
>
>  /*
> - * We use a start+len construction, which provides full use of the
> - * allocated memory.
> - * -- Florian Coosmann (FGC)
> - *
> + * We use head and tail indices that aren't masked off, except at the point of
> + * dereference, but rather they're allowed to wrap naturally.  This means there
> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
> + * -- David Howells 2019-09-23.

Hi David,

Is "ring size < INT_MAX" constraint correct?

I've never had to implement this free running indices scheme, but
the way I've always visualized it is that the top bit of the index is
used as a lap (as in a race) indicator, leaving 31 bits to work with
(in case of unsigned ints).  Should that be

  ring size <= 2^31

or more precisely

  ring size is a power of two <= 2^31

or am I missing something?

Thanks,

                Ilya

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Edgecombe, Rick P @ 2019-10-30 17:48 UTC (permalink / raw)
  To: peterz@infradead.org
  Cc: adobriyan@gmail.com, linux-kernel@vger.kernel.org,
	rppt@kernel.org, rostedt@goodmis.org, jejb@linux.ibm.com,
	tglx@linutronix.de, linux-mm@kvack.org,
	dave.hansen@linux.intel.com, linux-api@vger.kernel.org,
	x86@kernel.org, akpm@linux-foundation.org, hpa@zytor.com,
	mingo@redhat.com, luto@kernel.org, kirill@shutemov.name,
	bp@alien8.de, rppt@linux.ibm.com
In-Reply-To: <20191030100418.GV4097@hirez.programming.kicks-ass.net>

On Wed, 2019-10-30 at 11:04 +0100, Peter Zijlstra wrote:
> > You mean shatter performance?
> 
> Shatter (all) large pages.

So it looks like this is already happening then to some degree. It's not just
BPF either, any module_alloc() user is going to do something similar with the
direct map alias of the page they got for the text.

So there must be at least some usages where breaking the direct map down, for
like a page to store a key or something, isn't totally horrible.


^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Dave Hansen @ 2019-10-30 17:58 UTC (permalink / raw)
  To: Edgecombe, Rick P, peterz@infradead.org
  Cc: adobriyan@gmail.com, linux-kernel@vger.kernel.org,
	rppt@kernel.org, rostedt@goodmis.org, jejb@linux.ibm.com,
	tglx@linutronix.de, linux-mm@kvack.org,
	dave.hansen@linux.intel.com, linux-api@vger.kernel.org,
	x86@kernel.org, akpm@linux-foundation.org, hpa@zytor.com,
	mingo@redhat.com, luto@kernel.org, kirill@shutemov.name,
	bp@alien8.de, rppt@linux.ibm.com
In-Reply-To: <b4d87f54b02cfccb58442f791485cad1ac080063.camel@intel.com>

On 10/30/19 10:48 AM, Edgecombe, Rick P wrote:
> On Wed, 2019-10-30 at 11:04 +0100, Peter Zijlstra wrote:
>>> You mean shatter performance?
>>
>> Shatter (all) large pages.
> 
> So it looks like this is already happening then to some degree. It's not just
> BPF either, any module_alloc() user is going to do something similar with the
> direct map alias of the page they got for the text.
> 
> So there must be at least some usages where breaking the direct map down, for
> like a page to store a key or something, isn't totally horrible.

The systems that really need large pages are the large ones.  They have
the same TLBs and data structures as really little systems, but orders
of magnitude more address space.  Modules and BPF are a (hopefully) drop
in the bucket on small systems and they're really inconsequential on
really big systems.

Modules also require privilege.

Allowing random user apps to fracture the direct map for every page of
their memory or *lots* of pages of their memory is an entirely different
kind of problem from modules.  It takes a "drop in the bucket"
fracturing and turns it into the common case.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Dave Hansen @ 2019-10-30 18:01 UTC (permalink / raw)
  To: Edgecombe, Rick P, peterz@infradead.org
  Cc: adobriyan@gmail.com, linux-kernel@vger.kernel.org,
	rppt@kernel.org, rostedt@goodmis.org, jejb@linux.ibm.com,
	tglx@linutronix.de, linux-mm@kvack.org,
	dave.hansen@linux.intel.com, linux-api@vger.kernel.org,
	x86@kernel.org, akpm@linux-foundation.org, hpa@zytor.com,
	mingo@redhat.com, luto@kernel.org, kirill@shutemov.name,
	bp@alien8.de, rppt@linux.ibm.com
In-Reply-To: <775eb0cf-7189-a314-5dde-f720b56ec3b2@intel.com>

On 10/30/19 10:58 AM, Dave Hansen wrote:
> Modules also require privilege.

IMNHO, if BPF is fracturing large swaths the direct map with no
privilege, it's only a matter of time until it starts to cause problems.
 The fact that we do it today is only evidence that we have a ticking
time bomb, not that it's OK.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Peter Zijlstra @ 2019-10-30 18:39 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Edgecombe, Rick P, adobriyan@gmail.com,
	linux-kernel@vger.kernel.org, rppt@kernel.org,
	rostedt@goodmis.org, jejb@linux.ibm.com, tglx@linutronix.de,
	linux-mm@kvack.org, dave.hansen@linux.intel.com,
	linux-api@vger.kernel.org, x86@kernel.org,
	akpm@linux-foundation.org, hpa@zytor.com, mingo@redhat.com,
	luto@kernel.org, kirill@shutemov.name, bp@alien8.de
In-Reply-To: <CAADnVQ+3LeLWv-rpATyfAbdS1w9L=sCQFuyy=paCZVBUr0rK6Q@mail.gmail.com>

On Wed, Oct 30, 2019 at 08:35:09AM -0700, Alexei Starovoitov wrote:
> On Wed, Oct 30, 2019 at 3:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Tue, Oct 29, 2019 at 05:27:43PM +0000, Edgecombe, Rick P wrote:
> > > On Mon, 2019-10-28 at 22:00 +0100, Peter Zijlstra wrote:
> >
> > > > That should be limited to the module range. Random data maps could
> > > > shatter the world.
> > >
> > > BPF has one vmalloc space allocation for the byte code and one for the module
> > > space allocation for the JIT. Both get RO also set on the direct map alias of
> > > the pages, and reset RW when freed.
> >
> > Argh, I didn't know they mapped the bytecode RO; why does it do that? It
> > can throw out the bytecode once it's JIT'ed.
> 
> because of endless security "concerns" that some folks had.
> Like what if something can exploit another bug in the kernel
> and modify bytecode that was already verified
> then interpreter will execute that modified bytecode.

But when it's JIT'ed the bytecode is no longer of relevance, right? So
any scenario with a JIT on can then toss the bytecode and certainly
doesn't need to map it RO.

> Sort of similar reasoning why .text is read-only.
> I think it's not a realistic attack, but I didn't bother to argue back then.
> The mere presence of interpreter itself is a real security concern.
> People that care about speculation attacks should
> have CONFIG_BPF_JIT_ALWAYS_ON=y,

This isn't about speculation attacks, it is about breaking buffer limits
and being able to write to memory. And in that respect being able to
change the current task state (write it's effective PID to 0) is much
simpler than writing to text or bytecode, but if you cannot reach/find
the task struct but can reach/find text..

> so modifying bytecode via another exploit will be pointless.
> Getting rid of RO for bytecode will save a ton of memory too,
> since we won't need to allocate full page for each small programs.

So I'm thinking we can get rid of that for any scenario that has the JIT
enabled -- not only JIT_ALWAYS_ON.

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Alexei Starovoitov @ 2019-10-30 18:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Edgecombe, Rick P, adobriyan@gmail.com,
	linux-kernel@vger.kernel.org, rppt@kernel.org,
	rostedt@goodmis.org, jejb@linux.ibm.com, tglx@linutronix.de,
	linux-mm@kvack.org, dave.hansen@linux.intel.com,
	linux-api@vger.kernel.org, x86@kernel.org,
	akpm@linux-foundation.org, hpa@zytor.com, mingo@redhat.com,
	luto@kernel.org, kirill@shutemov.name, bp@alien8.de
In-Reply-To: <20191030183944.GV4114@hirez.programming.kicks-ass.net>

On Wed, Oct 30, 2019 at 11:39 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Wed, Oct 30, 2019 at 08:35:09AM -0700, Alexei Starovoitov wrote:
> > On Wed, Oct 30, 2019 at 3:06 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Tue, Oct 29, 2019 at 05:27:43PM +0000, Edgecombe, Rick P wrote:
> > > > On Mon, 2019-10-28 at 22:00 +0100, Peter Zijlstra wrote:
> > >
> > > > > That should be limited to the module range. Random data maps could
> > > > > shatter the world.
> > > >
> > > > BPF has one vmalloc space allocation for the byte code and one for the module
> > > > space allocation for the JIT. Both get RO also set on the direct map alias of
> > > > the pages, and reset RW when freed.
> > >
> > > Argh, I didn't know they mapped the bytecode RO; why does it do that? It
> > > can throw out the bytecode once it's JIT'ed.
> >
> > because of endless security "concerns" that some folks had.
> > Like what if something can exploit another bug in the kernel
> > and modify bytecode that was already verified
> > then interpreter will execute that modified bytecode.
>
> But when it's JIT'ed the bytecode is no longer of relevance, right? So
> any scenario with a JIT on can then toss the bytecode and certainly
> doesn't need to map it RO.

We keep so called "xlated" bytecode around for debugging.
It's the one that is actually running. It was modified through
several stages of the verifier before being runnable by interpreter.
When folks debug stuff in production they want to see
the whole thing. Both x86 asm and xlated bytecode.
xlated bytecode also sanitized before it's returned
back to user space.

> > Sort of similar reasoning why .text is read-only.
> > I think it's not a realistic attack, but I didn't bother to argue back then.
> > The mere presence of interpreter itself is a real security concern.
> > People that care about speculation attacks should
> > have CONFIG_BPF_JIT_ALWAYS_ON=y,
>
> This isn't about speculation attacks, it is about breaking buffer limits
> and being able to write to memory. And in that respect being able to
> change the current task state (write it's effective PID to 0) is much
> simpler than writing to text or bytecode, but if you cannot reach/find
> the task struct but can reach/find text..

exactly. that's why RO bytecode was dubious to me from the beginning.
For an attacker to write meaningful bytecode they need to know
quite a few other kernel internal pointers.
If an exploit can write into memory there are plenty of easier targets.

> > so modifying bytecode via another exploit will be pointless.
> > Getting rid of RO for bytecode will save a ton of memory too,
> > since we won't need to allocate full page for each small programs.
>
> So I'm thinking we can get rid of that for any scenario that has the JIT
> enabled -- not only JIT_ALWAYS_ON.

Sounds good to me. Happy to do that. Will add it to our todo list.

^ permalink raw reply

* Re: [PATCH ghak90 V7 20/21] audit: add capcontid to set contid outside init_user_ns
From: Paul Moore @ 2019-10-30 20:27 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
	Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <20191024210010.owwgc3bqbvtdsqws@madcap2.tricolour.ca>

On Thu, Oct 24, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Here's the note I had from that meeting:
>
> - Eric raised the issue that using /proc is likely to get more and more
>   hoary due to mount namespaces and suggested that we use a netlink
> audit message (or a new syscall) to set the audit container identifier
> and since the loginuid is a similar type of operation, that it should be
> migrated over to a similar mechanism to get it away from /proc.  Get
> could be done with a netlink audit message that triggers an audit log
> message to deliver the information.  I'm reluctant to further pollute
> the syscall space if we can find another method.  The netlink audit
> message makes sense since any audit-enabled service is likely to already
> have an audit socket open.

Thanks for the background info on the off-list meeting.  I would
encourage you to have discussions like this on-list in the future; if
that isn't possible, hosting a public call would okay-ish, but a
distant second.

At this point in time I'm not overly concerned about /proc completely
going away in namespaces/containers that are full featured enough to
host a container orchestrator.  If/when reliance on procfs becomes an
issue, we can look at alternate APIs, but given the importance of
/proc to userspace (including to audit) I suspect we are going to see
it persist for some time.  I would prefer to see you to drop the audit
container ID netlink API portions of this patchset and focus on the
procfs API.

Also, for the record, removing the audit loginuid from procfs is not
something to take lightly, if at all; like it or not, it's part of the
kernel API.

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [PATCH ghak90 V7 14/21] audit: contid check descendancy and nesting
From: Paul Moore @ 2019-10-30 20:32 UTC (permalink / raw)
  To: Richard Guy Briggs
  Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
	Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <20191024220814.pid5ql6kvyr4ianb@madcap2.tricolour.ca>

On Thu, Oct 24, 2019 at 6:08 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-10-10 20:40, Paul Moore wrote:
> > On Wed, Sep 18, 2019 at 9:26 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > ?fixup! audit: convert to contid list to check for orch/engine ownership
> >
> > ?
> >
> > > Require the target task to be a descendant of the container
> > > orchestrator/engine.
> > >
> > > You would only change the audit container ID from one set or inherited
> > > value to another if you were nesting containers.
> > >
> > > If changing the contid, the container orchestrator/engine must be a
> > > descendant and not same orchestrator as the one that set it so it is not
> > > possible to change the contid of another orchestrator's container.
> >
> > Did you mean to say that the container orchestrator must be an
> > ancestor of the target, and the same orchestrator as the one that set
> > the target process' audit container ID?
>
> Not quite, the first half yes, but the second half: if it was already
> set by that orchestrator, it can't be set again.  If it is a different
> orchestrator that is a descendant of the orchestrator that set it, then
> allow the action.
>
> > Or maybe I'm missing something about what you are trying to do?
>
> Does that help clarify it?

I think so, it's pretty much as you stated originally: "Require the
target task to be a descendant of the container orchestrator/engine".
It's possible I misread something in the patch, or got lost in all the
?fixup! patching.  I'll take a closer look at the next revision of the
patchset to make sure the code makes sense to me, but the logic seems
reasonable.

-- 
paul moore
www.paul-moore.com

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Rasmus Villemoes @ 2019-10-30 20:35 UTC (permalink / raw)
  To: Ilya Dryomov, David Howells
  Cc: Linus Torvalds, Rasmus Villemoes, Greg Kroah-Hartman,
	Peter Zijlstra, nicolas.dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, linux-security-module,
	linux-fsdevel, linux-api, LKML
In-Reply-To: <CAOi1vP97DMX8zweOLfBDOFstrjC78=6RgxK3PPj_mehCOSeoaw@mail.gmail.com>

On 30/10/2019 17.19, Ilya Dryomov wrote:
> On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>>  /*
>> - * We use a start+len construction, which provides full use of the
>> - * allocated memory.
>> - * -- Florian Coosmann (FGC)
>> - *
>> + * We use head and tail indices that aren't masked off, except at the point of
>> + * dereference, but rather they're allowed to wrap naturally.  This means there
>> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
>> + * -- David Howells 2019-09-23.
> 
> Hi David,
> 
> Is "ring size < INT_MAX" constraint correct?

No. As long as one always uses a[idx % size] to access the array, the
only requirement is that size is representable in an unsigned int. Then
because one also wants to do the % using simple bitmasking, that further
restricts one to sizes that are a power of 2, so the end result is that
the max size is 2^31 (aka INT_MAX+1).

> I've never had to implement this free running indices scheme, but
> the way I've always visualized it is that the top bit of the index is
> used as a lap (as in a race) indicator, leaving 31 bits to work with
> (in case of unsigned ints).  Should that be
> 
>   ring size <= 2^31
> 
> or more precisely
> 
>   ring size is a power of two <= 2^31

Exactly. But it's kind of moot since the ring size would never be
allowed to grow anywhere near that.

Rasmus

^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Andy Lutomirski @ 2019-10-30 21:28 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: Andy Lutomirski, LKML, Alexey Dobriyan, Andrew Morton,
	Arnd Bergmann, Borislav Petkov, Dave Hansen, James Bottomley,
	Peter Zijlstra, Steven Rostedt, Thomas Gleixner, Ingo Molnar,
	H. Peter Anvin, Linux API, Linux-MM, X86 ML, Mike Rapoport
In-Reply-To: <20191030084005.GC20624@rapoport-lnx>

On Wed, Oct 30, 2019 at 1:40 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Tue, Oct 29, 2019 at 10:00:55AM -0700, Andy Lutomirski wrote:
> > On Tue, Oct 29, 2019 at 2:33 AM Mike Rapoport <rppt@kernel.org> wrote:
> > >
> > > On Mon, Oct 28, 2019 at 02:44:23PM -0600, Andy Lutomirski wrote:
> > > >
> > > > > On Oct 27, 2019, at 4:17 AM, Mike Rapoport <rppt@kernel.org> wrote:
> > > > >
> > > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > >
> > > > > Hi,
> > > > >
> > > > > The patch below aims to allow applications to create mappins that have
> > > > > pages visible only to the owning process. Such mappings could be used to
> > > > > store secrets so that these secrets are not visible neither to other
> > > > > processes nor to the kernel.
> > > > >
> > > > > I've only tested the basic functionality, the changes should be verified
> > > > > against THP/migration/compaction. Yet, I'd appreciate early feedback.
> > > >
> > > > I’ve contemplated the concept a fair amount, and I think you should
> > > > consider a change to the API. In particular, rather than having it be a
> > > > MAP_ flag, make it a chardev.  You can, at least at first, allow only
> > > > MAP_SHARED, and admins can decide who gets to use it.  It might also play
> > > > better with the VM overall, and you won’t need a VM_ flag for it — you
> > > > can just wire up .fault to do the right thing.
> > >
> > > I think mmap()/mprotect()/madvise() are the natural APIs for such
> > > interface.
> >
> > Then you have a whole bunch of questions to answer.  For example:
> >
> > What happens if you mprotect() or similar when the mapping is already
> > in use in a way that's incompatible with MAP_EXCLUSIVE?
>
> Then we refuse to mprotect()? Like in any other case when vm_flags are not
> compatible with required madvise()/mprotect() operation.
>

I'm not talking about flags.  I'm talking about the case where one
thread (or RDMA or whatever) has get_user_pages()'d a mapping and
another thread mprotect()s it MAP_EXCLUSIVE.

> > Is it actually reasonable to malloc() some memory and then make it exclusive?
> >
> > Are you permitted to map a file MAP_EXCLUSIVE?  What does it mean?
>
> I'd limit MAP_EXCLUSIVE only to anonymous memory.
>
> > What does MAP_PRIVATE | MAP_EXCLUSIVE do?
>
> My preference is to have only mmap() and then the semantics is more clear:
>
> MAP_PRIVATE | MAP_EXCLUSIVE creates a pre-populated region, marks it locked
> and drops the pages in this region from the direct map.
> The pages are returned back on munmap().
> Then there is no way to change an existing area to be exclusive or vice
> versa.

And what happens if you fork()?  Limiting it to MAP_SHARED |
MAP_EXCLUSIVE would about this particular nasty question.

>
> > How does one pass exclusive memory via SCM_RIGHTS?  (If it's a
> > memfd-like or chardev interface, it's trivial.  mmap(), not so much.)
>
> Why passing such memory via SCM_RIGHTS would be useful?

Suppose I want to put a secret into exclusive memory and then send
that secret to some other process.  The obvious approach would be to
SCM_RIGHTS an fd over, but you can't do that with MAP_EXCLUSIVE as
you've defined it.  In general, there are lots of use cases for memfd
and other fd-backed memory.

>
> > And finally, there's my personal giant pet peeve: a major use of this
> > will be for virtualization.  I suspect that a lot of people would like
> > the majority of KVM guest memory to be unmapped from the host
> > pagetables.  But people might also like for guest memory to be
> > unmapped in *QEMU's* pagetables, and mmap() is a basically worthless
> > interface for this.  Getting fd-backed memory into a guest will take
> > some possibly major work in the kernel, but getting vma-backed memory
> > into a guest without mapping it in the host user address space seems
> > much, much worse.
>
> Well, in my view, the MAP_EXCLUSIVE is intended to keep small secrets
> rather than use it for the entire guest memory. I even considered adding a
> limit for the mapping size, but then I decided that since RLIMIT_MEMLOCK is
> anyway enforced there is no need for a new one.
>
> I agree that getting fd-backed memory into a guest would be less pain that
> VMA, but KVM can already use memory outside the control of the kernel via
> /dev/map [1].

That series doesn't address the problem I'm talking about at all.  I'm
saying that there is a legitimate use case where QEMU should *not*
have a mapping of the memory.  So QEMU would create some exclusive
memory using /dev/exclusive_memory and would tell KVM to map it into
the guest without mapping it into QEMU's address space at all.

(In fact, the way that SEV currently works is *functionally* like
this, except that there's a bogus incoherent mapping in the QEMU
process that is a giant can of worms.


IMO a major benefit of a chardev approach is that you don't need a new
VM_ flag and you don't need to worry about wiring it up everywhere in
the core mm code.

^ permalink raw reply

* Re: [PATCH ghak90 V7 20/21] audit: add capcontid to set contid outside init_user_ns
From: Richard Guy Briggs @ 2019-10-30 22:03 UTC (permalink / raw)
  To: Paul Moore
  Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
	LKML, netdev, netfilter-devel, sgrubb, omosnace, dhowells, simo,
	Eric Paris, Serge Hallyn, ebiederm, nhorman, Dan Walsh, mpatel
In-Reply-To: <CAHC9VhRDoX9du4XbCnBtBzsNPMGOsb-TKM1CC+sCL7HP=FuTRQ@mail.gmail.com>

On 2019-10-30 16:27, Paul Moore wrote:
> On Thu, Oct 24, 2019 at 5:00 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Here's the note I had from that meeting:
> >
> > - Eric raised the issue that using /proc is likely to get more and more
> >   hoary due to mount namespaces and suggested that we use a netlink
> > audit message (or a new syscall) to set the audit container identifier
> > and since the loginuid is a similar type of operation, that it should be
> > migrated over to a similar mechanism to get it away from /proc.  Get
> > could be done with a netlink audit message that triggers an audit log
> > message to deliver the information.  I'm reluctant to further pollute
> > the syscall space if we can find another method.  The netlink audit
> > message makes sense since any audit-enabled service is likely to already
> > have an audit socket open.
> 
> Thanks for the background info on the off-list meeting.  I would
> encourage you to have discussions like this on-list in the future; if
> that isn't possible, hosting a public call would okay-ish, but a
> distant second.

I'm still trying to get Eric's attention to get him to weigh in here and
provide a more eloquent representation of his ideas and concerns.  Some
of it was related to CRIU(sp?) issues which we've already of which we've
already seen similar concerns in namespace identifiers including the
device identity to qualify it.

> At this point in time I'm not overly concerned about /proc completely
> going away in namespaces/containers that are full featured enough to
> host a container orchestrator.  If/when reliance on procfs becomes an
> issue, we can look at alternate APIs, but given the importance of
> /proc to userspace (including to audit) I suspect we are going to see
> it persist for some time.  I would prefer to see you to drop the audit
> container ID netlink API portions of this patchset and focus on the
> procfs API.

I've already refactored the code to put the netlink bits at the end as
completely optional pieces for completeness so they won't get in the way
of the real substance of this patchset.  The nesting depth and total
number of containers checks have also been punted to the end of the
patchset to get them out of the way of discussion.

> Also, for the record, removing the audit loginuid from procfs is not
> something to take lightly, if at all; like it or not, it's part of the
> kernel API.

Oh, I'm quite aware of how important this change is and it was discussed
with Steve Grubb who saw the concern and value of considering such a
disruptive change.  Removing proc support for auid/ses would be a
long-term deprecation if accepted.

Really, I should have labelled the v7 patchset as RFC since there were
so many new and disruptive ideas presented in it.

> paul moore
> www.paul-moore.com

- RGB

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Ilya Dryomov @ 2019-10-30 22:16 UTC (permalink / raw)
  To: Rasmus Villemoes
  Cc: David Howells, Linus Torvalds, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	LKML
In-Reply-To: <4892d186-8eb0-a282-e7e6-e79958431a54@rasmusvillemoes.dk>

On Wed, Oct 30, 2019 at 9:35 PM Rasmus Villemoes
<linux@rasmusvillemoes.dk> wrote:
>
> On 30/10/2019 17.19, Ilya Dryomov wrote:
> > On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
> >>  /*
> >> - * We use a start+len construction, which provides full use of the
> >> - * allocated memory.
> >> - * -- Florian Coosmann (FGC)
> >> - *
> >> + * We use head and tail indices that aren't masked off, except at the point of
> >> + * dereference, but rather they're allowed to wrap naturally.  This means there
> >> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
> >> + * -- David Howells 2019-09-23.
> >
> > Hi David,
> >
> > Is "ring size < INT_MAX" constraint correct?
>
> No. As long as one always uses a[idx % size] to access the array, the
> only requirement is that size is representable in an unsigned int. Then
> because one also wants to do the % using simple bitmasking, that further
> restricts one to sizes that are a power of 2, so the end result is that
> the max size is 2^31 (aka INT_MAX+1).

I think the fact that indices are free running and wrap at a power of
two already restricts you to sizes the are a power of two, independent
of how you do masking.  If you switch to a[idx % size], size still has
to be a power of two for things to work when idx wraps.  Consider:

  size = 6
  head = tail = 4294967292, empty buffer

  push  4294967292 % 6 = 0
  push  4294967293 % 6 = 1
  push  4294967294 % 6 = 2
  push  4294967295 % 6 = 3
  push           0 % 6 = 0  <-- expected 4, overwrote a[0]

>
> > I've never had to implement this free running indices scheme, but
> > the way I've always visualized it is that the top bit of the index is
> > used as a lap (as in a race) indicator, leaving 31 bits to work with
> > (in case of unsigned ints).  Should that be
> >
> >   ring size <= 2^31
> >
> > or more precisely
> >
> >   ring size is a power of two <= 2^31
>
> Exactly. But it's kind of moot since the ring size would never be
> allowed to grow anywhere near that.

Thanks for confirming.  Even if it's kind of moot, I think it should be
corrected to avoid confusion.

                Ilya

^ permalink raw reply

* Re: [RFC PATCH v2 2/5] fs: add RWF_ENCODED for reading/writing compressed data
From: Omar Sandoval @ 2019-10-30 22:21 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Darrick J. Wong, linux-fsdevel, linux-btrfs, Dave Chinner,
	Jann Horn, linux-api, kernel-team
In-Reply-To: <20191022013717.enwdmox4b7la4i74@yavin.dot.cyphar.com>

On Tue, Oct 22, 2019 at 12:37:17PM +1100, Aleksa Sarai wrote:
> On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > 
> > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > compressed when written to disk and decompressed when read back.
> > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > directly to the filesystem, and the matching interface to read
> > > > > compressed data without decompressing it. This adds support for
> > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > 
> > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > the encoded extent.
> > > > > 
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > 
> > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > ---
> > > > >  include/linux/fs.h      | 14 +++++++
> > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index e0d909d35763..54681f21e05e 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > >  /* File does not contribute to nr_files count */
> > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > >  
> > > > > +/* File supports encoded IO */
> > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > +
> > > > >  /*
> > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > >   * that indicates that they should check the contents of the iovec are
> > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > >  #define IOCB_SYNC		(1 << 5)
> > > > >  #define IOCB_WRITE		(1 << 6)
> > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > >  
> > > > >  struct kiocb {
> > > > >  	struct file		*ki_filp;
> > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > +struct encoded_iov;
> > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > +				struct iov_iter *);
> > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > >  				struct file *file_out, loff_t pos_out,
> > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > >  			return -EOPNOTSUPP;
> > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > >  	}
> > > > > +	if (flags & RWF_ENCODED) {
> > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > +			return -EOPNOTSUPP;
> > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > +	}
> > > > >  	if (flags & RWF_HIPRI)
> > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > >  	if (flags & RWF_DSYNC)
> > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > --- a/include/uapi/linux/fs.h
> > > > > +++ b/include/uapi/linux/fs.h
> > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > >  
> > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > >  
> > > > > +enum {
> > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +};
> > > > > +
> > > > > +enum {
> > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +};
> > > > > +
> > > > > +struct encoded_iov {
> > > > > +	__u64 len;
> > > > > +	__u64 unencoded_len;
> > > > > +	__u64 unencoded_offset;
> > > > > +	__u32 compression;
> > > > > +	__u32 encryption;
> > > > 
> > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > comes along next wanting to add more encoding info?
> > > 
> > > I would suggest to copy the extension design of copy_struct_from_user().
> > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > problem than length-based extension.
> > 
> > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > that instead; we can always extend the structure later.
> > 
> > > Also (I might be wrong) but shouldn't the __u64s be __aligned_u64 (as
> > > with syscall structure arguments)?
> > 
> > <shrug> No idea, that's the first I've heard of that type and it doesn't
> > seem to be used by the fs code.  Why would we care about alignment for
> > an incore structure?
> 
> When passing u64s from userspace, it's generally considered a good idea
> to use __aligned_u64 -- the main reason is that 32-bit userspace on a
> 64-bit kernel will use different structure alignment for 64-bit fields.
> 
> This means you'd need to implement a bunch of COMPAT_SYSCALL-like
> handling for that case. It's much simpler to use __aligned_u64 (and on
> the plus side I don't think you need to add any fields to ensure the
> padding is zero).

I'll used __aligned_u64 for the next submission.

^ permalink raw reply

* Re: [RFC PATCH v2 2/5] fs: add RWF_ENCODED for reading/writing compressed data
From: Omar Sandoval @ 2019-10-30 22:26 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Darrick J. Wong, linux-fsdevel, linux-btrfs, Dave Chinner,
	Jann Horn, linux-api, kernel-team
In-Reply-To: <20191022020215.csdwgi3ky27rfidf@yavin.dot.cyphar.com>

On Tue, Oct 22, 2019 at 01:02:15PM +1100, Aleksa Sarai wrote:
> On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > 
> > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > compressed when written to disk and decompressed when read back.
> > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > directly to the filesystem, and the matching interface to read
> > > > > compressed data without decompressing it. This adds support for
> > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > 
> > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > the encoded extent.
> > > > > 
> > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > 
> > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > ---
> > > > >  include/linux/fs.h      | 14 +++++++
> > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index e0d909d35763..54681f21e05e 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > >  /* File does not contribute to nr_files count */
> > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > >  
> > > > > +/* File supports encoded IO */
> > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > +
> > > > >  /*
> > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > >   * that indicates that they should check the contents of the iovec are
> > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > >  #define IOCB_SYNC		(1 << 5)
> > > > >  #define IOCB_WRITE		(1 << 6)
> > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > >  
> > > > >  struct kiocb {
> > > > >  	struct file		*ki_filp;
> > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > +struct encoded_iov;
> > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > +				struct iov_iter *);
> > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > >  				struct file *file_out, loff_t pos_out,
> > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > >  			return -EOPNOTSUPP;
> > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > >  	}
> > > > > +	if (flags & RWF_ENCODED) {
> > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > +			return -EOPNOTSUPP;
> > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > +	}
> > > > >  	if (flags & RWF_HIPRI)
> > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > >  	if (flags & RWF_DSYNC)
> > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > --- a/include/uapi/linux/fs.h
> > > > > +++ b/include/uapi/linux/fs.h
> > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > >  
> > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > >  
> > > > > +enum {
> > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > +};
> > > > > +
> > > > > +enum {
> > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > +};
> > > > > +
> > > > > +struct encoded_iov {
> > > > > +	__u64 len;
> > > > > +	__u64 unencoded_len;
> > > > > +	__u64 unencoded_offset;
> > > > > +	__u32 compression;
> > > > > +	__u32 encryption;
> > > > 
> > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > comes along next wanting to add more encoding info?
> > > 
> > > I would suggest to copy the extension design of copy_struct_from_user().
> > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > problem than length-based extension.
> > 
> > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > that instead; we can always extend the structure later.
> 
> Just to clarify -- if we want to make the interface forward-compatible
> from the outset (programs built 4 years from now running on 5.5), we
> will need to implement this in the original merge. Otherwise userspace
> will need to handle backwards-compatibility themselves once new features
> are added.
> 
> @Omar: If it'd make your life easier, I can send some draft patches
> 	   which port copy_struct_from_user() to iovec-land.

You're right, I didn't think about the case of newer programs on older
kernels. I can do that for the next submission. RWF_ENCODED should
probably translate the E2BIG from copy_struct_from_user() to EINVAL,
though, to avoid ambiguity with the case that the buffer wasn't big
enough to return the encoded data.

^ permalink raw reply

* Re: [RFC PATCH 04/10] pipe: Use head and tail pointers for the ring, not cursor and length [ver #2]
From: Rasmus Villemoes @ 2019-10-30 22:38 UTC (permalink / raw)
  To: Ilya Dryomov
  Cc: David Howells, Linus Torvalds, Greg Kroah-Hartman, Peter Zijlstra,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	LKML
In-Reply-To: <CAOi1vP9paV2-2_S0NgfbZDE6+5kqHXVc9xabHVC-2Ss1MmXkCg@mail.gmail.com>

On 30/10/2019 23.16, Ilya Dryomov wrote:
> On Wed, Oct 30, 2019 at 9:35 PM Rasmus Villemoes
> <linux@rasmusvillemoes.dk> wrote:
>>
>> On 30/10/2019 17.19, Ilya Dryomov wrote:
>>> On Thu, Oct 24, 2019 at 11:49 AM David Howells <dhowells@redhat.com> wrote:
>>>>  /*
>>>> - * We use a start+len construction, which provides full use of the
>>>> - * allocated memory.
>>>> - * -- Florian Coosmann (FGC)
>>>> - *
>>>> + * We use head and tail indices that aren't masked off, except at the point of
>>>> + * dereference, but rather they're allowed to wrap naturally.  This means there
>>>> + * isn't a dead spot in the buffer, provided the ring size < INT_MAX.
>>>> + * -- David Howells 2019-09-23.
>>>
>>> Hi David,
>>>
>>> Is "ring size < INT_MAX" constraint correct?
>>
>> No. As long as one always uses a[idx % size] to access the array, the
>> only requirement is that size is representable in an unsigned int. Then
>> because one also wants to do the % using simple bitmasking, that further
>> restricts one to sizes that are a power of 2, so the end result is that
>> the max size is 2^31 (aka INT_MAX+1).
> 
> I think the fact that indices are free running and wrap at a power of
> two already restricts you to sizes the are a power of two,

Ah, yes, of course. When reducing indices mod n that may already have
been implicitly reduced mod N, N must be a multiple of n for the result
to be well-defined.

Rasmus

^ permalink raw reply

* Re: [PATCH man-pages] Document encoded I/O
From: Omar Sandoval @ 2019-10-30 22:46 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Amir Goldstein, linux-fsdevel, Linux Btrfs, Dave Chinner,
	Jann Horn, Linux API, kernel-team, Theodore Tso
In-Reply-To: <20191023121203.pozm2xzrbxmcqpbr@yavin.dot.cyphar.com>

On Wed, Oct 23, 2019 at 11:12:03PM +1100, Aleksa Sarai wrote:
> On 2019-10-23, Amir Goldstein <amir73il@gmail.com> wrote:
> > > >
> > > > No, I see why you choose to add the flag to open(2).
> > > > I have no objection.
> > > >
> > > > I once had a crazy thought how to add new open flags
> > > > in a non racy manner without adding a new syscall,
> > > > but as you wrote, this is not relevant for O_ALLOW_ENCODED.
> > > >
> > > > Something like:
> > > >
> > > > /*
> > > >  * Old kernels silently ignore unsupported open flags.
> > > >  * New kernels that gets __O_CHECK_NEWFLAGS do
> > > >  * the proper checking for unsupported flags AND set the
> > > >  * flag __O_HAVE_NEWFLAGS.
> > > >  */
> > > > #define O_FLAG1 __O_CHECK_NEWFLAGS|__O_FLAG1
> > > > #define O_HAVE_FLAG1 __O_HAVE_NEWFLAGS|__O_FLAG1
> > > >
> > > > fd = open(path, O_FLAG1);
> > > > if (fd < 0)
> > > >     return -errno;
> > > > flags = fcntl(fd, F_GETFL, 0);
> > > > if (flags < 0)
> > > >     return flags;
> > > > if ((flags & O_HAVE_FLAG1) != O_HAVE_FLAG1) {
> > > >     close(fd);
> > > >     return -EINVAL;
> > > > }
> > >
> > > You don't need to add __O_HAVE_NEWFLAGS to do this -- this already works
> > > today for userspace to check whether a flag works properly
> > > (specifically, __O_FLAG1 will only be set if __O_FLAG1 is supported --
> > > otherwise it gets cleared during build_open_flags).
> > 
> > That's a behavior of quite recent kernels since
> > 629e014bb834 fs: completely ignore unknown open flags
> > and maybe some stable kernels. Real old kernels don't have that luxury.
> 
> Ah okay -- so the key feature is that __O_CHECK_NEWFLAGS gets
> transformed into __O_HAVE_NEWFLAGS (making it so that both the older and
> current behaviours are detected). Apologies, I missed that on my first
> read-through.
> 
> While it is a little bit ugly, it probably wouldn't be a bad idea to
> have something like that.
> 
> > > The problem with adding new flags is that an *old* program running on a
> > > *new* kernel could pass a garbage flag (__O_CHECK_NEWFLAGS for instance)
> > > that causes an error only on the new kernel.
> > 
> > That's a theoretic problem. Same as O_PATH|O_TMPFILE.
> > Show me a real life program that passes garbage files to open.
> 
> Has "that's a theoretical problem" helped when we faced this issue in
> the past? I don't disagree that this is mostly theoretical, but I have a
> feeling that this is an argument that won't hold water.
> 
> As for an example of semi-garbage flag passing -- systemd passes
> O_PATH|O_NOCTTY in several places. Yes, they're known flags (so not
> entirely applicable to this discussion) but it's also not a meaningful
> combination of flags and yet is permitted.
> 
> > > The only real solution to this (and several other problems) is
> > > openat2().
> > 
> > No argue about that. Come on, let's get it merged ;-)
> 
> Believe me, I'm trying. ;)
> 
> > > As for O_ALLOW_ENCODED -- the current semantics (-EPERM if it
> > > is set without CAP_SYS_ADMIN) *will* cause backwards compatibility
> > > issues for programs that have garbage flags set...
> > >
> > 
> > Again, that's theoretical. In practice, O_ALLOW_ENCODED can work with
> > open()/openat(). In fact, even if O_ALLOW_ENCODED gets merged after
> > openat2(), I don't think it should be forbidden by open()/openat(),
> > right? Do in that sense, O_ALLOW_ENCODED does not depend on openat2().
> 
> If it's a valid open() flag it'll also be a valid openat2(2) flag. The
> only question is whether the garbage-flag problem justifies making it a
> no-op for open(2).

Consider O_NOATIME: a (non-root) program passing this flag for files it
didn't own would have been broken by kernel v2.6.8. Or, more recently, a
program accidentally setting O_TMPFILE would suddenly get drastically
different behavior on v3.11. These two flags technically broke backwards
compatibility. I don't think it's worth the trouble to treat
O_ALLOW_ENCODED any differently for open().

^ permalink raw reply

* Re: [RFC PATCH v2 1/5] fs: add O_ENCODED open flag
From: Omar Sandoval @ 2019-10-30 22:55 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: linux-fsdevel, linux-btrfs, Dave Chinner, Jann Horn, linux-api,
	kernel-team
In-Reply-To: <20191019045057.2fcrzuwc27eg5naf@yavin.dot.cyphar.com>

On Sat, Oct 19, 2019 at 03:50:57PM +1100, Aleksa Sarai wrote:
> On 2019-10-15, Omar Sandoval <osandov@osandov.com> wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > The upcoming RWF_ENCODED operation introduces some security concerns:
> > 
> > 1. Compressed writes will pass arbitrary data to decompression
> >    algorithms in the kernel.
> > 2. Compressed reads can leak truncated/hole punched data.
> > 
> > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > possible to do the permissions checks at the time of the read or write
> > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > fcntl(). The flag is not cleared in any way on fork or exec; it should
> > probably be used with O_CLOEXEC in most cases.
> > 
> > Note that the usual issue that unknown open flags are ignored doesn't
> > really matter for O_ENCODED; if the kernel doesn't support O_ENCODED,
> > then it doesn't support RWF_ENCODED, either.
> > 
> > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > ---
> >  fs/fcntl.c                       | 10 ++++++++--
> >  fs/namei.c                       |  4 ++++
> >  include/linux/fcntl.h            |  2 +-
> >  include/uapi/asm-generic/fcntl.h |  4 ++++
> >  4 files changed, 17 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > index 3d40771e8e7c..45ebc6df078e 100644
> > --- a/fs/fcntl.c
> > +++ b/fs/fcntl.c
> > @@ -30,7 +30,8 @@
> >  #include <asm/siginfo.h>
> >  #include <linux/uaccess.h>
> >  
> > -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
> > +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
> > +		    O_ENCODED)
> >  
> >  static int setfl(int fd, struct file * filp, unsigned long arg)
> >  {
> > @@ -49,6 +50,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
> >  		if (!inode_owner_or_capable(inode))
> >  			return -EPERM;
> >  
> > +	/* O_ENCODED can only be set by superuser */
> > +	if ((arg & O_ENCODED) && !(filp->f_flags & O_ENCODED) &&
> > +	    !capable(CAP_SYS_ADMIN))
> > +		return -EPERM;
> 
> I have a feeling the error should probably be an EACCES and not EPERM.

Shrug, I wanted to make this consistent with O_NOATIME, which uses
EPERM. EACCES seems more appropriate for lacking permissions for a
particular path rather than for an operation, but the lines are blurry.

> > +
> >  	/* required for strict SunOS emulation */
> >  	if (O_NONBLOCK != O_NDELAY)
> >  	       if (arg & O_NDELAY)
> > @@ -1031,7 +1037,7 @@ static int __init fcntl_init(void)
> >  	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> >  	 * is defined as O_NONBLOCK on some platforms and not on others.
> >  	 */
> > -	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > +	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
> >  		HWEIGHT32(
> >  			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> >  			__FMODE_EXEC | __FMODE_NONOTIFY));
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 671c3c1a3425..ae86b125888a 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -2978,6 +2978,10 @@ static int may_open(const struct path *path, int acc_mode, int flag)
> >  	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
> >  		return -EPERM;
> >  
> > +	/* O_ENCODED can only be set by superuser */
> > +	if ((flag & O_ENCODED) && !capable(CAP_SYS_ADMIN))
> > +		return -EPERM;
> 
> I would suggest that this check be put into build_open_flags() rather
> than putting it this late in open(). Also, same nit about the error
> return as above.

This is where we check permissions for O_NOATIME, shouldn't we keep all
of those permission checks in the same place? build_open_flags() only
checks for flag validity.

> > +
> >  	return 0;
> >  }
> >  
> > diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> > index d019df946cb2..5fac02479639 100644
> > --- a/include/linux/fcntl.h
> > +++ b/include/linux/fcntl.h
> > @@ -9,7 +9,7 @@
> >  	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
> >  	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
> >  	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> > -	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
> > +	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_ENCODED)
> >  
> >  #ifndef force_o_largefile
> >  #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
> > diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> > index 9dc0bf0c5a6e..8c5cbd5942e3 100644
> > --- a/include/uapi/asm-generic/fcntl.h
> > +++ b/include/uapi/asm-generic/fcntl.h
> > @@ -97,6 +97,10 @@
> >  #define O_NDELAY	O_NONBLOCK
> >  #endif
> >  
> > +#ifndef O_ENCODED
> > +#define O_ENCODED	040000000
> > +#endif
> 
> You should also define this for all of the architectures which don't use
> the generic O_* flag values. On alpha, O_PATH is equal to the value you
> picked (just be careful on sparc -- 0x4000000 is the next free bit, but
> it's used by FMODE_NONOTIFY.)

Good catch, I'll fix that. Thanks!

^ permalink raw reply

* Re: [PATCH man-pages] Document encoded I/O
From: Omar Sandoval @ 2019-10-30 22:57 UTC (permalink / raw)
  To: Aleksa Sarai
  Cc: Amir Goldstein, linux-fsdevel, Linux Btrfs, Dave Chinner,
	Jann Horn, Linux API, kernel-team, Theodore Tso
In-Reply-To: <20191030224606.GF326591@vader>

On Wed, Oct 30, 2019 at 03:46:06PM -0700, Omar Sandoval wrote:
> On Wed, Oct 23, 2019 at 11:12:03PM +1100, Aleksa Sarai wrote:
> > On 2019-10-23, Amir Goldstein <amir73il@gmail.com> wrote:
> > > > >
> > > > > No, I see why you choose to add the flag to open(2).
> > > > > I have no objection.
> > > > >
> > > > > I once had a crazy thought how to add new open flags
> > > > > in a non racy manner without adding a new syscall,
> > > > > but as you wrote, this is not relevant for O_ALLOW_ENCODED.
> > > > >
> > > > > Something like:
> > > > >
> > > > > /*
> > > > >  * Old kernels silently ignore unsupported open flags.
> > > > >  * New kernels that gets __O_CHECK_NEWFLAGS do
> > > > >  * the proper checking for unsupported flags AND set the
> > > > >  * flag __O_HAVE_NEWFLAGS.
> > > > >  */
> > > > > #define O_FLAG1 __O_CHECK_NEWFLAGS|__O_FLAG1
> > > > > #define O_HAVE_FLAG1 __O_HAVE_NEWFLAGS|__O_FLAG1
> > > > >
> > > > > fd = open(path, O_FLAG1);
> > > > > if (fd < 0)
> > > > >     return -errno;
> > > > > flags = fcntl(fd, F_GETFL, 0);
> > > > > if (flags < 0)
> > > > >     return flags;
> > > > > if ((flags & O_HAVE_FLAG1) != O_HAVE_FLAG1) {
> > > > >     close(fd);
> > > > >     return -EINVAL;
> > > > > }
> > > >
> > > > You don't need to add __O_HAVE_NEWFLAGS to do this -- this already works
> > > > today for userspace to check whether a flag works properly
> > > > (specifically, __O_FLAG1 will only be set if __O_FLAG1 is supported --
> > > > otherwise it gets cleared during build_open_flags).
> > > 
> > > That's a behavior of quite recent kernels since
> > > 629e014bb834 fs: completely ignore unknown open flags
> > > and maybe some stable kernels. Real old kernels don't have that luxury.
> > 
> > Ah okay -- so the key feature is that __O_CHECK_NEWFLAGS gets
> > transformed into __O_HAVE_NEWFLAGS (making it so that both the older and
> > current behaviours are detected). Apologies, I missed that on my first
> > read-through.
> > 
> > While it is a little bit ugly, it probably wouldn't be a bad idea to
> > have something like that.
> > 
> > > > The problem with adding new flags is that an *old* program running on a
> > > > *new* kernel could pass a garbage flag (__O_CHECK_NEWFLAGS for instance)
> > > > that causes an error only on the new kernel.
> > > 
> > > That's a theoretic problem. Same as O_PATH|O_TMPFILE.
> > > Show me a real life program that passes garbage files to open.
> > 
> > Has "that's a theoretical problem" helped when we faced this issue in
> > the past? I don't disagree that this is mostly theoretical, but I have a
> > feeling that this is an argument that won't hold water.
> > 
> > As for an example of semi-garbage flag passing -- systemd passes
> > O_PATH|O_NOCTTY in several places. Yes, they're known flags (so not
> > entirely applicable to this discussion) but it's also not a meaningful
> > combination of flags and yet is permitted.
> > 
> > > > The only real solution to this (and several other problems) is
> > > > openat2().
> > > 
> > > No argue about that. Come on, let's get it merged ;-)
> > 
> > Believe me, I'm trying. ;)
> > 
> > > > As for O_ALLOW_ENCODED -- the current semantics (-EPERM if it
> > > > is set without CAP_SYS_ADMIN) *will* cause backwards compatibility
> > > > issues for programs that have garbage flags set...
> > > >
> > > 
> > > Again, that's theoretical. In practice, O_ALLOW_ENCODED can work with
> > > open()/openat(). In fact, even if O_ALLOW_ENCODED gets merged after
> > > openat2(), I don't think it should be forbidden by open()/openat(),
> > > right? Do in that sense, O_ALLOW_ENCODED does not depend on openat2().
> > 
> > If it's a valid open() flag it'll also be a valid openat2(2) flag. The
> > only question is whether the garbage-flag problem justifies making it a
> > no-op for open(2).
> 
> Consider O_NOATIME: a (non-root) program passing this flag for files it
> didn't own would have been broken by kernel v2.6.8. Or, more recently, a
> program accidentally setting O_TMPFILE would suddenly get drastically
> different behavior on v3.11. These two flags technically broke backwards
> compatibility. I don't think it's worth the trouble to treat
> O_ALLOW_ENCODED any differently for open().

Ah, I missed that O_TMPFILE is careful to fail on old kernels. My point
still stands about O_NOATIME, though :)

^ permalink raw reply

* Re: [RFC PATCH v2 2/5] fs: add RWF_ENCODED for reading/writing compressed data
From: Aleksa Sarai @ 2019-10-30 23:11 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: Darrick J. Wong, linux-fsdevel, linux-btrfs, Dave Chinner,
	Jann Horn, linux-api, kernel-team
In-Reply-To: <20191030222601.GE326591@vader>

[-- Attachment #1: Type: text/plain, Size: 7658 bytes --]

On 2019-10-30, Omar Sandoval <osandov@osandov.com> wrote:
> On Tue, Oct 22, 2019 at 01:02:15PM +1100, Aleksa Sarai wrote:
> > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > On Tue, Oct 22, 2019 at 05:38:31AM +1100, Aleksa Sarai wrote:
> > > > On 2019-10-21, Darrick J. Wong <darrick.wong@oracle.com> wrote:
> > > > > On Tue, Oct 15, 2019 at 11:42:40AM -0700, Omar Sandoval wrote:
> > > > > > From: Omar Sandoval <osandov@fb.com>
> > > > > > 
> > > > > > Btrfs supports transparent compression: data written by the user can be
> > > > > > compressed when written to disk and decompressed when read back.
> > > > > > However, we'd like to add an interface to write pre-compressed data
> > > > > > directly to the filesystem, and the matching interface to read
> > > > > > compressed data without decompressing it. This adds support for
> > > > > > so-called "encoded I/O" via preadv2() and pwritev2().
> > > > > > 
> > > > > > A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> > > > > > this flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > > is used for metadata: namely, the compression algorithm, unencoded
> > > > > > (i.e., decompressed) length, and what subrange of the unencoded data
> > > > > > should be used (needed for truncated or hole-punched extents and when
> > > > > > reading in the middle of an extent). For reads, the filesystem returns
> > > > > > this information; for writes, the caller provides it to the filesystem.
> > > > > > iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> > > > > > used to extend the interface in the future. The remaining iovecs contain
> > > > > > the encoded extent.
> > > > > > 
> > > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > > 
> > > > > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > > > > ---
> > > > > >  include/linux/fs.h      | 14 +++++++
> > > > > >  include/uapi/linux/fs.h | 26 ++++++++++++-
> > > > > >  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
> > > > > >  3 files changed, 108 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > > index e0d909d35763..54681f21e05e 100644
> > > > > > --- a/include/linux/fs.h
> > > > > > +++ b/include/linux/fs.h
> > > > > > @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
> > > > > >  /* File does not contribute to nr_files count */
> > > > > >  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
> > > > > >  
> > > > > > +/* File supports encoded IO */
> > > > > > +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> > > > > > +
> > > > > >  /*
> > > > > >   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
> > > > > >   * that indicates that they should check the contents of the iovec are
> > > > > > @@ -314,6 +317,7 @@ enum rw_hint {
> > > > > >  #define IOCB_SYNC		(1 << 5)
> > > > > >  #define IOCB_WRITE		(1 << 6)
> > > > > >  #define IOCB_NOWAIT		(1 << 7)
> > > > > > +#define IOCB_ENCODED		(1 << 8)
> > > > > >  
> > > > > >  struct kiocb {
> > > > > >  	struct file		*ki_filp;
> > > > > > @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
> > > > > >  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
> > > > > >  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
> > > > > >  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> > > > > > +struct encoded_iov;
> > > > > > +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> > > > > > +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> > > > > > +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> > > > > > +				struct iov_iter *);
> > > > > >  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
> > > > > >  				struct file *file_out, loff_t pos_out,
> > > > > >  				loff_t *count, unsigned int remap_flags);
> > > > > > @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
> > > > > >  			return -EOPNOTSUPP;
> > > > > >  		ki->ki_flags |= IOCB_NOWAIT;
> > > > > >  	}
> > > > > > +	if (flags & RWF_ENCODED) {
> > > > > > +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> > > > > > +			return -EOPNOTSUPP;
> > > > > > +		ki->ki_flags |= IOCB_ENCODED;
> > > > > > +	}
> > > > > >  	if (flags & RWF_HIPRI)
> > > > > >  		ki->ki_flags |= IOCB_HIPRI;
> > > > > >  	if (flags & RWF_DSYNC)
> > > > > > diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> > > > > > index 379a612f8f1d..ed92a8a257cb 100644
> > > > > > --- a/include/uapi/linux/fs.h
> > > > > > +++ b/include/uapi/linux/fs.h
> > > > > > @@ -284,6 +284,27 @@ struct fsxattr {
> > > > > >  
> > > > > >  typedef int __bitwise __kernel_rwf_t;
> > > > > >  
> > > > > > +enum {
> > > > > > +	ENCODED_IOV_COMPRESSION_NONE,
> > > > > > +	ENCODED_IOV_COMPRESSION_ZLIB,
> > > > > > +	ENCODED_IOV_COMPRESSION_LZO,
> > > > > > +	ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > > +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> > > > > > +};
> > > > > > +
> > > > > > +enum {
> > > > > > +	ENCODED_IOV_ENCRYPTION_NONE,
> > > > > > +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> > > > > > +};
> > > > > > +
> > > > > > +struct encoded_iov {
> > > > > > +	__u64 len;
> > > > > > +	__u64 unencoded_len;
> > > > > > +	__u64 unencoded_offset;
> > > > > > +	__u32 compression;
> > > > > > +	__u32 encryption;
> > > > > 
> > > > > Can we add some must-be-zero padding space at the end here for whomever
> > > > > comes along next wanting to add more encoding info?
> > > > 
> > > > I would suggest to copy the extension design of copy_struct_from_user().
> > > > Adding must-be-zero padding is a less-ideal solution to the extension
> > > > problem than length-based extension.
> > > 
> > > Come to think of it, you /do/ have to specify iov_len so... yeah, do
> > > that instead; we can always extend the structure later.
> > 
> > Just to clarify -- if we want to make the interface forward-compatible
> > from the outset (programs built 4 years from now running on 5.5), we
> > will need to implement this in the original merge. Otherwise userspace
> > will need to handle backwards-compatibility themselves once new features
> > are added.
> > 
> > @Omar: If it'd make your life easier, I can send some draft patches
> > 	   which port copy_struct_from_user() to iovec-land.
> 
> You're right, I didn't think about the case of newer programs on older
> kernels. I can do that for the next submission. RWF_ENCODED should
> probably translate the E2BIG from copy_struct_from_user() to EINVAL,
> though, to avoid ambiguity with the case that the buffer wasn't big
> enough to return the encoded data.

Yeah, that seems fair enough. I would've preferred to keep the error
semantics the same everywhere, but adding additional ambiguity to such
error cases isn't a good idea.

It's a bit of a shame we don't have more granular EINVALs to make it
easier to figure out *why* you got an EINVAL (then we wouldn't have had
to abuse E2BIG to indicate to userspace "you're using a new feature on
an old kernel") -- but that's a more generic problem that probably won't
be solved any time soon.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [RFC PATCH v2 1/5] fs: add O_ENCODED open flag
From: Aleksa Sarai @ 2019-10-30 23:17 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: linux-fsdevel, linux-btrfs, Dave Chinner, Jann Horn, linux-api,
	kernel-team
In-Reply-To: <20191030225540.GG326591@vader>

[-- Attachment #1: Type: text/plain, Size: 6220 bytes --]

On 2019-10-30, Omar Sandoval <osandov@osandov.com> wrote:
> On Sat, Oct 19, 2019 at 03:50:57PM +1100, Aleksa Sarai wrote:
> > On 2019-10-15, Omar Sandoval <osandov@osandov.com> wrote:
> > > From: Omar Sandoval <osandov@fb.com>
> > > 
> > > The upcoming RWF_ENCODED operation introduces some security concerns:
> > > 
> > > 1. Compressed writes will pass arbitrary data to decompression
> > >    algorithms in the kernel.
> > > 2. Compressed reads can leak truncated/hole punched data.
> > > 
> > > Therefore, we need to require privilege for RWF_ENCODED. It's not
> > > possible to do the permissions checks at the time of the read or write
> > > because, e.g., io_uring submits IO from a worker thread. So, add an open
> > > flag which requires CAP_SYS_ADMIN. It can also be set and cleared with
> > > fcntl(). The flag is not cleared in any way on fork or exec; it should
> > > probably be used with O_CLOEXEC in most cases.
> > > 
> > > Note that the usual issue that unknown open flags are ignored doesn't
> > > really matter for O_ENCODED; if the kernel doesn't support O_ENCODED,
> > > then it doesn't support RWF_ENCODED, either.
> > > 
> > > Signed-off-by: Omar Sandoval <osandov@fb.com>
> > > ---
> > >  fs/fcntl.c                       | 10 ++++++++--
> > >  fs/namei.c                       |  4 ++++
> > >  include/linux/fcntl.h            |  2 +-
> > >  include/uapi/asm-generic/fcntl.h |  4 ++++
> > >  4 files changed, 17 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > > index 3d40771e8e7c..45ebc6df078e 100644
> > > --- a/fs/fcntl.c
> > > +++ b/fs/fcntl.c
> > > @@ -30,7 +30,8 @@
> > >  #include <asm/siginfo.h>
> > >  #include <linux/uaccess.h>
> > >  
> > > -#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
> > > +#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME | \
> > > +		    O_ENCODED)
> > >  
> > >  static int setfl(int fd, struct file * filp, unsigned long arg)
> > >  {
> > > @@ -49,6 +50,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
> > >  		if (!inode_owner_or_capable(inode))
> > >  			return -EPERM;
> > >  
> > > +	/* O_ENCODED can only be set by superuser */
> > > +	if ((arg & O_ENCODED) && !(filp->f_flags & O_ENCODED) &&
> > > +	    !capable(CAP_SYS_ADMIN))
> > > +		return -EPERM;
> > 
> > I have a feeling the error should probably be an EACCES and not EPERM.
> 
> Shrug, I wanted to make this consistent with O_NOATIME, which uses
> EPERM. EACCES seems more appropriate for lacking permissions for a
> particular path rather than for an operation, but the lines are blurry.

Fair enough, though I would also argue that O_NOATIME should've also
been EACCES (and there are plenty of examples throughout the kernel
where EPERM was used where EACCES makes more sense). ;)

> > > +
> > >  	/* required for strict SunOS emulation */
> > >  	if (O_NONBLOCK != O_NDELAY)
> > >  	       if (arg & O_NDELAY)
> > > @@ -1031,7 +1037,7 @@ static int __init fcntl_init(void)
> > >  	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> > >  	 * is defined as O_NONBLOCK on some platforms and not on others.
> > >  	 */
> > > -	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> > > +	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
> > >  		HWEIGHT32(
> > >  			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> > >  			__FMODE_EXEC | __FMODE_NONOTIFY));
> > > diff --git a/fs/namei.c b/fs/namei.c
> > > index 671c3c1a3425..ae86b125888a 100644
> > > --- a/fs/namei.c
> > > +++ b/fs/namei.c
> > > @@ -2978,6 +2978,10 @@ static int may_open(const struct path *path, int acc_mode, int flag)
> > >  	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
> > >  		return -EPERM;
> > >  
> > > +	/* O_ENCODED can only be set by superuser */
> > > +	if ((flag & O_ENCODED) && !capable(CAP_SYS_ADMIN))
> > > +		return -EPERM;
> > 
> > I would suggest that this check be put into build_open_flags() rather
> > than putting it this late in open(). Also, same nit about the error
> > return as above.
> 
> This is where we check permissions for O_NOATIME, shouldn't we keep all
> of those permission checks in the same place? build_open_flags() only
> checks for flag validity.

Right, but O_NOATIME can't be checked earlier -- you need to have
resolved the inode in order to do the permission check. O_ENCODED only
depends on the capability set, and IMHO checking it earlier seems
cleaner to me.

> > > +
> > >  	return 0;
> > >  }
> > >  
> > > diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> > > index d019df946cb2..5fac02479639 100644
> > > --- a/include/linux/fcntl.h
> > > +++ b/include/linux/fcntl.h
> > > @@ -9,7 +9,7 @@
> > >  	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
> > >  	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
> > >  	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> > > -	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
> > > +	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_ENCODED)
> > >  
> > >  #ifndef force_o_largefile
> > >  #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
> > > diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> > > index 9dc0bf0c5a6e..8c5cbd5942e3 100644
> > > --- a/include/uapi/asm-generic/fcntl.h
> > > +++ b/include/uapi/asm-generic/fcntl.h
> > > @@ -97,6 +97,10 @@
> > >  #define O_NDELAY	O_NONBLOCK
> > >  #endif
> > >  
> > > +#ifndef O_ENCODED
> > > +#define O_ENCODED	040000000
> > > +#endif
> > 
> > You should also define this for all of the architectures which don't use
> > the generic O_* flag values. On alpha, O_PATH is equal to the value you
> > picked (just be careful on sparc -- 0x4000000 is the next free bit, but
> > it's used by FMODE_NONOTIFY.)
> 
> Good catch, I'll fix that. Thanks!

Oh, and please add a one-line comment in the sparc header to ensure
nobody accidentally breaks open() on sparc in the future.

-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v2] mm: Fix checking unmapped holes for mbind
From: Li Xinhai @ 2019-10-31  3:15 UTC (permalink / raw)
  To: linux-mm@kvack.org, n-horiguchi, akpm
  Cc: Vlastimil Babka, Michal Hocko, linux-kernel@vger.kernel.org,
	Linux API, Hugh Dickins
In-Reply-To: <201910291756045288126@gmail.com>

On 2019-10-29 at 17:56 Li Xinhai wrote:
>queue_pages_range() will check for unmapped holes besides queue pages for
>migration. The rules for checking unmapped holes are:
>1 Unmapped holes at any part of the specified range should be reported as
>  EFAULT if mbind() for none MPOL_DEFAULT cases;
>2 Unmapped holes at any part of the specified range should be ignored if
>  mbind() for MPOL_DEFAULT case;
>Note that the second rule is the current implementation, but it seems
>conflicts the Linux API definition.
>
>queue_pages_test_walk() is fixed by introduce new fields in struct
>queue_pages which help to check:
>1 holes at head and tail side of specified range;
>2 the whole range is in a hole;
>
>Besides, queue_pages_test_walk() must update previous vma record no matter
>the current vma should be considered for queue pages or not.
> 

More details about current issue (which breaks the mbind() API definition):
1. In queue_pages_test_walk()
checking on (!vma->vm_next && vma->vm_end < end) would never success, 
because 'end' passed from walk_page_test() make sure "end <=  vma->vm_end". so hole 
beyond the last vma can't be detected. 

2. queue_pages_test_walk() only called for vma, and 'start', 'end' parameters are guaranteed
within vma. Then, if the range starting or ending in an unmapped hole,  
queue_pages_test_walk() don't have chance to be called to check. In other words, the 
current code can detect this case (range span over hole):
[  vma  ][ hole ][ vma]
   [     range      ]
but cant not detect these case :
[  vma  ][ hole ][ vma]
   [  range  ]
[  vma  ][ hole ][  vma  ]
            [  range  ]

3. a checking in mbind_range() try to recover if the hole is in head side, but can't 
recover if hole is in tail side of range.

- Xinhai

>Fixes: 9d8cebd4bcd7 ("mm: fix mbind vma merge problem")
>Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()")
>Fixes: 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)")
>Signed-off-by: Li Xinhai <lixinhai.li@gmail.com>
>---
>Changes in v2:
>  - Fix the unmapped holes checking in queue_pages_test_walk() instead of 
>    mbind_rnage().
>
> mm/mempolicy.c | 44 +++++++++++++++++++++++++++++---------------
> 1 file changed, 29 insertions(+), 15 deletions(-)
>
>diff --git a/mm/mempolicy.c b/mm/mempolicy.c
>index 4ae967bcf954..24087dfa4dcd 100644
>--- a/mm/mempolicy.c
>+++ b/mm/mempolicy.c
>@@ -411,6 +411,9 @@ struct queue_pages {
> 	unsigned long flags;
> 	nodemask_t *nmask;
> 	struct vm_area_struct *prev;
>+	unsigned long start;
>+	unsigned long end;
>+	int in_hole;
> };
> 
> /*
>@@ -618,28 +621,31 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
> 	unsigned long endvma = vma->vm_end;
> 	unsigned long flags = qp->flags;
> 
>-	/*
>-	* Need check MPOL_MF_STRICT to return -EIO if possible
>-	* regardless of vma_migratable
>-	*/
>-	if (!vma_migratable(vma) &&
>-	   !(flags & MPOL_MF_STRICT))
>-	return 1;
>-
>+	/* range check first */
> 	if (endvma > end)
> 	endvma = end;
>-	if (vma->vm_start > start)
>-	start = vma->vm_start;
>+	BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
> 
>+	qp->in_hole = 0;
> 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
>-	if (!vma->vm_next && vma->vm_end < end)
>+	if ((!vma->vm_next && vma->vm_end < qp->end) ||
>+	(vma->vm_next && qp->end < vma->vm_next->vm_start))
> 	return -EFAULT;
>-	if (qp->prev && qp->prev->vm_end < vma->vm_start)
>+	if ((qp->prev && qp->prev->vm_end < vma->vm_start) ||
>+	(!qp->prev && qp->start < vma->vm_start))
> 	return -EFAULT;
> 	}
> 
> 	qp->prev = vma;
> 
>+	/*
>+	* Need check MPOL_MF_STRICT to return -EIO if possible
>+	* regardless of vma_migratable
>+	*/
>+	if (!vma_migratable(vma) &&
>+	   !(flags & MPOL_MF_STRICT))
>+	return 1;
>+
> 	if (flags & MPOL_MF_LAZY) {
> 	/* Similar to task_numa_work, skip inaccessible VMAs */
> 	if (!is_vm_hugetlb_page(vma) &&
>@@ -679,14 +685,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
> 	nodemask_t *nodes, unsigned long flags,
> 	struct list_head *pagelist)
> {
>+	int err;
> 	struct queue_pages qp = {
> 	.pagelist = pagelist,
> 	.flags = flags,
> 	.nmask = nodes,
> 	.prev = NULL,
>+	.start = start,
>+	.end = end,
>+	.in_hole = 1,
> 	};
> 
>-	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
>+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
>+	/* whole range in unmapped hole */
>+	if (qp->in_hole && !(flags & MPOL_MF_DISCONTIG_OK))
>+	err = -EFAULT;
>+
>+	return err;
> }
> 
> /*
>@@ -738,8 +753,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
> 	unsigned long vmend;
> 
> 	vma = find_vma(mm, start);
>-	if (!vma || vma->vm_start > start)
>-	return -EFAULT;
>+	BUG_ON(!vma);
> 
> 	prev = vma->vm_prev;
> 	if (start > vma->vm_start)
>-- 
>2.22.0

^ permalink raw reply

* Re: [PATCH v2] mm: Fix checking unmapped holes for mbind
From: Andrew Morton @ 2019-10-31  4:08 UTC (permalink / raw)
  To: Li Xinhai
  Cc: linux-mm@kvack.org, Babka, Hocko, linux-kernel@vger.kernel.org,
	API, Dickins, linux-man
In-Reply-To: <201910291756045288126@gmail.com>

(cc linux-man@vger.kernel.org)

On Tue, 29 Oct 2019 17:56:06 +0800 "Li Xinhai" <lixinhai.lxh@gmail.com> wrote:

> queue_pages_range() will check for unmapped holes besides queue pages for
> migration. The rules for checking unmapped holes are:
> 1 Unmapped holes at any part of the specified range should be reported as
>   EFAULT if mbind() for none MPOL_DEFAULT cases;
> 2 Unmapped holes at any part of the specified range should be ignored if
>   mbind() for MPOL_DEFAULT case;
> Note that the second rule is the current implementation, but it seems
> conflicts the Linux API definition.

Can you quote the part of the API definition which you're looking at?

My mbind(2) manpage says

ERRORS
       EFAULT Part or all of the memory range specified by nodemask and maxn-
              ode points outside your accessible address space.  Or, there was
              an unmapped hole in the specified memory range specified by addr
              and len.

(I assume the first sentence meant to say "specified by addr and len")

I agree with your interpretation, but there's no mention here that
MPOL_DEFAULT is treated differently and I don't see why it should be.


More broadly, I worry that it's too late to change this - existing
applications might fail if we change the implementation in the proposed
fashion.  So perhaps what we should do here is to change the manpage to
match reality?

Is the current behavior causing you any problems in a real-world use
case?

^ permalink raw reply

* Re: [PATCH v2] mm: Fix checking unmapped holes for mbind
From: Li Xinhai @ 2019-10-31  6:01 UTC (permalink / raw)
  To: akpm
  Cc: linux-mm@kvack.org, Vlastimil Babka, Michal Hocko,
	linux-kernel@vger.kernel.org, Linux API, Hugh Dickins, linux-man,
	n-horiguchi
In-Reply-To: <20191030210836.a17c0649354b59961903d1a8@linux-foundation.org>

On 2019-10-31 at 12:08 Andrew Morton wrote:
>(cc linux-man@vger.kernel.org)
>
>On Tue, 29 Oct 2019 17:56:06 +0800 "Li Xinhai" <lixinhai.lxh@gmail.com> wrote:
>
>> queue_pages_range() will check for unmapped holes besides queue pages for
>> migration. The rules for checking unmapped holes are:
>> 1 Unmapped holes at any part of the specified range should be reported as
>>   EFAULT if mbind() for none MPOL_DEFAULT cases;
>> 2 Unmapped holes at any part of the specified range should be ignored if
>>   mbind() for MPOL_DEFAULT case;
>> Note that the second rule is the current implementation, but it seems
>> conflicts the Linux API definition.
>
>Can you quote the part of the API definition which you're looking at?
>
>My mbind(2) manpage says
>
>ERRORS
>       EFAULT Part or all of the memory range specified by nodemask and maxn-
>              ode points outside your accessible address space.  Or, there was
>              an unmapped hole in the specified memory range specified by addr
>              and len.
>
>(I assume the first sentence meant to say "specified by addr and len")
> 

this part: 
"Or, there was an unmapped hole in the specified memory range specified by addr 
and len."
is concerned by my patch.

>I agree with your interpretation, but there's no mention here that
>MPOL_DEFAULT is treated differently and I don't see why it should be.
> 
The first rule match the manpage, but the current mempolicy implementation only 
reports EFAULT if holes are within range, or at the head side of range. No EFAULT 
reported if hole at tail side of range. I suppose the first rule has to be fixed.

The seconde rule, when MPOL_DEFAULT is used, was summarized by me according 
to mempolicy implementation. Actually, this rule does not follow manpage and exsits 
for long days. In my understanding, this rule is reasonable (in code,  the internal flag 
MPOL_MF_DISCONTIG_OK is used for that purpose, there is comments for reason) 
and we'd better keep it.

>
>More broadly, I worry that it's too late to change this - existing
>applications might fail if we change the implementation in the proposed
>fashion.  So perhaps what we should do here is to change the manpage to
>match reality?
> 
I prefer add description in manpage for the second rule, so no change to our code. 
Only fix for first rule.

>Is the current behavior causing you any problems in a real-world use
>case? 
I was using mbind() with MPOL_DEFAULT(or MPOL_BIND) to reset a range of address 
(which maybe contiguous or not in the whole range) to the default policy (to a specific 
node), and observed this issue. If mbind() call for each mapping one by one, we don't see the 
issue.

- Xinhai


^ permalink raw reply

* Re: [PATCH RFC] mm: add MAP_EXCLUSIVE to create exclusive user mappings
From: Mike Rapoport @ 2019-10-31  7:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: LKML, Alexey Dobriyan, Andrew Morton, Arnd Bergmann,
	Borislav Petkov, Dave Hansen, James Bottomley, Peter Zijlstra,
	Steven Rostedt, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Linux API, Linux-MM, X86 ML, Mike Rapoport
In-Reply-To: <CALCETrXajrY+0SmzkL7t++ndYwRoYLLE9VPKwSGSyW8HZx-TeA@mail.gmail.com>

On Wed, Oct 30, 2019 at 02:28:21PM -0700, Andy Lutomirski wrote:
> On Wed, Oct 30, 2019 at 1:40 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > On Tue, Oct 29, 2019 at 10:00:55AM -0700, Andy Lutomirski wrote:
> > > On Tue, Oct 29, 2019 at 2:33 AM Mike Rapoport <rppt@kernel.org> wrote:
> > > >
> > > > On Mon, Oct 28, 2019 at 02:44:23PM -0600, Andy Lutomirski wrote:
> > > > >
> > > > > > On Oct 27, 2019, at 4:17 AM, Mike Rapoport <rppt@kernel.org> wrote:
> > > > > >
> > > > > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > > > >
> > > > > > Hi,
> > > > > >
> > > > > > The patch below aims to allow applications to create mappins that have
> > > > > > pages visible only to the owning process. Such mappings could be used to
> > > > > > store secrets so that these secrets are not visible neither to other
> > > > > > processes nor to the kernel.
> > > > > >
> > > > > > I've only tested the basic functionality, the changes should be verified
> > > > > > against THP/migration/compaction. Yet, I'd appreciate early feedback.
> > > > >
> > > > > I’ve contemplated the concept a fair amount, and I think you should
> > > > > consider a change to the API. In particular, rather than having it be a
> > > > > MAP_ flag, make it a chardev.  You can, at least at first, allow only
> > > > > MAP_SHARED, and admins can decide who gets to use it.  It might also play
> > > > > better with the VM overall, and you won’t need a VM_ flag for it — you
> > > > > can just wire up .fault to do the right thing.
> > > >
> > > > I think mmap()/mprotect()/madvise() are the natural APIs for such
> > > > interface.
> > >
> > > Then you have a whole bunch of questions to answer.  For example:
> > >
> > > What happens if you mprotect() or similar when the mapping is already
> > > in use in a way that's incompatible with MAP_EXCLUSIVE?
> >
> > Then we refuse to mprotect()? Like in any other case when vm_flags are not
> > compatible with required madvise()/mprotect() operation.
> >
> 
> I'm not talking about flags.  I'm talking about the case where one
> thread (or RDMA or whatever) has get_user_pages()'d a mapping and
> another thread mprotect()s it MAP_EXCLUSIVE.
> 
> > > Is it actually reasonable to malloc() some memory and then make it exclusive?
> > >
> > > Are you permitted to map a file MAP_EXCLUSIVE?  What does it mean?
> >
> > I'd limit MAP_EXCLUSIVE only to anonymous memory.
> >
> > > What does MAP_PRIVATE | MAP_EXCLUSIVE do?
> >
> > My preference is to have only mmap() and then the semantics is more clear:
> >
> > MAP_PRIVATE | MAP_EXCLUSIVE creates a pre-populated region, marks it locked
> > and drops the pages in this region from the direct map.
> > The pages are returned back on munmap().
> > Then there is no way to change an existing area to be exclusive or vice
> > versa.
> 
> And what happens if you fork()?  Limiting it to MAP_SHARED |
> MAP_EXCLUSIVE would about this particular nasty question.
> 
> >
> > > How does one pass exclusive memory via SCM_RIGHTS?  (If it's a
> > > memfd-like or chardev interface, it's trivial.  mmap(), not so much.)
> >
> > Why passing such memory via SCM_RIGHTS would be useful?
> 
> Suppose I want to put a secret into exclusive memory and then send
> that secret to some other process.  The obvious approach would be to
> SCM_RIGHTS an fd over, but you can't do that with MAP_EXCLUSIVE as
> you've defined it.  In general, there are lots of use cases for memfd
> and other fd-backed memory.
> 
> >
> > > And finally, there's my personal giant pet peeve: a major use of this
> > > will be for virtualization.  I suspect that a lot of people would like
> > > the majority of KVM guest memory to be unmapped from the host
> > > pagetables.  But people might also like for guest memory to be
> > > unmapped in *QEMU's* pagetables, and mmap() is a basically worthless
> > > interface for this.  Getting fd-backed memory into a guest will take
> > > some possibly major work in the kernel, but getting vma-backed memory
> > > into a guest without mapping it in the host user address space seems
> > > much, much worse.
> >
> > Well, in my view, the MAP_EXCLUSIVE is intended to keep small secrets
> > rather than use it for the entire guest memory. I even considered adding a
> > limit for the mapping size, but then I decided that since RLIMIT_MEMLOCK is
> > anyway enforced there is no need for a new one.
> >
> > I agree that getting fd-backed memory into a guest would be less pain that
> > VMA, but KVM can already use memory outside the control of the kernel via
> > /dev/map [1].
> 
> That series doesn't address the problem I'm talking about at all.  I'm
> saying that there is a legitimate use case where QEMU should *not*
> have a mapping of the memory.  So QEMU would create some exclusive
> memory using /dev/exclusive_memory and would tell KVM to map it into
> the guest without mapping it into QEMU's address space at all.
> 
> (In fact, the way that SEV currently works is *functionally* like
> this, except that there's a bogus incoherent mapping in the QEMU
> process that is a giant can of worms.
> 
> 
> IMO a major benefit of a chardev approach is that you don't need a new
> VM_ flag and you don't need to worry about wiring it up everywhere in
> the core mm code.

Ok, at last I'm starting to see your and Christoph's point.

Just to reiterate, we can use fd-backed memory using /dev/exclusive_memory
chardev (or some other name we'll pick after long bikeshedding) and then
the .mmap method of this character device can do interesting things with
the backing physical memory. Since the memory is not VMA-mapped, we do not
have to find all the places in the core that might require a check of a VM_
flag to ensure there is no clashes with the exclusive memory.

Still, whatever we do with the mapping  properties of this memory, we need
a solution to the splitting of huge pages that map the direct map, but this
is an orthogonal problem in a way.

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH v2] mm: Fix checking unmapped holes for mbind
From: Naoya Horiguchi @ 2019-10-31  9:06 UTC (permalink / raw)
  To: Li Xinhai
  Cc: linux-mm@kvack.org, akpm, Vlastimil Babka, Michal Hocko,
	linux-kernel@vger.kernel.org, Linux API, Hugh Dickins
In-Reply-To: <2019103111150700409251@gmail.com>

On Thu, Oct 31, 2019 at 11:15:09AM +0800, Li Xinhai wrote:
> On 2019-10-29 at 17:56 Li Xinhai wrote:
> >queue_pages_range() will check for unmapped holes besides queue pages for
> >migration. The rules for checking unmapped holes are:
> >1 Unmapped holes at any part of the specified range should be reported as
> >  EFAULT if mbind() for none MPOL_DEFAULT cases;
> >2 Unmapped holes at any part of the specified range should be ignored if
> >  mbind() for MPOL_DEFAULT case;
> >Note that the second rule is the current implementation, but it seems
> >conflicts the Linux API definition.
> >
> >queue_pages_test_walk() is fixed by introduce new fields in struct
> >queue_pages which help to check:
> >1 holes at head and tail side of specified range;
> >2 the whole range is in a hole;
> >
> >Besides, queue_pages_test_walk() must update previous vma record no matter
> >the current vma should be considered for queue pages or not.
> > 
> 
> More details about current issue (which breaks the mbind() API definition):
> 1. In queue_pages_test_walk()
> checking on (!vma->vm_next && vma->vm_end < end) would never success, 
> because 'end' passed from walk_page_test() make sure "end <=  vma->vm_end". so hole 
> beyond the last vma can't be detected. 
> 
> 2. queue_pages_test_walk() only called for vma, and 'start', 'end' parameters are guaranteed
> within vma. Then, if the range starting or ending in an unmapped hole,  
> queue_pages_test_walk() don't have chance to be called to check. In other words, the 
> current code can detect this case (range span over hole):
> [  vma  ][ hole ][ vma]
>    [     range      ]
> but cant not detect these case :
> [  vma  ][ hole ][ vma]
>    [  range  ]
> [  vma  ][ hole ][  vma  ]
>             [  range  ]

IIRC, page table walker (walk_page_range()) should be designed to
handle these range inputs by separating into sub-ranges by vma
boundaries like below (with your notation):

  [  vma  ][ hole ][ vma  ]
     [    ][      ][  ]      // for your 1st case
     [    ][   ]             // for your 2nd case
              [   ][  ]      // for your 3rd case

And I found that .pte_hole is undefined in queue_pages_walk_ops.
So I'm guessing now that that's why hole regions are ignored and
the definition of EFAULT behavior in manpage is violated.
So providing proper .pte_hole callback could be another approach
for this issue which might fit better to the design.
IOW, queue_pages_test_walk() might not the right place to handle
hole regions by definition.

Thanks,
Naoya Horiguchi

> 
> 3. a checking in mbind_range() try to recover if the hole is in head side, but can't 
> recover if hole is in tail side of range.
> 
> - Xinhai
> 
> >Fixes: 9d8cebd4bcd7 ("mm: fix mbind vma merge problem")
> >Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()")
> >Fixes: 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)")
> >Signed-off-by: Li Xinhai <lixinhai.li@gmail.com>
> >---
> >Changes in v2:
> >  - Fix the unmapped holes checking in queue_pages_test_walk() instead of 
> >    mbind_rnage().
> >
> > mm/mempolicy.c | 44 +++++++++++++++++++++++++++++---------------
> > 1 file changed, 29 insertions(+), 15 deletions(-)
> >
> >diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> >index 4ae967bcf954..24087dfa4dcd 100644
> >--- a/mm/mempolicy.c
> >+++ b/mm/mempolicy.c
> >@@ -411,6 +411,9 @@ struct queue_pages {
> > 	unsigned long flags;
> > 	nodemask_t *nmask;
> > 	struct vm_area_struct *prev;
> >+	unsigned long start;
> >+	unsigned long end;
> >+	int in_hole;
> > };
> > 
> > /*
> >@@ -618,28 +621,31 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
> > 	unsigned long endvma = vma->vm_end;
> > 	unsigned long flags = qp->flags;
> > 
> >-	/*
> >-	* Need check MPOL_MF_STRICT to return -EIO if possible
> >-	* regardless of vma_migratable
> >-	*/
> >-	if (!vma_migratable(vma) &&
> >-	   !(flags & MPOL_MF_STRICT))
> >-	return 1;
> >-
> >+	/* range check first */
> > 	if (endvma > end)
> > 	endvma = end;
> >-	if (vma->vm_start > start)
> >-	start = vma->vm_start;
> >+	BUG_ON((vma->vm_start > start) || (vma->vm_end < end));
> > 
> >+	qp->in_hole = 0;
> > 	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
> >-	if (!vma->vm_next && vma->vm_end < end)
> >+	if ((!vma->vm_next && vma->vm_end < qp->end) ||
> >+	(vma->vm_next && qp->end < vma->vm_next->vm_start))
> > 	return -EFAULT;
> >-	if (qp->prev && qp->prev->vm_end < vma->vm_start)
> >+	if ((qp->prev && qp->prev->vm_end < vma->vm_start) ||
> >+	(!qp->prev && qp->start < vma->vm_start))
> > 	return -EFAULT;
> > 	}
> > 
> > 	qp->prev = vma;
> > 
> >+	/*
> >+	* Need check MPOL_MF_STRICT to return -EIO if possible
> >+	* regardless of vma_migratable
> >+	*/
> >+	if (!vma_migratable(vma) &&
> >+	   !(flags & MPOL_MF_STRICT))
> >+	return 1;
> >+
> > 	if (flags & MPOL_MF_LAZY) {
> > 	/* Similar to task_numa_work, skip inaccessible VMAs */
> > 	if (!is_vm_hugetlb_page(vma) &&
> >@@ -679,14 +685,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
> > 	nodemask_t *nodes, unsigned long flags,
> > 	struct list_head *pagelist)
> > {
> >+	int err;
> > 	struct queue_pages qp = {
> > 	.pagelist = pagelist,
> > 	.flags = flags,
> > 	.nmask = nodes,
> > 	.prev = NULL,
> >+	.start = start,
> >+	.end = end,
> >+	.in_hole = 1,
> > 	};
> > 
> >-	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
> >+	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
> >+	/* whole range in unmapped hole */
> >+	if (qp->in_hole && !(flags & MPOL_MF_DISCONTIG_OK))
> >+	err = -EFAULT;
> >+
> >+	return err;
> > }
> > 
> > /*
> >@@ -738,8 +753,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
> > 	unsigned long vmend;
> > 
> > 	vma = find_vma(mm, start);
> >-	if (!vma || vma->vm_start > start)
> >-	return -EFAULT;
> >+	BUG_ON(!vma);
> > 
> > 	prev = vma->vm_prev;
> > 	if (start > vma->vm_start)
> >-- 
> >2.22.0

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox