Linux userland API discussions
 help / color / mirror / Atom feed
* Re: [PATCH v1 1/2] pid: add pidfd_open()
From: Joel Fernandes @ 2019-05-18  9:48 UTC (permalink / raw)
  To: Christian Brauner
  Cc: jannh, oleg, viro, torvalds, linux-kernel, arnd, akpm, cyphar,
	dhowells, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
	linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k, linux-mips,
	linux-parisc, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	linux-xtensa, linux-api, linux-arch, linux-kselftest, dancol,
	serge, su
In-Reply-To: <20190516135944.7205-1-christian@brauner.io>

Hi Christian,

For next revision, could you also CC surenb@google.com as well? He is also
working on the low memory killer. And also suggest CC to
kernel-team@android.com. And mentioned some comments below, thanks.

On Thu, May 16, 2019 at 03:59:42PM +0200, Christian Brauner wrote:
[snip]  
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 20881598bdfa..4afca3d6dcb8 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -38,6 +38,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/proc_ns.h>
>  #include <linux/proc_fs.h>
> +#include <linux/sched/signal.h>
>  #include <linux/sched/task.h>
>  #include <linux/idr.h>
>  
> @@ -451,6 +452,55 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
>  	return idr_get_next(&ns->idr, &nr);
>  }
>  
> +/**
> + * pidfd_open() - Open new pid file descriptor.
> + *
> + * @pid:   pid for which to retrieve a pidfd
> + * @flags: flags to pass
> + *
> + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> + * the process identified by @pid. Currently, the process identified by
> + * @pid must be a thread-group leader. This restriction currently exists
> + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> + * leaders).
> + *
> + * Return: On success, a cloexec pidfd is returned.
> + *         On error, a negative errno number will be returned.
> + */
> +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> +{
> +	int fd, ret;
> +	struct pid *p;
> +	struct task_struct *tsk;
> +
> +	if (flags)
> +		return -EINVAL;
> +
> +	if (pid <= 0)
> +		return -EINVAL;
> +
> +	p = find_get_pid(pid);
> +	if (!p)
> +		return -ESRCH;
> +
> +	ret = 0;
> +	rcu_read_lock();
> +	/*
> +	 * If this returns non-NULL the pid was used as a thread-group
> +	 * leader. Note, we race with exec here: If it changes the
> +	 * thread-group leader we might return the old leader.
> +	 */
> +	tsk = pid_task(p, PIDTYPE_TGID);

Just trying to understand the comment here. The issue is that we might either
return the new leader, or the old leader depending on the overlap with
concurrent de_thread right? In either case, we don't care though.

I suggest to remove the "Note..." part of the comment since it doesn't seem the
race is relevant here unless we are doing something else with tsk in the
function, but if you want to keep it that's also fine. Comment text should
probably should be 'return the new leader' though.

> +	if (!tsk)
> +		ret = -ESRCH;

Perhaps -EINVAL?  AFAICS, this can only happen if a CLONE_THREAD pid was
passed as argument to pidfd_open which is invalid. But let me know what you
had in mind..

thanks,

 - Joel

> +	rcu_read_unlock();
> +
> +	fd = ret ?: pidfd_create(p);
> +	put_pid(p);
> +	return fd;
> +}
> +
>  void __init pid_idr_init(void)
>  {
>  	/* Verify no one has done anything silly: */
> -- 
> 2.21.0
> 

^ permalink raw reply

* Re: [PATCH v1 1/2] pid: add pidfd_open()
From: Christian Brauner @ 2019-05-18 10:04 UTC (permalink / raw)
  To: Joel Fernandes
  Cc: jannh, oleg, viro, torvalds, linux-kernel, arnd, akpm, cyphar,
	dhowells, ebiederm, elena.reshetova, keescook, luto, luto, tglx,
	linux-alpha, linux-arm-kernel, linux-ia64, linux-m68k, linux-mips,
	linux-parisc, linuxppc-dev, linux-s390, linux-sh, sparclinux,
	linux-xtensa, linux-api, linux-arch, linux-kselftest, dancol,
	serge, su
In-Reply-To: <20190516224949.GA15401@localhost>

On Sat, May 18, 2019 at 05:48:03AM -0400, Joel Fernandes wrote:
> Hi Christian,
> 
> For next revision, could you also CC surenb@google.com as well? He is also
> working on the low memory killer. And also suggest CC to
> kernel-team@android.com. And mentioned some comments below, thanks.

Yip, totally. Just added them both to my Cc list. :)
(I saw you added Suren manually. I added the Android kernel team now too.)

> 
> On Thu, May 16, 2019 at 03:59:42PM +0200, Christian Brauner wrote:
> [snip]  
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 20881598bdfa..4afca3d6dcb8 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -38,6 +38,7 @@
> >  #include <linux/syscalls.h>
> >  #include <linux/proc_ns.h>
> >  #include <linux/proc_fs.h>
> > +#include <linux/sched/signal.h>
> >  #include <linux/sched/task.h>
> >  #include <linux/idr.h>
> >  
> > @@ -451,6 +452,55 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
> >  	return idr_get_next(&ns->idr, &nr);
> >  }
> >  
> > +/**
> > + * pidfd_open() - Open new pid file descriptor.
> > + *
> > + * @pid:   pid for which to retrieve a pidfd
> > + * @flags: flags to pass
> > + *
> > + * This creates a new pid file descriptor with the O_CLOEXEC flag set for
> > + * the process identified by @pid. Currently, the process identified by
> > + * @pid must be a thread-group leader. This restriction currently exists
> > + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
> > + * be used with CLONE_THREAD) and pidfd polling (only supports thread group
> > + * leaders).
> > + *
> > + * Return: On success, a cloexec pidfd is returned.
> > + *         On error, a negative errno number will be returned.
> > + */
> > +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
> > +{
> > +	int fd, ret;
> > +	struct pid *p;
> > +	struct task_struct *tsk;
> > +
> > +	if (flags)
> > +		return -EINVAL;
> > +
> > +	if (pid <= 0)
> > +		return -EINVAL;
> > +
> > +	p = find_get_pid(pid);
> > +	if (!p)
> > +		return -ESRCH;
> > +
> > +	ret = 0;
> > +	rcu_read_lock();
> > +	/*
> > +	 * If this returns non-NULL the pid was used as a thread-group
> > +	 * leader. Note, we race with exec here: If it changes the
> > +	 * thread-group leader we might return the old leader.
> > +	 */
> > +	tsk = pid_task(p, PIDTYPE_TGID);
> 
> Just trying to understand the comment here. The issue is that we might either
> return the new leader, or the old leader depending on the overlap with
> concurrent de_thread right? In either case, we don't care though.
> 
> I suggest to remove the "Note..." part of the comment since it doesn't seem the
> race is relevant here unless we are doing something else with tsk in the
> function, but if you want to keep it that's also fine. Comment text should
> probably should be 'return the new leader' though.

Nah, I actually removed the comment already independently (cf. see [1]).

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/commit/?h=pidfd_open&id=dcfc98c2d957bf3ac14b06414cb2cf4c673fc297
> 
> > +	if (!tsk)
> > +		ret = -ESRCH;
> 
> Perhaps -EINVAL?  AFAICS, this can only happen if a CLONE_THREAD pid was
> passed as argument to pidfd_open which is invalid. But let me know what you
> had in mind..

Hm, from the kernel's perspective ESRCH is correct but I guess EINVAL is
nicer for userspace. So I don't have objections to using EINVAL. My
first version did too I think.

Thanks!
Christian

^ permalink raw reply

* Re: [PATCH v2] net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics.
From: Andy Lutomirski @ 2019-05-18 17:18 UTC (permalink / raw)
  To: Felipe Gasper; +Cc: davem, viro, linux-kernel, netdev, linux-api
In-Reply-To: <20190518034820.16500-1-felipe@felipegasper.com>



> On May 17, 2019, at 8:48 PM, Felipe Gasper <felipe@felipegasper.com> wrote:
> 
> Author: Felipe Gasper <felipe@felipegasper.com>
> Date:   Fri May 17 16:54:40 2019 -0500
> 
>   net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics.
> 
>   This adds the ability for Netlink to report a socket’s UID along with the
>   other UNIX diagnostic information that is already available. This will
>   allow diagnostic tools greater insight into which users control which socket.
> 
>   Signed-off-by: Felipe Gasper <felipe@felipegasper.com>
> 
> diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
> index 5c502fd..a198857 100644
> --- a/include/uapi/linux/unix_diag.h
> +++ b/include/uapi/linux/unix_diag.h
> @@ -20,6 +20,7 @@ struct unix_diag_req {
> #define UDIAG_SHOW_ICONS    0x00000008    /* show pending connections */
> #define UDIAG_SHOW_RQLEN    0x00000010    /* show skb receive queue len */
> #define UDIAG_SHOW_MEMINFO    0x00000020    /* show memory info of a socket */
> +#define UDIAG_SHOW_UID        0x00000040    /* show socket's UID */
> 
> struct unix_diag_msg {
>    __u8    udiag_family;
> @@ -40,6 +41,7 @@ enum {
>    UNIX_DIAG_RQLEN,
>    UNIX_DIAG_MEMINFO,
>    UNIX_DIAG_SHUTDOWN,
> +    UNIX_DIAG_UID,
> 
>    __UNIX_DIAG_MAX,
> };
> diff --git a/net/unix/diag.c b/net/unix/diag.c
> index 3183d9b..e40f348 100644
> --- a/net/unix/diag.c
> +++ b/net/unix/diag.c
> @@ -4,9 +4,11 @@
> #include <linux/unix_diag.h>
> #include <linux/skbuff.h>
> #include <linux/module.h>
> +#include <linux/uidgid.h>
> #include <net/netlink.h>
> #include <net/af_unix.h>
> #include <net/tcp_states.h>
> +#include <net/sock.h>
> 
> static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
> {
> @@ -110,6 +112,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
>    return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
> }
> 
> +static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb)
> +{
> +    uid_t uid = from_kuid_munged(sk_user_ns(sk), sock_i_uid(sk));
> +    return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);

This still looks wrong. You’re reporting the uid of the socket as seen by that socket.  Presumably you actually want the uid of the socket as seen by the *diagnostic* socket. This might be sk_user_ns(nlskb->sk).

You can test this with a command like unshare -U -r nc -l 12345 run as no -root. Then run you user diagnostic tool to find the uid of the socket. It should not be zero. Similarly, if you run unshare -U -r bash and then open a socket and run your diagnostic tool, both from that same session, you should see 0 as the uid since bash and all its children think they’re uid 0.

> +}

> +
> static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
>        u32 portid, u32 seq, u32 flags, int sk_ino)
> {
> @@ -156,6 +164,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
>    if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
>        goto out_nlmsg_trim;
> 
> +    if ((req->udiag_show & UDIAG_SHOW_UID) &&
> +        sk_diag_dump_uid(sk, skb))
> +        goto out_nlmsg_trim;
> +
>    nlmsg_end(skb, nlh);
>    return 0;
> 

^ permalink raw reply

* [PATCH v3] net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics.
From: Felipe Gasper @ 2019-05-19  1:38 UTC (permalink / raw)
  To: davem, viro, linux-kernel, netdev, linux-api

Author: Felipe Gasper <felipe@felipegasper.com>
Date:   Sat May 18 20:04:40 2019 -0500

   net: Add UNIX_DIAG_UID to Netlink UNIX socket diagnostics.

   This adds the ability for Netlink to report a socket's UID along with the
   other UNIX diagnostic information that is already available. This will
   allow diagnostic tools greater insight into which users control which
   socket.

   To test this, do the following as a non-root user:

        unshare -U -r bash
        nc -l -U user.socket.$$ &

   .. and verify from within that same session that Netlink UNIX socket
   diagnostics report the socket's UID as 0. Also verify that Netlink UNIX
   socket diagnostics report the socket's UID as the user's UID from an
   unprivileged process in a different session. Verify the same from
   a root process.

   Signed-off-by: Felipe Gasper <felipe@felipegasper.com>

diff --git a/include/uapi/linux/unix_diag.h b/include/uapi/linux/unix_diag.h
index 5c502fd..a198857 100644
--- a/include/uapi/linux/unix_diag.h
+++ b/include/uapi/linux/unix_diag.h
@@ -20,6 +20,7 @@ struct unix_diag_req {
 #define UDIAG_SHOW_ICONS	0x00000008	/* show pending connections */
 #define UDIAG_SHOW_RQLEN	0x00000010	/* show skb receive queue len */
 #define UDIAG_SHOW_MEMINFO	0x00000020	/* show memory info of a socket */
+#define UDIAG_SHOW_UID		0x00000040	/* show socket's UID */
 
 struct unix_diag_msg {
 	__u8	udiag_family;
@@ -40,6 +41,7 @@ enum {
 	UNIX_DIAG_RQLEN,
 	UNIX_DIAG_MEMINFO,
 	UNIX_DIAG_SHUTDOWN,
+	UNIX_DIAG_UID,
 
 	__UNIX_DIAG_MAX,
 };
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 3183d9b..e40f348 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -4,9 +4,11 @@
 #include <linux/unix_diag.h>
 #include <linux/skbuff.h>
 #include <linux/module.h>
+#include <linux/uidgid.h>
 #include <net/netlink.h>
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
+#include <net/sock.h>
 
 static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 {
@@ -110,6 +112,12 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
 	return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
 }
 
+static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb)
+{
+	uid_t uid = from_kuid_munged(sk_user_ns(nlskb->sk), sock_i_uid(sk));
+	return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid);
+}
+
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
 		u32 portid, u32 seq, u32 flags, int sk_ino)
 {
@@ -156,6 +164,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	if (nla_put_u8(skb, UNIX_DIAG_SHUTDOWN, sk->sk_shutdown))
 		goto out_nlmsg_trim;
 
+	if ((req->udiag_show & UDIAG_SHOW_UID) &&
+	    sk_diag_dump_uid(sk, skb))
+		goto out_nlmsg_trim;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 

^ permalink raw reply related

* Re: [RFC 1/7] mm: introduce MADV_COOL
From: Michal Hocko @ 2019-05-20  8:16 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-2-minchan@kernel.org>

[CC linux-api]

On Mon 20-05-19 12:52:48, Minchan Kim wrote:
> When a process expects no accesses to a certain memory range
> it could hint kernel that the pages can be reclaimed
> when memory pressure happens but data should be preserved
> for future use.  This could reduce workingset eviction so it
> ends up increasing performance.
> 
> This patch introduces the new MADV_COOL hint to madvise(2)
> syscall. MADV_COOL can be used by a process to mark a memory range
> as not expected to be used in the near future. The hint can help
> kernel in deciding which pages to evict early during memory
> pressure.

I do not want to start naming fight but MADV_COOL sounds a bit
misleading. Everybody thinks his pages are cool ;). Probably MADV_COLD
or MADV_DONTNEED_PRESERVE.

> Internally, it works via deactivating memory from active list to
> inactive's head so when the memory pressure happens, they will be
> reclaimed earlier than other active pages unless there is no
> access until the time.

Could you elaborate about the decision to move to the head rather than
tail? What should happen to inactive pages? Should we move them to the
tail? Your implementation seems to ignore those completely. Why?

What should happen for shared pages? In other words do we want to allow
less privileged process to control evicting of shared pages with a more
privileged one? E.g. think of all sorts of side channel attacks. Maybe
we want to do the same thing as for mincore where write access is
required.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 2/2] initramfs: introduce do_readxattrs()
From: Roberto Sassu @ 2019-05-20  8:16 UTC (permalink / raw)
  To: Arvind Sankar
  Cc: hpa, viro, linux-security-module, linux-integrity, initramfs,
	linux-api, linux-fsdevel, linux-kernel, zohar, silviu.vlasceanu,
	dmitry.kasatkin, takondra, kamensky, arnd, rob, james.w.mcmechan,
	niveditas98
In-Reply-To: <20190517211014.GA9198@rani.riverdale.lan>

On 5/17/2019 11:10 PM, Arvind Sankar wrote:
> On Fri, May 17, 2019 at 05:02:20PM -0400, Arvind Sankar wrote:
>> On Fri, May 17, 2019 at 01:18:11PM -0700, hpa@zytor.com wrote:
>>>
>>> Ok... I just realized this does not work for a modular initramfs, composed at load time from multiple files, which is a very real problem. Should be easy enough to deal with: instead of one large file, use one companion file per source file, perhaps something like filename..xattrs (suggesting double dots to make it less likely to conflict with a "real" file.) No leading dot, as it makes it more likely that archivers will sort them before the file proper.
>> This version of the patch was changed from the previous one exactly to deal with this case --
>> it allows for the bootloader to load multiple initramfs archives, each
>> with its own .xattr-list file, and to have that work properly.
>> Could you elaborate on the issue that you see?
> Roberto, are you missing a changelog entry for v2->v3 change?

The changelog for v1->v2 is missing.

Thanks

Roberto

-- 
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

^ permalink raw reply

* Re: [RFC 1/7] mm: introduce MADV_COOL
From: Michal Hocko @ 2019-05-20  8:19 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520081621.GV6836@dhcp22.suse.cz>

On Mon 20-05-19 10:16:21, Michal Hocko wrote:
> [CC linux-api]
> 
> On Mon 20-05-19 12:52:48, Minchan Kim wrote:
> > When a process expects no accesses to a certain memory range
> > it could hint kernel that the pages can be reclaimed
> > when memory pressure happens but data should be preserved
> > for future use.  This could reduce workingset eviction so it
> > ends up increasing performance.
> > 
> > This patch introduces the new MADV_COOL hint to madvise(2)
> > syscall. MADV_COOL can be used by a process to mark a memory range
> > as not expected to be used in the near future. The hint can help
> > kernel in deciding which pages to evict early during memory
> > pressure.
> 
> I do not want to start naming fight but MADV_COOL sounds a bit
> misleading. Everybody thinks his pages are cool ;). Probably MADV_COLD
> or MADV_DONTNEED_PRESERVE.

OK, I can see that you have used MADV_COLD for a different mode.
So this one is effectively a non destructive MADV_FREE alternative
so MADV_FREE_PRESERVE would sound like a good fit. Your MADV_COLD
in other patch would then be MADV_DONTNEED_PRESERVE. Right?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [RFC 3/7] mm: introduce MADV_COLD
From: Michal Hocko @ 2019-05-20  8:27 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-4-minchan@kernel.org>

[Cc linux-api]

On Mon 20-05-19 12:52:50, Minchan Kim wrote:
> When a process expects no accesses to a certain memory range
> for a long time, it could hint kernel that the pages can be
> reclaimed instantly but data should be preserved for future use.
> This could reduce workingset eviction so it ends up increasing
> performance.
> 
> This patch introduces the new MADV_COLD hint to madvise(2)
> syscall. MADV_COLD can be used by a process to mark a memory range
> as not expected to be used for a long time. The hint can help
> kernel in deciding which pages to evict proactively.

As mentioned in other email this looks like a non-destructive
MADV_DONTNEED alternative.

> Internally, it works via reclaiming memory in process context
> the syscall is called. If the page is dirty but backing storage
> is not synchronous device, the written page will be rotate back
> into LRU's tail once the write is done so they will reclaim easily
> when memory pressure happens. If backing storage is
> synchrnous device(e.g., zram), hte page will be reclaimed instantly.

Why do we special case async backing storage? Please always try to
explain _why_ the decision is made.

I haven't checked the implementation yet so I cannot comment on that.

> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/linux/swap.h                   |   1 +
>  include/uapi/asm-generic/mman-common.h |   1 +
>  mm/madvise.c                           | 123 +++++++++++++++++++++++++
>  mm/vmscan.c                            |  74 +++++++++++++++
>  4 files changed, 199 insertions(+)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 64795abea003..7f32a948fc6a 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -365,6 +365,7 @@ extern int vm_swappiness;
>  extern int remove_mapping(struct address_space *mapping, struct page *page);
>  extern unsigned long vm_total_pages;
>  
> +extern unsigned long reclaim_pages(struct list_head *page_list);
>  #ifdef CONFIG_NUMA
>  extern int node_reclaim_mode;
>  extern int sysctl_min_unmapped_ratio;
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index f7a4a5d4b642..b9b51eeb8e1a 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -43,6 +43,7 @@
>  #define MADV_WILLNEED	3		/* will need these pages */
>  #define MADV_DONTNEED	4		/* don't need these pages */
>  #define MADV_COOL	5		/* deactivatie these pages */
> +#define MADV_COLD	6		/* reclaim these pages */
>  
>  /* common parameters: try to keep these consistent across architectures */
>  #define MADV_FREE	8		/* free pages only if memory pressure */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index c05817fb570d..9a6698b56845 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -42,6 +42,7 @@ static int madvise_need_mmap_write(int behavior)
>  	case MADV_WILLNEED:
>  	case MADV_DONTNEED:
>  	case MADV_COOL:
> +	case MADV_COLD:
>  	case MADV_FREE:
>  		return 0;
>  	default:
> @@ -416,6 +417,125 @@ static long madvise_cool(struct vm_area_struct *vma,
>  	return 0;
>  }
>  
> +static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> +				unsigned long end, struct mm_walk *walk)
> +{
> +	pte_t *orig_pte, *pte, ptent;
> +	spinlock_t *ptl;
> +	LIST_HEAD(page_list);
> +	struct page *page;
> +	int isolated = 0;
> +	struct vm_area_struct *vma = walk->vma;
> +	unsigned long next;
> +
> +	next = pmd_addr_end(addr, end);
> +	if (pmd_trans_huge(*pmd)) {
> +		spinlock_t *ptl;
> +
> +		ptl = pmd_trans_huge_lock(pmd, vma);
> +		if (!ptl)
> +			return 0;
> +
> +		if (is_huge_zero_pmd(*pmd))
> +			goto huge_unlock;
> +
> +		page = pmd_page(*pmd);
> +		if (page_mapcount(page) > 1)
> +			goto huge_unlock;
> +
> +		if (next - addr != HPAGE_PMD_SIZE) {
> +			int err;
> +
> +			get_page(page);
> +			spin_unlock(ptl);
> +			lock_page(page);
> +			err = split_huge_page(page);
> +			unlock_page(page);
> +			put_page(page);
> +			if (!err)
> +				goto regular_page;
> +			return 0;
> +		}
> +
> +		if (isolate_lru_page(page))
> +			goto huge_unlock;
> +
> +		list_add(&page->lru, &page_list);
> +huge_unlock:
> +		spin_unlock(ptl);
> +		reclaim_pages(&page_list);
> +		return 0;
> +	}
> +
> +	if (pmd_trans_unstable(pmd))
> +		return 0;
> +regular_page:
> +	orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	for (pte = orig_pte; addr < end; pte++, addr += PAGE_SIZE) {
> +		ptent = *pte;
> +		if (!pte_present(ptent))
> +			continue;
> +
> +		page = vm_normal_page(vma, addr, ptent);
> +		if (!page)
> +			continue;
> +
> +		if (page_mapcount(page) > 1)
> +			continue;
> +
> +		if (isolate_lru_page(page))
> +			continue;
> +
> +		isolated++;
> +		list_add(&page->lru, &page_list);
> +		if (isolated >= SWAP_CLUSTER_MAX) {
> +			pte_unmap_unlock(orig_pte, ptl);
> +			reclaim_pages(&page_list);
> +			isolated = 0;
> +			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +			orig_pte = pte;
> +		}
> +	}
> +
> +	pte_unmap_unlock(orig_pte, ptl);
> +	reclaim_pages(&page_list);
> +	cond_resched();
> +
> +	return 0;
> +}
> +
> +static void madvise_cold_page_range(struct mmu_gather *tlb,
> +			     struct vm_area_struct *vma,
> +			     unsigned long addr, unsigned long end)
> +{
> +	struct mm_walk warm_walk = {
> +		.pmd_entry = madvise_cold_pte_range,
> +		.mm = vma->vm_mm,
> +	};
> +
> +	tlb_start_vma(tlb, vma);
> +	walk_page_range(addr, end, &warm_walk);
> +	tlb_end_vma(tlb, vma);
> +}
> +
> +
> +static long madvise_cold(struct vm_area_struct *vma,
> +			unsigned long start_addr, unsigned long end_addr)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct mmu_gather tlb;
> +
> +	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
> +		return -EINVAL;
> +
> +	lru_add_drain();
> +	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> +	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> +	tlb_finish_mmu(&tlb, start_addr, end_addr);
> +
> +	return 0;
> +}
> +
>  static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
>  				unsigned long end, struct mm_walk *walk)
>  
> @@ -806,6 +926,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  		return madvise_willneed(vma, prev, start, end);
>  	case MADV_COOL:
>  		return madvise_cool(vma, start, end);
> +	case MADV_COLD:
> +		return madvise_cold(vma, start, end);
>  	case MADV_FREE:
>  	case MADV_DONTNEED:
>  		return madvise_dontneed_free(vma, prev, start, end, behavior);
> @@ -828,6 +950,7 @@ madvise_behavior_valid(int behavior)
>  	case MADV_DONTNEED:
>  	case MADV_FREE:
>  	case MADV_COOL:
> +	case MADV_COLD:
>  #ifdef CONFIG_KSM
>  	case MADV_MERGEABLE:
>  	case MADV_UNMERGEABLE:
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a28e5d17b495..1701b31f70a8 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2096,6 +2096,80 @@ static void shrink_active_list(unsigned long nr_to_scan,
>  			nr_deactivate, nr_rotated, sc->priority, file);
>  }
>  
> +unsigned long reclaim_pages(struct list_head *page_list)
> +{
> +	int nid = -1;
> +	unsigned long nr_isolated[2] = {0, };
> +	unsigned long nr_reclaimed = 0;
> +	LIST_HEAD(node_page_list);
> +	struct reclaim_stat dummy_stat;
> +	struct scan_control sc = {
> +		.gfp_mask = GFP_KERNEL,
> +		.priority = DEF_PRIORITY,
> +		.may_writepage = 1,
> +		.may_unmap = 1,
> +		.may_swap = 1,
> +	};
> +
> +	while (!list_empty(page_list)) {
> +		struct page *page;
> +
> +		page = lru_to_page(page_list);
> +		list_del(&page->lru);
> +
> +		if (nid == -1) {
> +			nid = page_to_nid(page);
> +			INIT_LIST_HEAD(&node_page_list);
> +			nr_isolated[0] = nr_isolated[1] = 0;
> +		}
> +
> +		if (nid == page_to_nid(page)) {
> +			list_add(&page->lru, &node_page_list);
> +			nr_isolated[!!page_is_file_cache(page)] +=
> +						hpage_nr_pages(page);
> +			continue;
> +		}
> +
> +		nid = page_to_nid(page);
> +
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					nr_isolated[1]);
> +		nr_reclaimed += shrink_page_list(&node_page_list,
> +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> +				&dummy_stat, true);
> +		while (!list_empty(&node_page_list)) {
> +			struct page *page = lru_to_page(page_list);
> +
> +			list_del(&page->lru);
> +			putback_lru_page(page);
> +		}
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					-nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					-nr_isolated[1]);
> +		nr_isolated[0] = nr_isolated[1] = 0;
> +		INIT_LIST_HEAD(&node_page_list);
> +	}
> +
> +	if (!list_empty(&node_page_list)) {
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					nr_isolated[1]);
> +		nr_reclaimed += shrink_page_list(&node_page_list,
> +				NODE_DATA(nid), &sc, TTU_IGNORE_ACCESS,
> +				&dummy_stat, true);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_ANON,
> +					-nr_isolated[0]);
> +		mod_node_page_state(NODE_DATA(nid), NR_ISOLATED_FILE,
> +					-nr_isolated[1]);
> +	}
> +
> +	return nr_reclaimed;
> +}
> +
>  /*
>   * The inactive anon list should be small enough that the VM never has
>   * to do too much work.
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 2/2] initramfs: introduce do_readxattrs()
From: Roberto Sassu @ 2019-05-20  8:47 UTC (permalink / raw)
  To: hpa, viro
  Cc: linux-security-module, linux-integrity, initramfs, linux-api,
	linux-fsdevel, linux-kernel, zohar, silviu.vlasceanu,
	dmitry.kasatkin, takondra, kamensky, arnd, rob, james.w.mcmechan,
	niveditas98
In-Reply-To: <CD9A4F89-7CA5-4329-A06A-F8DEB87905A5@zytor.com>

On 5/17/2019 10:18 PM, hpa@zytor.com wrote:
> On May 17, 2019 9:55:19 AM PDT, Roberto Sassu <roberto.sassu@huawei.com> wrote:
>> This patch adds support for an alternative method to add xattrs to
>> files in
>> the rootfs filesystem. Instead of extracting them directly from the ram
>> disk image, they are extracted from a regular file called .xattr-list,
>> that
>> can be added by any ram disk generator available today. The file format
>> is:
>>
>> <file #N data len (ASCII, 10 chars)><file #N path>\0
>> <xattr #N data len (ASCII, 8 chars)><xattr #N name>\0<xattr #N value>
>>
>> .xattr-list can be generated by executing:
>>
>> $ getfattr --absolute-names -d -h -R -e hex -m - \
>>       <file list> | xattr.awk -b > ${initdir}/.xattr-list
>>
>> where the content of the xattr.awk script is:
>>
>> #! /usr/bin/awk -f
>> {
>>   if (!length($0)) {
>>     printf("%.10x%s\0", len, file);
>>     for (x in xattr) {
>>       printf("%.8x%s\0", xattr_len[x], x);
>>       for (i = 0; i < length(xattr[x]) / 2; i++) {
>>         printf("%c", strtonum("0x"substr(xattr[x], i * 2 + 1, 2)));
>>       }
>>     }
>>     i = 0;
>>     delete xattr;
>>     delete xattr_len;
>>     next;
>>   };
>>   if (i == 0) {
>>     file=$3;
>>     len=length(file) + 8 + 1;
>>   }
>>   if (i > 0) {
>>     split($0, a, "=");
>>     xattr[a[1]]=substr(a[2], 3);
>>     xattr_len[a[1]]=length(a[1]) + 1 + 8 + length(xattr[a[1]]) / 2;
>>     len+=xattr_len[a[1]];
>>   };
>>   i++;
>> }
>>
>> Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
>> ---
>> init/initramfs.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 99 insertions(+)
>>
>> diff --git a/init/initramfs.c b/init/initramfs.c
>> index 0c6dd1d5d3f6..6ec018c6279a 100644
>> --- a/init/initramfs.c
>> +++ b/init/initramfs.c
>> @@ -13,6 +13,8 @@
>> #include <linux/namei.h>
>> #include <linux/xattr.h>
>>
>> +#define XATTR_LIST_FILENAME ".xattr-list"
>> +
>> static ssize_t __init xwrite(int fd, const char *p, size_t count)
>> {
>> 	ssize_t out = 0;
>> @@ -382,6 +384,97 @@ static int __init __maybe_unused do_setxattrs(char
>> *pathname)
>> 	return 0;
>> }
>>
>> +struct path_hdr {
>> +	char p_size[10]; /* total size including p_size field */
>> +	char p_data[];   /* <path>\0<xattrs> */
>> +};
>> +
>> +static int __init do_readxattrs(void)
>> +{
>> +	struct path_hdr hdr;
>> +	char *path = NULL;
>> +	char str[sizeof(hdr.p_size) + 1];
>> +	unsigned long file_entry_size;
>> +	size_t size, path_size, total_size;
>> +	struct kstat st;
>> +	struct file *file;
>> +	loff_t pos;
>> +	int ret;
>> +
>> +	ret = vfs_lstat(XATTR_LIST_FILENAME, &st);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	total_size = st.size;
>> +
>> +	file = filp_open(XATTR_LIST_FILENAME, O_RDONLY, 0);
>> +	if (IS_ERR(file))
>> +		return PTR_ERR(file);
>> +
>> +	pos = file->f_pos;
>> +
>> +	while (total_size) {
>> +		size = kernel_read(file, (char *)&hdr, sizeof(hdr), &pos);
>> +		if (size != sizeof(hdr)) {
>> +			ret = -EIO;
>> +			goto out;
>> +		}
>> +
>> +		total_size -= size;
>> +
>> +		str[sizeof(hdr.p_size)] = 0;
>> +		memcpy(str, hdr.p_size, sizeof(hdr.p_size));
>> +		ret = kstrtoul(str, 16, &file_entry_size);
>> +		if (ret < 0)
>> +			goto out;
>> +
>> +		file_entry_size -= sizeof(sizeof(hdr.p_size));
>> +		if (file_entry_size > total_size) {
>> +			ret = -EINVAL;
>> +			goto out;
>> +		}
>> +
>> +		path = vmalloc(file_entry_size);
>> +		if (!path) {
>> +			ret = -ENOMEM;
>> +			goto out;
>> +		}
>> +
>> +		size = kernel_read(file, path, file_entry_size, &pos);
>> +		if (size != file_entry_size) {
>> +			ret = -EIO;
>> +			goto out_free;
>> +		}
>> +
>> +		total_size -= size;
>> +
>> +		path_size = strnlen(path, file_entry_size);
>> +		if (path_size == file_entry_size) {
>> +			ret = -EINVAL;
>> +			goto out_free;
>> +		}
>> +
>> +		xattr_buf = path + path_size + 1;
>> +		xattr_len = file_entry_size - path_size - 1;
>> +
>> +		ret = do_setxattrs(path);
>> +		vfree(path);
>> +		path = NULL;
>> +
>> +		if (ret < 0)
>> +			break;
>> +	}
>> +out_free:
>> +	vfree(path);
>> +out:
>> +	fput(file);
>> +
>> +	if (ret < 0)
>> +		error("Unable to parse xattrs");
>> +
>> +	return ret;
>> +}
>> +
>> static __initdata int wfd;
>>
>> static int __init do_name(void)
>> @@ -391,6 +484,11 @@ static int __init do_name(void)
>> 	if (strcmp(collected, "TRAILER!!!") == 0) {
>> 		free_hash();
>> 		return 0;
>> +	} else if (strcmp(collected, XATTR_LIST_FILENAME) == 0) {
>> +		struct kstat st;
>> +
>> +		if (!vfs_lstat(collected, &st))
>> +			do_readxattrs();
>> 	}
>> 	clean_path(collected, mode);
>> 	if (S_ISREG(mode)) {
>> @@ -562,6 +660,7 @@ static char * __init unpack_to_rootfs(char *buf,
>> unsigned long len)
>> 		buf += my_inptr;
>> 		len -= my_inptr;
>> 	}
>> +	do_readxattrs();
>> 	dir_utime();
>> 	kfree(name_buf);
>> 	kfree(symlink_buf);
> 
> Ok... I just realized this does not work for a modular initramfs, composed at load time from multiple files, which is a very real problem. Should be easy enough to deal with: instead of one large file, use one companion file per source file, perhaps something like filename..xattrs (suggesting double dots to make it less likely to conflict with a "real" file.) No leading dot, as it makes it more likely that archivers will sort them before the file proper.

Version 1 of the patch set worked exactly in this way. However, Rob
pointed out that this would be a problem if file names plus the suffix
exceed 255 characters.

Roberto

-- 
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

^ permalink raw reply

* Re: [RFC 5/7] mm: introduce external memory hinting API
From: Michal Hocko @ 2019-05-20  9:18 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-6-minchan@kernel.org>

[Cc linux-api]

On Mon 20-05-19 12:52:52, Minchan Kim wrote:
> There is some usecase that centralized userspace daemon want to give
> a memory hint like MADV_[COOL|COLD] to other process. Android's
> ActivityManagerService is one of them.
> 
> It's similar in spirit to madvise(MADV_WONTNEED), but the information
> required to make the reclaim decision is not known to the app. Instead,
> it is known to the centralized userspace daemon(ActivityManagerService),
> and that daemon must be able to initiate reclaim on its own without
> any app involvement.

Could you expand some more about how this all works? How does the
centralized daemon track respective ranges? How does it synchronize
against parallel modification of the address space etc.

> To solve the issue, this patch introduces new syscall process_madvise(2)
> which works based on pidfd so it could give a hint to the exeternal
> process.
> 
> int process_madvise(int pidfd, void *addr, size_t length, int advise);

OK, this makes some sense from the API point of view. When we have
discussed that at LSFMM I was contemplating about something like that
except the fd would be a VMA fd rather than the process. We could extend
and reuse /proc/<pid>/map_files interface which doesn't support the
anonymous memory right now. 

I am not saying this would be a better interface but I wanted to mention
it here for a further discussion. One slight advantage would be that
you know the exact object that you are operating on because you have a
fd for the VMA and we would have a more straightforward way to reject
operation if the underlying object has changed (e.g. unmapped and reused
for a different mapping).

> All advises madvise provides can be supported in process_madvise, too.
> Since it could affect other process's address range, only privileged
> process(CAP_SYS_PTRACE) or something else(e.g., being the same UID)
> gives it the right to ptrrace the process could use it successfully.

proc_mem_open model we use for accessing address space via proc sounds
like a good mode. You are doing something similar.

> Please suggest better idea if you have other idea about the permission.
> 
> * from v1r1
>   * use ptrace capability - surenb, dancol
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  arch/x86/entry/syscalls/syscall_32.tbl |  1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  include/linux/proc_fs.h                |  1 +
>  include/linux/syscalls.h               |  2 ++
>  include/uapi/asm-generic/unistd.h      |  2 ++
>  kernel/signal.c                        |  2 +-
>  kernel/sys_ni.c                        |  1 +
>  mm/madvise.c                           | 45 ++++++++++++++++++++++++++
>  8 files changed, 54 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 4cd5f982b1e5..5b9dd55d6b57 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -438,3 +438,4 @@
>  425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
>  426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
>  427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
> +428	i386	process_madvise		sys_process_madvise		__ia32_sys_process_madvise
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 64ca0d06259a..0e5ee78161c9 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -355,6 +355,7 @@
>  425	common	io_uring_setup		__x64_sys_io_uring_setup
>  426	common	io_uring_enter		__x64_sys_io_uring_enter
>  427	common	io_uring_register	__x64_sys_io_uring_register
> +428	common	process_madvise		__x64_sys_process_madvise
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 52a283ba0465..f8545d7c5218 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -122,6 +122,7 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file)
>  
>  #endif /* CONFIG_PROC_FS */
>  
> +extern struct pid *pidfd_to_pid(const struct file *file);
>  struct net;
>  
>  static inline struct proc_dir_entry *proc_net_mkdir(
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index e2870fe1be5b..21c6c9a62006 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -872,6 +872,8 @@ asmlinkage long sys_munlockall(void);
>  asmlinkage long sys_mincore(unsigned long start, size_t len,
>  				unsigned char __user * vec);
>  asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
> +asmlinkage long sys_process_madvise(int pid_fd, unsigned long start,
> +				size_t len, int behavior);
>  asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
>  			unsigned long prot, unsigned long pgoff,
>  			unsigned long flags);
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index dee7292e1df6..7ee82ce04620 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -832,6 +832,8 @@ __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
>  __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
>  #define __NR_io_uring_register 427
>  __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
> +#define __NR_process_madvise 428
> +__SYSCALL(__NR_process_madvise, sys_process_madvise)
>  
>  #undef __NR_syscalls
>  #define __NR_syscalls 428
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 1c86b78a7597..04e75daab1f8 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -3620,7 +3620,7 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
>  	return copy_siginfo_from_user(kinfo, info);
>  }
>  
> -static struct pid *pidfd_to_pid(const struct file *file)
> +struct pid *pidfd_to_pid(const struct file *file)
>  {
>  	if (file->f_op == &pidfd_fops)
>  		return file->private_data;
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 4d9ae5ea6caf..5277421795ab 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -278,6 +278,7 @@ COND_SYSCALL(mlockall);
>  COND_SYSCALL(munlockall);
>  COND_SYSCALL(mincore);
>  COND_SYSCALL(madvise);
> +COND_SYSCALL(process_madvise);
>  COND_SYSCALL(remap_file_pages);
>  COND_SYSCALL(mbind);
>  COND_SYSCALL_COMPAT(mbind);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 119e82e1f065..af02aa17e5c1 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -9,6 +9,7 @@
>  #include <linux/mman.h>
>  #include <linux/pagemap.h>
>  #include <linux/page_idle.h>
> +#include <linux/proc_fs.h>
>  #include <linux/syscalls.h>
>  #include <linux/mempolicy.h>
>  #include <linux/page-isolation.h>
> @@ -16,6 +17,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/falloc.h>
>  #include <linux/sched.h>
> +#include <linux/sched/mm.h>
>  #include <linux/ksm.h>
>  #include <linux/fs.h>
>  #include <linux/file.h>
> @@ -1140,3 +1142,46 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>  {
>  	return madvise_core(current, start, len_in, behavior);
>  }
> +
> +SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> +		size_t, len_in, int, behavior)
> +{
> +	int ret;
> +	struct fd f;
> +	struct pid *pid;
> +	struct task_struct *tsk;
> +	struct mm_struct *mm;
> +
> +	f = fdget(pidfd);
> +	if (!f.file)
> +		return -EBADF;
> +
> +	pid = pidfd_to_pid(f.file);
> +	if (IS_ERR(pid)) {
> +		ret = PTR_ERR(pid);
> +		goto err;
> +	}
> +
> +	ret = -EINVAL;
> +	rcu_read_lock();
> +	tsk = pid_task(pid, PIDTYPE_PID);
> +	if (!tsk) {
> +		rcu_read_unlock();
> +		goto err;
> +	}
> +	get_task_struct(tsk);
> +	rcu_read_unlock();
> +	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
> +	if (!mm || IS_ERR(mm)) {
> +		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
> +		if (ret == -EACCES)
> +			ret = -EPERM;
> +		goto err;
> +	}
> +	ret = madvise_core(tsk, start, len_in, behavior);
> +	mmput(mm);
> +	put_task_struct(tsk);
> +err:
> +	fdput(f);
> +	return ret;
> +}
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [RFC 6/7] mm: extend process_madvise syscall to support vector arrary
From: Michal Hocko @ 2019-05-20  9:22 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-7-minchan@kernel.org>

[Cc linux-api]

On Mon 20-05-19 12:52:53, Minchan Kim wrote:
> Currently, process_madvise syscall works for only one address range
> so user should call the syscall several times to give hints to
> multiple address range.

Is that a problem? How big of a problem? Any numbers?

> This patch extends process_madvise syscall to support multiple
> hints, address ranges and return vaules so user could give hints
> all at once.
> 
> struct pr_madvise_param {
>     int size;                       /* the size of this structure */
>     const struct iovec __user *vec; /* address range array */
> }
> 
> int process_madvise(int pidfd, ssize_t nr_elem,
> 		    int *behavior,
> 		    struct pr_madvise_param *results,
> 		    struct pr_madvise_param *ranges,
> 		    unsigned long flags);
> 
> - pidfd
> 
> target process fd
> 
> - nr_elem
> 
> the number of elemenent of array behavior, results, ranges
> 
> - behavior
> 
> hints for each address range in remote process so that user could
> give different hints for each range.

What is the guarantee of a single call? Do all hints get applied or the
first failure backs of? What are the atomicity guarantees?

> 
> - results
> 
> array of buffers to get results for associated remote address range
> action.
> 
> - ranges
> 
> array to buffers to have remote process's address ranges to be
> processed
> 
> - flags
> 
> extra argument for the future. It should be zero this moment.
> 
> Example)
> 
> struct pr_madvise_param {
>         int size;
>         const struct iovec *vec;
> };
> 
> int main(int argc, char *argv[])
> {
>         struct pr_madvise_param retp, rangep;
>         struct iovec result_vec[2], range_vec[2];
>         int hints[2];
>         long ret[2];
>         void *addr[2];
> 
>         pid_t pid;
>         char cmd[64] = {0,};
>         addr[0] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[0])
>                 return 1;
> 
>         addr[1] = mmap(NULL, ALLOC_SIZE, PROT_READ|PROT_WRITE,
>                           MAP_POPULATE|MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> 
>         if (MAP_FAILED == addr[1])
>                 return 1;
> 
>         hints[0] = MADV_COLD;
> 	range_vec[0].iov_base = addr[0];
>         range_vec[0].iov_len = ALLOC_SIZE;
>         result_vec[0].iov_base = &ret[0];
>         result_vec[0].iov_len = sizeof(long);
> 	retp.vec = result_vec;
>         retp.size = sizeof(struct pr_madvise_param);
> 
>         hints[1] = MADV_COOL;
>         range_vec[1].iov_base = addr[1];
>         range_vec[1].iov_len = ALLOC_SIZE;
>         result_vec[1].iov_base = &ret[1];
>         result_vec[1].iov_len = sizeof(long);
>         rangep.vec = range_vec;
>         rangep.size = sizeof(struct pr_madvise_param);
> 
>         pid = fork();
>         if (!pid) {
>                 sleep(10);
>         } else {
>                 int pidfd = open(cmd,  O_DIRECTORY | O_CLOEXEC);
>                 if (pidfd < 0)
>                         return 1;
> 
>                 /* munmap to make pages private for the child */
>                 munmap(addr[0], ALLOC_SIZE);
>                 munmap(addr[1], ALLOC_SIZE);
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>                 if (syscall(__NR_process_madvise, pidfd, 2, behaviors,
> 						&retp, &rangep, 0))
>                         perror("process_madvise fail\n");
>                 system("cat /proc/vmstat | egrep 'pswpout|deactivate'");
>         }
> 
>         return 0;
> }
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/uapi/asm-generic/mman-common.h |   5 +
>  mm/madvise.c                           | 184 +++++++++++++++++++++----
>  2 files changed, 166 insertions(+), 23 deletions(-)
> 
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index b9b51eeb8e1a..b8e230de84a6 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -74,4 +74,9 @@
>  #define PKEY_ACCESS_MASK	(PKEY_DISABLE_ACCESS |\
>  				 PKEY_DISABLE_WRITE)
>  
> +struct pr_madvise_param {
> +	int size;			/* the size of this structure */
> +	const struct iovec __user *vec;	/* address range array */
> +};
> +
>  #endif /* __ASM_GENERIC_MMAN_COMMON_H */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index af02aa17e5c1..f4f569dac2bd 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -320,6 +320,7 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  	struct page *page;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -380,9 +381,12 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  		ptep_test_and_clear_young(vma, addr, pte);
>  		deactivate_page(page);
> +		nr_pages++;
> +
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> +	*(long *)walk->private += nr_pages;
>  	cond_resched();
>  
>  	return 0;
> @@ -390,11 +394,13 @@ static int madvise_cool_pte_range(pmd_t *pmd, unsigned long addr,
>  
>  static void madvise_cool_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk cool_walk = {
>  		.pmd_entry = madvise_cool_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -403,7 +409,8 @@ static void madvise_cool_page_range(struct mmu_gather *tlb,
>  }
>  
>  static long madvise_cool(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -413,7 +420,7 @@ static long madvise_cool(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cool_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cool_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -429,6 +436,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	int isolated = 0;
>  	struct vm_area_struct *vma = walk->vma;
>  	unsigned long next;
> +	long nr_pages = 0;
>  
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -492,7 +500,7 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  		list_add(&page->lru, &page_list);
>  		if (isolated >= SWAP_CLUSTER_MAX) {
>  			pte_unmap_unlock(orig_pte, ptl);
> -			reclaim_pages(&page_list);
> +			nr_pages += reclaim_pages(&page_list);
>  			isolated = 0;
>  			pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
>  			orig_pte = pte;
> @@ -500,19 +508,22 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  	}
>  
>  	pte_unmap_unlock(orig_pte, ptl);
> -	reclaim_pages(&page_list);
> +	nr_pages += reclaim_pages(&page_list);
>  	cond_resched();
>  
> +	*(long *)walk->private += nr_pages;
>  	return 0;
>  }
>  
>  static void madvise_cold_page_range(struct mmu_gather *tlb,
>  			     struct vm_area_struct *vma,
> -			     unsigned long addr, unsigned long end)
> +			     unsigned long addr, unsigned long end,
> +			     long *nr_pages)
>  {
>  	struct mm_walk warm_walk = {
>  		.pmd_entry = madvise_cold_pte_range,
>  		.mm = vma->vm_mm,
> +		.private = nr_pages,
>  	};
>  
>  	tlb_start_vma(tlb, vma);
> @@ -522,7 +533,8 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
>  
>  
>  static long madvise_cold(struct vm_area_struct *vma,
> -			unsigned long start_addr, unsigned long end_addr)
> +			unsigned long start_addr, unsigned long end_addr,
> +			long *nr_pages)
>  {
>  	struct mm_struct *mm = vma->vm_mm;
>  	struct mmu_gather tlb;
> @@ -532,7 +544,7 @@ static long madvise_cold(struct vm_area_struct *vma,
>  
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
> -	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
> +	madvise_cold_page_range(&tlb, vma, start_addr, end_addr, nr_pages);
>  	tlb_finish_mmu(&tlb, start_addr, end_addr);
>  
>  	return 0;
> @@ -922,7 +934,7 @@ static int madvise_inject_error(int behavior,
>  static long
>  madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  		struct vm_area_struct **prev, unsigned long start,
> -		unsigned long end, int behavior)
> +		unsigned long end, int behavior, long *nr_pages)
>  {
>  	switch (behavior) {
>  	case MADV_REMOVE:
> @@ -930,9 +942,9 @@ madvise_vma(struct task_struct *tsk, struct vm_area_struct *vma,
>  	case MADV_WILLNEED:
>  		return madvise_willneed(vma, prev, start, end);
>  	case MADV_COOL:
> -		return madvise_cool(vma, start, end);
> +		return madvise_cool(vma, start, end, nr_pages);
>  	case MADV_COLD:
> -		return madvise_cold(vma, start, end);
> +		return madvise_cold(vma, start, end, nr_pages);
>  	case MADV_FREE:
>  	case MADV_DONTNEED:
>  		return madvise_dontneed_free(tsk, vma, prev, start,
> @@ -981,7 +993,7 @@ madvise_behavior_valid(int behavior)
>  }
>  
>  static int madvise_core(struct task_struct *tsk, unsigned long start,
> -			size_t len_in, int behavior)
> +			size_t len_in, int behavior, long *nr_pages)
>  {
>  	unsigned long end, tmp;
>  	struct vm_area_struct *vma, *prev;
> @@ -996,6 +1008,7 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  
>  	if (start & ~PAGE_MASK)
>  		return error;
> +
>  	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
>  
>  	/* Check to see whether len was rounded up from small -ve to zero */
> @@ -1035,6 +1048,8 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  	blk_start_plug(&plug);
>  	for (;;) {
>  		/* Still start < end. */
> +		long pages = 0;
> +
>  		error = -ENOMEM;
>  		if (!vma)
>  			goto out;
> @@ -1053,9 +1068,11 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  			tmp = end;
>  
>  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
> -		error = madvise_vma(tsk, vma, &prev, start, tmp, behavior);
> +		error = madvise_vma(tsk, vma, &prev, start, tmp,
> +					behavior, &pages);
>  		if (error)
>  			goto out;
> +		*nr_pages += pages;
>  		start = tmp;
>  		if (prev && start < prev->vm_end)
>  			start = prev->vm_end;
> @@ -1140,26 +1157,137 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>   */
>  SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
>  {
> -	return madvise_core(current, start, len_in, behavior);
> +	unsigned long dummy;
> +
> +	return madvise_core(current, start, len_in, behavior, &dummy);
>  }
>  
> -SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
> -		size_t, len_in, int, behavior)
> +static int pr_madvise_copy_param(struct pr_madvise_param __user *u_param,
> +		struct pr_madvise_param *param)
> +{
> +	u32 size;
> +	int ret;
> +
> +	memset(param, 0, sizeof(*param));
> +
> +	ret = get_user(size, &u_param->size);
> +	if (ret)
> +		return ret;
> +
> +	if (size > PAGE_SIZE)
> +		return -E2BIG;
> +
> +	if (!size || size > sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	ret = copy_from_user(param, u_param, size);
> +	if (ret)
> +		return -EFAULT;
> +
> +	return ret;
> +}
> +
> +static int process_madvise_core(struct task_struct *tsk, int *behaviors,
> +				struct iov_iter *iter,
> +				const struct iovec *range_vec,
> +				unsigned long riovcnt,
> +				unsigned long flags)
> +{
> +	int i;
> +	long err;
> +
> +	for (err = 0, i = 0; i < riovcnt && iov_iter_count(iter); i++) {
> +		long ret = 0;
> +
> +		err = madvise_core(tsk, (unsigned long)range_vec[i].iov_base,
> +				range_vec[i].iov_len, behaviors[i],
> +				&ret);
> +		if (err)
> +			ret = err;
> +
> +		if (copy_to_iter(&ret, sizeof(long), iter) !=
> +				sizeof(long)) {
> +			err = -EFAULT;
> +			break;
> +		}
> +
> +		err = 0;
> +	}
> +
> +	return err;
> +}
> +
> +SYSCALL_DEFINE6(process_madvise, int, pidfd, ssize_t, nr_elem,
> +			const int __user *, hints,
> +			struct pr_madvise_param __user *, results,
> +			struct pr_madvise_param __user *, ranges,
> +			unsigned long, flags)
>  {
>  	int ret;
>  	struct fd f;
>  	struct pid *pid;
>  	struct task_struct *tsk;
>  	struct mm_struct *mm;
> +	struct pr_madvise_param result_p, range_p;
> +	const struct iovec __user *result_vec, __user *range_vec;
> +	int *behaviors;
> +	struct iovec iovstack_result[UIO_FASTIOV];
> +	struct iovec iovstack_r[UIO_FASTIOV];
> +	struct iovec *iov_l = iovstack_result;
> +	struct iovec *iov_r = iovstack_r;
> +	struct iov_iter iter;
> +
> +	if (flags != 0)
> +		return -EINVAL;
> +
> +	ret = pr_madvise_copy_param(results, &result_p);
> +	if (ret)
> +		return ret;
> +
> +	ret = pr_madvise_copy_param(ranges, &range_p);
> +	if (ret)
> +		return ret;
> +
> +	result_vec = result_p.vec;
> +	range_vec = range_p.vec;
> +
> +	if (result_p.size != sizeof(struct pr_madvise_param) ||
> +			range_p.size != sizeof(struct pr_madvise_param))
> +		return -EINVAL;
> +
> +	behaviors = kmalloc_array(nr_elem, sizeof(int), GFP_KERNEL);
> +	if (!behaviors)
> +		return -ENOMEM;
> +
> +	ret = copy_from_user(behaviors, hints, sizeof(int) * nr_elem);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	ret = import_iovec(READ, result_vec, nr_elem, UIO_FASTIOV,
> +				&iov_l, &iter);
> +	if (ret < 0)
> +		goto free_behavior_vec;
> +
> +	if (!iov_iter_count(&iter)) {
> +		ret = -EINVAL;
> +		goto free_iovecs;
> +	}
> +
> +	ret = rw_copy_check_uvector(CHECK_IOVEC_ONLY, range_vec, nr_elem,
> +				UIO_FASTIOV, iovstack_r, &iov_r);
> +	if (ret <= 0)
> +		goto free_iovecs;
>  
>  	f = fdget(pidfd);
> -	if (!f.file)
> -		return -EBADF;
> +	if (!f.file) {
> +		ret = -EBADF;
> +		goto free_iovecs;
> +	}
>  
>  	pid = pidfd_to_pid(f.file);
>  	if (IS_ERR(pid)) {
>  		ret = PTR_ERR(pid);
> -		goto err;
> +		goto put_fd;
>  	}
>  
>  	ret = -EINVAL;
> @@ -1167,7 +1295,7 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  	tsk = pid_task(pid, PIDTYPE_PID);
>  	if (!tsk) {
>  		rcu_read_unlock();
> -		goto err;
> +		goto put_fd;
>  	}
>  	get_task_struct(tsk);
>  	rcu_read_unlock();
> @@ -1176,12 +1304,22 @@ SYSCALL_DEFINE4(process_madvise, int, pidfd, unsigned long, start,
>  		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
>  		if (ret == -EACCES)
>  			ret = -EPERM;
> -		goto err;
> +		goto put_task;
>  	}
> -	ret = madvise_core(tsk, start, len_in, behavior);
> +
> +	ret = process_madvise_core(tsk, behaviors, &iter, iov_r,
> +					nr_elem, flags);
>  	mmput(mm);
> +put_task:
>  	put_task_struct(tsk);
> -err:
> +put_fd:
>  	fdput(f);
> +free_iovecs:
> +	if (iov_r != iovstack_r)
> +		kfree(iov_r);
> +	kfree(iov_l);
> +free_behavior_vec:
> +	kfree(behaviors);
> +
>  	return ret;
>  }
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [RFC 7/7] mm: madvise support MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER
From: Michal Hocko @ 2019-05-20  9:28 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-8-minchan@kernel.org>

[cc linux-api]

On Mon 20-05-19 12:52:54, Minchan Kim wrote:
> System could have much faster swap device like zRAM. In that case, swapping
> is extremely cheaper than file-IO on the low-end storage.
> In this configuration, userspace could handle different strategy for each
> kinds of vma. IOW, they want to reclaim anonymous pages by MADV_COLD
> while it keeps file-backed pages in inactive LRU by MADV_COOL because
> file IO is more expensive in this case so want to keep them in memory
> until memory pressure happens.
> 
> To support such strategy easier, this patch introduces
> MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER options in madvise(2) like
> that /proc/<pid>/clear_refs already has supported same filters.
> They are filters could be Ored with other existing hints using top two bits
> of (int behavior).

madvise operates on top of ranges and it is quite trivial to do the
filtering from the userspace so why do we need any additional filtering?

> Once either of them is set, the hint could affect only the interested vma
> either anonymous or file-backed.
> 
> With that, user could call a process_madvise syscall simply with a entire
> range(0x0 - 0xFFFFFFFFFFFFFFFF) but either of MADV_ANONYMOUS_FILTER and
> MADV_FILE_FILTER so there is no need to call the syscall range by range.

OK, so here is the reason you want that. The immediate question is why
cannot the monitor do the filtering from the userspace. Slightly more
work, all right, but less of an API to expose and that itself is a
strong argument against.

> * from v1r2
>   * use consistent check with clear_refs to identify anon/file vma - surenb
> 
> * from v1r1
>   * use naming "filter" for new madvise option - dancol
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  include/uapi/asm-generic/mman-common.h |  5 +++++
>  mm/madvise.c                           | 14 ++++++++++++++
>  2 files changed, 19 insertions(+)
> 
> diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h
> index b8e230de84a6..be59a1b90284 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -66,6 +66,11 @@
>  #define MADV_WIPEONFORK 18		/* Zero memory on fork, child only */
>  #define MADV_KEEPONFORK 19		/* Undo MADV_WIPEONFORK */
>  
> +#define MADV_BEHAVIOR_MASK (~(MADV_ANONYMOUS_FILTER|MADV_FILE_FILTER))
> +
> +#define MADV_ANONYMOUS_FILTER	(1<<31)	/* works for only anonymous vma */
> +#define MADV_FILE_FILTER	(1<<30)	/* works for only file-backed vma */
> +
>  /* compatibility flags */
>  #define MAP_FILE	0
>  
> diff --git a/mm/madvise.c b/mm/madvise.c
> index f4f569dac2bd..116131243540 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -1002,7 +1002,15 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  	int write;
>  	size_t len;
>  	struct blk_plug plug;
> +	bool anon_only, file_only;
>  
> +	anon_only = behavior & MADV_ANONYMOUS_FILTER;
> +	file_only = behavior & MADV_FILE_FILTER;
> +
> +	if (anon_only && file_only)
> +		return error;
> +
> +	behavior = behavior & MADV_BEHAVIOR_MASK;
>  	if (!madvise_behavior_valid(behavior))
>  		return error;
>  
> @@ -1067,12 +1075,18 @@ static int madvise_core(struct task_struct *tsk, unsigned long start,
>  		if (end < tmp)
>  			tmp = end;
>  
> +		if (anon_only && vma->vm_file)
> +			goto next;
> +		if (file_only && !vma->vm_file)
> +			goto next;
> +
>  		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
>  		error = madvise_vma(tsk, vma, &prev, start, tmp,
>  					behavior, &pages);
>  		if (error)
>  			goto out;
>  		*nr_pages += pages;
> +next:
>  		start = tmp;
>  		if (prev && start < prev->vm_end)
>  			start = prev->vm_end;
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [RFC 0/7] introduce memory hinting API for external process
From: Michal Hocko @ 2019-05-20  9:28 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Andrew Morton, LKML, linux-mm, Johannes Weiner, Tim Murray,
	Joel Fernandes, Suren Baghdasaryan, Daniel Colascione,
	Shakeel Butt, Sonny Rao, Brian Geffon, linux-api
In-Reply-To: <20190520035254.57579-1-minchan@kernel.org>

[Cc linux-api]

On Mon 20-05-19 12:52:47, Minchan Kim wrote:
> - Background
> 
> The Android terminology used for forking a new process and starting an app
> from scratch is a cold start, while resuming an existing app is a hot start.
> While we continually try to improve the performance of cold starts, hot
> starts will always be significantly less power hungry as well as faster so
> we are trying to make hot start more likely than cold start.
> 
> To increase hot start, Android userspace manages the order that apps should
> be killed in a process called ActivityManagerService. ActivityManagerService
> tracks every Android app or service that the user could be interacting with
> at any time and translates that into a ranked list for lmkd(low memory
> killer daemon). They are likely to be killed by lmkd if the system has to
> reclaim memory. In that sense they are similar to entries in any other cache.
> Those apps are kept alive for opportunistic performance improvements but
> those performance improvements will vary based on the memory requirements of
> individual workloads.
> 
> - Problem
> 
> Naturally, cached apps were dominant consumers of memory on the system.
> However, they were not significant consumers of swap even though they are
> good candidate for swap. Under investigation, swapping out only begins
> once the low zone watermark is hit and kswapd wakes up, but the overall
> allocation rate in the system might trip lmkd thresholds and cause a cached
> process to be killed(we measured performance swapping out vs. zapping the
> memory by killing a process. Unsurprisingly, zapping is 10x times faster
> even though we use zram which is much faster than real storage) so kill
> from lmkd will often satisfy the high zone watermark, resulting in very
> few pages actually being moved to swap.
> 
> - Approach
> 
> The approach we chose was to use a new interface to allow userspace to
> proactively reclaim entire processes by leveraging platform information.
> This allowed us to bypass the inaccuracy of the kernel’s LRUs for pages
> that are known to be cold from userspace and to avoid races with lmkd
> by reclaiming apps as soon as they entered the cached state. Additionally,
> it could provide many chances for platform to use much information to
> optimize memory efficiency.
> 
> IMHO we should spell it out that this patchset complements MADV_WONTNEED
> and MADV_FREE by adding non-destructive ways to gain some free memory
> space. MADV_COLD is similar to MADV_WONTNEED in a way that it hints the
> kernel that memory region is not currently needed and should be reclaimed
> immediately; MADV_COOL is similar to MADV_FREE in a way that it hints the
> kernel that memory region is not currently needed and should be reclaimed
> when memory pressure rises.
> 
> To achieve the goal, the patchset introduce two new options for madvise.
> One is MADV_COOL which will deactive activated pages and the other is
> MADV_COLD which will reclaim private pages instantly. These new options
> complement MADV_DONTNEED and MADV_FREE by adding non-destructive ways to
> gain some free memory space. MADV_COLD is similar to MADV_DONTNEED in a way
> that it hints the kernel that memory region is not currently needed and
> should be reclaimed immediately; MADV_COOL is similar to MADV_FREE in a way
> that it hints the kernel that memory region is not currently needed and
> should be reclaimed when memory pressure rises.
> 
> This approach is similar in spirit to madvise(MADV_WONTNEED), but the
> information required to make the reclaim decision is not known to the app.
> Instead, it is known to a centralized userspace daemon, and that daemon
> must be able to initiate reclaim on its own without any app involvement.
> To solve the concern, this patch introduces new syscall -
> 
> 	struct pr_madvise_param {
> 		int size;
> 		const struct iovec *vec;
> 	}
> 
> 	int process_madvise(int pidfd, ssize_t nr_elem, int *behavior,
> 				struct pr_madvise_param *restuls,
> 				struct pr_madvise_param *ranges,
> 				unsigned long flags);
> 
> The syscall get pidfd to give hints to external process and provides
> pair of result/ranges vector arguments so that it could give several
> hints to each address range all at once.
> 
> I guess others have different ideas about the naming of syscall and options
> so feel free to suggest better naming.
> 
> - Experiment
> 
> We did bunch of testing with several hundreds of real users, not artificial
> benchmark on android. We saw about 17% cold start decreasement without any
> significant battery/app startup latency issues. And with artificial benchmark
> which launches and switching apps, we saw average 7% app launching improvement,
> 18% less lmkd kill and good stat from vmstat.
> 
> A is vanilla and B is process_madvise.
> 
> 
>                                        A          B      delta   ratio(%)
>                allocstall_dma          0          0          0       0.00
>            allocstall_movable       1464        457      -1007     -69.00
>             allocstall_normal     263210     190763     -72447     -28.00
>              allocstall_total     264674     191220     -73454     -28.00
>           compact_daemon_wake      26912      25294      -1618      -7.00
>                  compact_fail      17885      14151      -3734     -21.00
>          compact_free_scanned 4204766409 3835994922 -368771487      -9.00
>              compact_isolated    3446484    2967618    -478866     -14.00
>       compact_migrate_scanned 1621336411 1324695710 -296640701     -19.00
>                 compact_stall      19387      15343      -4044     -21.00
>               compact_success       1502       1192       -310     -21.00
> kswapd_high_wmark_hit_quickly        234        184        -50     -22.00
>             kswapd_inodesteal     221635     233093      11458       5.00
>  kswapd_low_wmark_hit_quickly      66065      54009     -12056     -19.00
>                    nr_dirtied     259934     296476      36542      14.00
>   nr_vmscan_immediate_reclaim       2587       2356       -231      -9.00
>               nr_vmscan_write    1274232    2661733    1387501     108.00
>                    nr_written    1514060    2937560    1423500      94.00
>                    pageoutrun      67561      55133     -12428     -19.00
>                    pgactivate    2335060    1984882    -350178     -15.00
>                   pgalloc_dma   13743011   14096463     353452       2.00
>               pgalloc_movable          0          0          0       0.00
>                pgalloc_normal   18742440   16802065   -1940375     -11.00
>                 pgalloc_total   32485451   30898528   -1586923      -5.00
>                  pgdeactivate    4262210    2930670   -1331540     -32.00
>                       pgfault   30812334   31085065     272731       0.00
>                        pgfree   33553970   31765164   -1788806      -6.00
>                  pginodesteal      33411      15084     -18327     -55.00
>                   pglazyfreed          0          0          0       0.00
>                    pgmajfault     551312    1508299     956987     173.00
>                pgmigrate_fail      43927      29330     -14597     -34.00
>             pgmigrate_success    1399851    1203922    -195929     -14.00
>                        pgpgin   24141776   19032156   -5109620     -22.00
>                       pgpgout     959344    1103316     143972      15.00
>                  pgpgoutclean    4639732    3765868    -873864     -19.00
>                      pgrefill    4884560    3006938   -1877622     -39.00
>                     pgrotated      37828      25897     -11931     -32.00
>                 pgscan_direct    1456037     957567    -498470     -35.00
>        pgscan_direct_throttle          0          0          0       0.00
>                 pgscan_kswapd    6667767    5047360   -1620407     -25.00
>                  pgscan_total    8123804    6004927   -2118877     -27.00
>                    pgskip_dma          0          0          0       0.00
>                pgskip_movable          0          0          0       0.00
>                 pgskip_normal      14907      25382      10475      70.00
>                  pgskip_total      14907      25382      10475      70.00
>                pgsteal_direct    1118986     690215    -428771     -39.00
>                pgsteal_kswapd    4750223    3657107   -1093116     -24.00
>                 pgsteal_total    5869209    4347322   -1521887     -26.00
>                        pswpin     417613    1392647     975034     233.00
>                       pswpout    1274224    2661731    1387507     108.00
>                 slabs_scanned   13686905   10807200   -2879705     -22.00
>           workingset_activate     668966     569444     -99522     -15.00
>        workingset_nodereclaim      38957      32621      -6336     -17.00
>            workingset_refault    2816795    2179782    -637013     -23.00
>            workingset_restore     294320     168601    -125719     -43.00
> 
> pgmajfault is increased by 173% because swapin is increased by 200% by
> process_madvise hint. However, swap read based on zram is much cheaper
> than file IO in performance point of view and app hot start by swapin is
> also cheaper than cold start from the beginning of app which needs many IO
> from storage and initialization steps.
> 
> This patchset is against on next-20190517.
> 
> Minchan Kim (7):
>   mm: introduce MADV_COOL
>   mm: change PAGEREF_RECLAIM_CLEAN with PAGE_REFRECLAIM
>   mm: introduce MADV_COLD
>   mm: factor out madvise's core functionality
>   mm: introduce external memory hinting API
>   mm: extend process_madvise syscall to support vector arrary
>   mm: madvise support MADV_ANONYMOUS_FILTER and MADV_FILE_FILTER
> 
>  arch/x86/entry/syscalls/syscall_32.tbl |   1 +
>  arch/x86/entry/syscalls/syscall_64.tbl |   1 +
>  include/linux/page-flags.h             |   1 +
>  include/linux/page_idle.h              |  15 +
>  include/linux/proc_fs.h                |   1 +
>  include/linux/swap.h                   |   2 +
>  include/linux/syscalls.h               |   2 +
>  include/uapi/asm-generic/mman-common.h |  12 +
>  include/uapi/asm-generic/unistd.h      |   2 +
>  kernel/signal.c                        |   2 +-
>  kernel/sys_ni.c                        |   1 +
>  mm/madvise.c                           | 600 +++++++++++++++++++++----
>  mm/swap.c                              |  43 ++
>  mm/vmscan.c                            |  80 +++-
>  14 files changed, 680 insertions(+), 83 deletions(-)
> 
> -- 
> 2.21.0.1020.gf2820cf01a-goog
> 

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v3 2/2] initramfs: introduce do_readxattrs()
From: Roberto Sassu @ 2019-05-20  9:39 UTC (permalink / raw)
  To: Arvind Sankar, H. Peter Anvin
  Cc: viro, linux-security-module, linux-integrity, initramfs,
	linux-api, linux-fsdevel, linux-kernel, zohar, silviu.vlasceanu,
	dmitry.kasatkin, takondra, kamensky, arnd, rob, james.w.mcmechan,
	niveditas98
In-Reply-To: <20190517221731.GA11358@rani.riverdale.lan>

On 5/18/2019 12:17 AM, Arvind Sankar wrote:
> On Fri, May 17, 2019 at 02:47:31PM -0700, H. Peter Anvin wrote:
>> On 5/17/19 2:02 PM, Arvind Sankar wrote:
>>> On Fri, May 17, 2019 at 01:18:11PM -0700, hpa@zytor.com wrote:
>>>>
>>>> Ok... I just realized this does not work for a modular initramfs, composed at load time from multiple files, which is a very real problem. Should be easy enough to deal with: instead of one large file, use one companion file per source file, perhaps something like filename..xattrs (suggesting double dots to make it less likely to conflict with a "real" file.) No leading dot, as it makes it more likely that archivers will sort them before the file proper.
>>> This version of the patch was changed from the previous one exactly to deal with this case --
>>> it allows for the bootloader to load multiple initramfs archives, each
>>> with its own .xattr-list file, and to have that work properly.
>>> Could you elaborate on the issue that you see?
>>>
>>
>> Well, for one thing, how do you define "cpio archive", each with its own
>> .xattr-list file? Second, that would seem to depend on the ordering, no,
>> in which case you depend critically on .xattr-list file following the
>> files, which most archivers won't do.
>>
>> Either way it seems cleaner to have this per file; especially if/as it
>> can be done without actually mucking up the format.
>>
>> I need to run, but I'll post a more detailed explanation of what I did
>> in a little bit.
>>
>> 	-hpa
>>
> Not sure what you mean by how do I define it? Each cpio archive will
> contain its own .xattr-list file with signatures for the files within
> it, that was the idea.
> 
> You need to review the code more closely I think -- it does not depend
> on the .xattr-list file following the files to which it applies.
> 
> The code first extracts .xattr-list as though it was a regular file. If
> a later dupe shows up (presumably from a second archive, although the
> patch will actually allow a second one in the same archive), it will
> then process the existing .xattr-list file and apply the attributes
> listed within it. It then will proceed to read the second one and
> overwrite the first one with it (this is the normal behaviour in the
> kernel cpio parser). At the end once all the archives have been
> extracted, if there is an .xattr-list file in the rootfs it will be
> parsed (it would've been the last one encountered, which hasn't been
> parsed yet, just extracted).
> 
> Regarding the idea to use the high 16 bits of the mode field in
> the header that's another possibility. It would just require additional
> support in the program that actually creates the archive though, which
> the current patch doesn't.

Yes, for adding signatures for a subset of files, no changes to the ram
disk generator are necessary. Everything is done by a custom module. To
support a generic use case, it would be necessary to modify the
generator to execute getfattr and the awk script after files have been
placed in the temporary directory.

If I understood the new proposal correctly, it would be task for cpio to
read file metadata after the content and create a new record for each
file with mode 0x18000, type of metadata encoded in the file name and
metadata as file content. I don't know how easy it would be to modify
cpio. Probably the amount of changes would be reasonable.

The kernel will behave in a similar way. It will call do_readxattrs() in
do_copy() for each file. Since the only difference between the current
and the new proposal would be two additional calls to do_readxattrs() in
do_name() and unpack_to_rootfs(), maybe we could support both.

Roberto

-- 
HUAWEI TECHNOLOGIES Duesseldorf GmbH, HRB 56063
Managing Director: Bo PENG, Jian LI, Yanli SHI

^ permalink raw reply

* [PATCH RFC v8 00/10] namei: resolveat(2) path resolution restrictions
From: Aleksa Sarai @ 2019-05-20 13:32 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Jann Horn,
	Christian Brauner, David Drysdale, Tycho Andersen, Kees Cook,
	Linus Torvalds, containers, linux-fsdevel, linux-api,
	Andrew Morton, Alexei Starovoitov, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, linux-kselftest, linux-kernel, linux-arch

Patch changelog:
  v8:
    * Default to O_CLOEXEC to match other new fd-creation syscalls
      (users can always disable O_CLOEXEC afterwards). [Christian]
    * Implement magic-link restrictions based on their mode. This is
      done through a series of masks and is designed to avoid breaking
      users -- most users don't have chained O_PATH fd re-opens.
    * Add O_EMPTYPATH which allows for fd re-opening without needing
      procfs. This would help some users of fd re-opening, and with the
      changes to magic-link permissions we now have the right semantics
      for such a flag.
    * Add selftests for resolveat(2), O_EMPTYPATH, and the magic-link
	  mode semantics.
  v7:
    * Remove execveat(2) support for these flags since it might
      result in some pretty hairy security issues with setuid binaries.
      There are other avenues we can go down to solve the issues with
      CVE-2019-5736. [Jann]
    * Reserve an additional bit in resolveat(2) for the eXecute access
      mode if we end up implementing it.
  v6:
    * Drop O_* flags API to the new LOOKUP_ path scoping bits and
      instead introduce resolveat(2) as an alternative method of
      obtaining an O_PATH. The justification for this is included in
      patch 6 (though switching back to O_* flags is trivial).
  v5:
    * In response to CVE-2019-5736 (one of the vectors showed that
      open(2)+fexec(3) cannot be used to scope binfmt_script's implicit
      open_exec()), AT_* flags have been re-added and are now piped
      through to binfmt_script (and other binfmt_* that use open_exec)
      but are only supported for execveat(2) for now.
  v4:
    * Remove AT_* flag reservations, as they require more discussion.
    * Switch to path_is_under() over __d_path() for breakout checking.
    * Make O_XDEV no longer block openat("/tmp", "/", O_XDEV) -- dirfd
      is now ignored for absolute paths to match other flags.
    * Improve the dirfd_path_init() refactor and move it to a separate
      commit.
    * Remove reference to Linux-capsicum.
    * Switch "proclink" name to magic-link.
  v3: [resend]
  v2:
    * Made ".." resolution with AT_THIS_ROOT and AT_BENEATH safe(r) with
      some semi-aggressive __d_path checking (see patch 3).
    * Disallowed "proclinks" with AT_THIS_ROOT and AT_BENEATH, in the
      hopes they can be re-enabled once safe.
    * Removed the selftests as they will be reimplemented as xfstests.
    * Removed stat(2) support, since you can already get it through
      O_PATH and fstatat(2).

The need for some sort of control over VFS's path resolution (to avoid
malicious paths resulting in inadvertent breakouts) has been a very
long-standing desire of many userspace applications. This patchset is a
revival of Al Viro's old AT_NO_JUMPS[1,2] patchset (which was a variant
of David Drysdale's O_BENEATH patchset[3] which was a spin-off of the
Capsicum project[4]) with a few additions and changes made based on the
previous discussion within [5] as well as others I felt were useful.

In line with the conclusions of the original discussion of AT_NO_JUMPS,
the flag has been split up into separate flags. However, instead of
being an openat(2) flag it is provided through a new syscall
resolveat(2) which provides an alternative way to get an O_PATH file
descriptor (the reasoning for doing this is included in patch 6). The
following new LOOKUP_ flags are added:

  * LOOKUP_XDEV blocks all mountpoint crossings (upwards, downwards, or
    through absolute links). Absolute pathnames alone in openat(2) do
    not trigger this.

  * LOOKUP_NO_MAGICLINKS blocks resolution through /proc/$pid/fd-style
    links. This is done by blocking the usage of nd_jump_link() during
	resolution in a filesystem. The term "magic-links" is used to match
	with the only reference to these links in Documentation/, but I'm
	happy to change the name.

	It should be noted that this is different to the scope of
	~LOOKUP_FOLLOW in that it applies to all path components. However,
	you can do resolveat(NO_FOLLOW|NO_MAGICLINKS) on a magic-link and it
	will *not* fail (assuming that no parent component was a
	magic-link), and you will have an fd for the magic-link.

  * LOOKUP_BENEATH disallows escapes to outside the starting dirfd's
    tree, using techniques such as ".." or absolute links. Absolute
    paths in openat(2) are also disallowed. Conceptually this flag is to
    ensure you "stay below" a certain point in the filesystem tree --
    but this requires some additional to protect against various races
    that would allow escape using ".." (see patch 4 for more detail).

    Currently LOOKUP_BENEATH implies LOOKUP_NO_MAGICLINKS, because it
    can trivially beam you around the filesystem (breaking the
    protection). In future, there might be similar safety checks as in
    patch 4, but that requires more discussion.

In addition, two new flags are added that expand on the above ideas:

  * LOOKUP_NO_SYMLINKS does what it says on the tin. No symlink
	resolution is allowed at all, including magic-links. Just as with
	LOOKUP_NO_MAGICLINKS this can still be used with NOFOLLOW to open an
	fd for the symlink as long as no parent path had a symlink
	component.

  * LOOKUP_IN_ROOT is an extension of LOOKUP_BENEATH that, rather than
    blocking attempts to move past the root, forces all such movements
    to be scoped to the starting point. This provides chroot(2)-like
    protection but without the cost of a chroot(2) for each filesystem
    operation, as well as being safe against race attacks that chroot(2)
    is not.

    If a race is detected (as with LOOKUP_BENEATH) then an error is
    generated, and similar to LOOKUP_BENEATH it is not permitted to cross
    magic-links with LOOKUP_IN_ROOT.

    The primary need for this is from container runtimes, which
    currently need to do symlink scoping in userspace[6] when opening
    paths in a potentially malicious container. There is a long list of
    CVEs that could have bene mitigated by having O_THISROOT (such as
    CVE-2017-1002101, CVE-2017-1002102, CVE-2018-15664, and
    CVE-2019-5736, just to name a few).

And further, several semantics of file descriptor "re-opening" are now
changed to prevent attacks like CVE-2019-5736 by restricting how
magic-links can be resolved (based on their mode). This required some
other changes to the semantics of the modes of O_PATH file descriptor's
associated /proc/self/fd magic-links. resolveat(2) has the ability to
further restrict re-opening of its own O_PATH fds, so that users can
make even better use of this feature.

Finally, O_EMPTYPATH was added so that users can do /proc/self/fd-style
re-opening without depending on procfs. The new restricted semantics for
magic-links are applied here too.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: David Drysdale <drysdale@google.com>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <containers@lists.linux-foundation.org>
Cc: <linux-fsdevel@vger.kernel.org>
Cc: <linux-api@vger.kernel.org>

[1]: https://lwn.net/Articles/721443/
[2]: https://lore.kernel.org/patchwork/patch/784221/
[3]: https://lwn.net/Articles/619151/
[4]: https://lwn.net/Articles/603929/
[5]: https://lwn.net/Articles/723057/
[6]: https://github.com/cyphar/filepath-securejoin

Aleksa Sarai (10):
  namei: obey trailing magic-link DAC permissions
  procfs: switch magic-link modes to be more sane
  open: O_EMPTYPATH: procfs-less file descriptor re-opening
  namei: split out nd->dfd handling to dirfd_path_init
  namei: O_BENEATH-style path resolution flags
  namei: LOOKUP_IN_ROOT: chroot-like path resolution
  namei: aggressively check for nd->root escape on ".." resolution
  namei: resolveat(2) syscall
  kselftest: save-and-restore errno to allow for %m formatting
  selftests: add resolveat(2) selftests

 arch/alpha/kernel/syscalls/syscall.tbl        |   1 +
 arch/arm/tools/syscall.tbl                    |   1 +
 arch/arm64/include/asm/unistd.h               |   2 +-
 arch/arm64/include/asm/unistd32.h             |   3 +
 arch/ia64/kernel/syscalls/syscall.tbl         |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl         |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl     |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl     |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl     |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl       |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl      |   1 +
 arch/s390/kernel/syscalls/syscall.tbl         |   1 +
 arch/sh/kernel/syscalls/syscall.tbl           |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl        |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl        |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl        |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl       |   1 +
 fs/fcntl.c                                    |   2 +-
 fs/internal.h                                 |   1 +
 fs/namei.c                                    | 397 ++++++++++++++---
 fs/open.c                                     |  10 +-
 fs/proc/base.c                                |  20 +-
 fs/proc/fd.c                                  |  16 +-
 fs/proc/namespaces.c                          |   2 +-
 include/linux/fcntl.h                         |  10 +-
 include/linux/fs.h                            |   4 +
 include/linux/namei.h                         |   8 +
 include/linux/types.h                         |   2 +-
 include/uapi/asm-generic/fcntl.h              |   5 +
 include/uapi/asm-generic/unistd.h             |   5 +-
 include/uapi/linux/fcntl.h                    |  10 +
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/kselftest.h           |  15 +
 tools/testing/selftests/resolveat/.gitignore  |   1 +
 tools/testing/selftests/resolveat/Makefile    |   6 +
 tools/testing/selftests/resolveat/helpers.h   | 195 +++++++++
 .../selftests/resolveat/linkmode_test.c       | 306 ++++++++++++++
 .../selftests/resolveat/resolveat_test.c      | 400 ++++++++++++++++++
 39 files changed, 1350 insertions(+), 87 deletions(-)
 create mode 100644 tools/testing/selftests/resolveat/.gitignore
 create mode 100644 tools/testing/selftests/resolveat/Makefile
 create mode 100644 tools/testing/selftests/resolveat/helpers.h
 create mode 100644 tools/testing/selftests/resolveat/linkmode_test.c
 create mode 100644 tools/testing/selftests/resolveat/resolveat_test.c

-- 
2.21.0

^ permalink raw reply

* [PATCH RFC v8 01/10] namei: obey trailing magic-link DAC permissions
From: Aleksa Sarai @ 2019-05-20 13:32 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Andy Lutomirski, Christian Brauner, Eric Biederman,
	Andrew Morton, Alexei Starovoitov, Kees Cook, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

The ability for userspace to "re-open" file descriptors through
/proc/self/fd has been a very useful tool for all sorts of usecases
(container runtimes are one common example). However, the current
interface for doing this has resulted in some pretty subtle security
holes. Userspace can re-open a file descriptor with more permissions
than the original, which can result in cases such as /proc/$pid/exe
being re-opened O_RDWR at a later date even though (by definition)
/proc/$pid/exe cannot be opened for writing. When combined with O_PATH
the results can get even more confusing.

We cannot block this outright. Aside from userspace already depending on
it, it's a useful feature which can actually increase the security of
userspace. For instance, LXC keeps an O_PATH of the container's
/dev/pts/ptmx that gets re-opened to create new ptys and then uses
TIOCGPTPEER to get the slave end. This allows for pty allocation without
resolving paths inside an (untrusted) container's rootfs. There isn't a
trivial way of doing this that is as straight-forward and safe as O_PATH
re-opening.

Instead we have to restrict it in such a way that it doesn't break
(good) users but does block potential attackers. The solution applied in
this patch is to restrict *re-opening* (not resolution through)
magic-links by requiring that mode of the link be obeyed. Normal
symlinks have modes of a+rwx but magic-links have other modes. These
magic-link modes were historically ignored during path resolution, but
they've now been re-purposed for more useful ends.

It is also necessary to define semantics for the mode of an O_PATH
descriptor, since re-opening a magic-link through an O_PATH needs to be
just as restricted as the corresponding magic-link otherwise the above
protection can be bypassed. There are two distinct cases:

 1. The target is a regular file (not a magic-link). Userspace depends
    on being able to re-open the O_PATH of a regular file, so we must
    define the mode to be a+rwx.

 2. The target is a magic-link. In this case, we simply copy the mode of
    the magic-link. This results in an O_PATH of a magic-link
    effectively acting as a no-op in terms of how much re-opening
    privileges a process has.

CAP_DAC_OVERRIDE can be used to override all of these restrictions, but
we only permit &init_userns's capabilities to affect these semantics.
The reason for this is that there isn't a clear way to track what
user_ns is the original owner of a given O_PATH chain -- thus an
unprivileged user could create a new userns and O_PATH the file
descriptor, owning it. All signs would indicate that the user really
does have CAP_DAC_OVERRIDE over the new descriptor and the protection
would be bypassed. We thus opt for the more conservative approach.

One final exception is given, which is that non-O_PATH file descriptors
are given re-open rights equivalent to the permissions available at
open-time. This allows for O_RDONLY file descriptors to be re-opened
O_RDWR as long as the user had MAY_WRITE access at the time of opening
the O_RDONLY descriptor. This is necessary to avoid breaking userspace
(some of the kernel's own selftests depended on this "feature").

Co-developed-by: Andy Lutomirski <luto@kernel.org>
Co-developed-by: Christian Brauner <christian@brauner.io>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/internal.h         |   1 +
 fs/namei.c            | 117 ++++++++++++++++++++++++++++++++++++++++--
 fs/open.c             |   3 +-
 fs/proc/fd.c          |  16 ++++--
 include/linux/fs.h    |   4 ++
 include/linux/types.h |   2 +-
 6 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index bfd9bbbe369b..60d8a079a331 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -123,6 +123,7 @@ struct open_flags {
 	int acc_mode;
 	int intent;
 	int lookup_flags;
+	fmode_t opath_mask;
 };
 extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
diff --git a/fs/namei.c b/fs/namei.c
index 5ebd64b21970..5ff61624b8f0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -506,6 +506,8 @@ struct nameidata {
 	struct inode	*link_inode;
 	unsigned	root_seq;
 	int		dfd;
+	fmode_t 	opath_mask;
+	int		acc_mode; /* op.acc_mode */
 } __randomize_layout;
 
 static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
@@ -514,7 +516,14 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
 	p->stack = p->internal;
 	p->dfd = dfd;
 	p->name = name;
-	p->total_link_count = old ? old->total_link_count : 0;
+	p->total_link_count = 0;
+	p->acc_mode = 0;
+	p->opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE;
+	if (old) {
+		p->total_link_count = old->total_link_count;
+		p->acc_mode = old->acc_mode;
+		p->opath_mask = old->opath_mask;
+	}
 	p->saved = old;
 	current->nameidata = p;
 }
@@ -1042,8 +1051,40 @@ static int may_create_in_sticky(struct dentry * const dir,
 	return 0;
 }
 
+/**
+ * may_reopen_magiclink - Check permissions for opening a trailing magic-link
+ * @opath_mask: the O_PATH mask of the magiclink
+ * @acc_mode: ACC_MODE which the user is attempting
+ *
+ * We block magic-link re-opening if the @opath_mask is more strict than the
+ * @acc_mode being requested, unless the user is capable(CAP_DAC_OVERRIDE).
+ *
+ * Returns 0 if successful, -ve on error.
+ */
+static int may_open_magiclink(fmode_t opath_mask, int acc_mode)
+{
+	/*
+	 * We only allow for init_userns to be able to override magic-links.
+	 * This is done to avoid cases where an unprivileger userns could take
+	 * an O_PATH of the fd, resulting in it being very unclear whether
+	 * CAP_DAC_OVERRIDE should work on the new O_PATH fd (given that it
+	 * pipes through to the underlying file).
+	 */
+	if (capable(CAP_DAC_OVERRIDE))
+		return 0;
+
+	if ((acc_mode & MAY_READ) &&
+	    !(opath_mask & (FMODE_READ | FMODE_PATH_READ)))
+		return -EACCES;
+	if ((acc_mode & MAY_WRITE) &&
+	    !(opath_mask & (FMODE_WRITE | FMODE_PATH_WRITE)))
+		return -EACCES;
+
+	return 0;
+}
+
 static __always_inline
-const char *get_link(struct nameidata *nd)
+const char *get_link(struct nameidata *nd, bool trailing)
 {
 	struct saved *last = nd->stack + nd->depth - 1;
 	struct dentry *dentry = last->link.dentry;
@@ -1081,6 +1122,50 @@ const char *get_link(struct nameidata *nd)
 		} else {
 			res = get(dentry, inode, &last->done);
 		}
+		/* If we just jumped it was because of a magic-link. */
+		if (unlikely(nd->flags & LOOKUP_JUMPED)) {
+			/*
+			 * For trailing_symlink we check whether the symlink's
+			 * mode allows us to do what we want through acc_mode.
+			 * In addition, we need to stash away what the link
+			 * mode is in case we are about to O_PATH this
+			 * magic-link.
+			 *
+			 * This is only done for magic-links, as a security
+			 * measure to prevent users from being able to re-open
+			 * files with additional permissions or similar tricks
+			 * through procfs. This is not strictly POSIX-friendly,
+			 * but technically neither are magic-links.
+			 */
+			if (trailing) {
+				fmode_t opath_mask = 0;
+				int mode = inode->i_mode;
+
+				/*
+				 * Bare-bones acl_permission_check shift so
+				 * that opath_mask can be adjusted using creds.
+				 */
+				if (uid_eq(current_fsuid(), inode->i_uid))
+					mode >>= 6;
+				else if (in_group_p(inode->i_gid))
+					mode >>= 3;
+
+				/* Figure out the O_PATH mask. */
+				if (mode & MAY_READ)
+					opath_mask |= FMODE_PATH_READ;
+				if (mode & MAY_WRITE)
+					opath_mask |= FMODE_PATH_WRITE;
+
+				/*
+				 * Is the new opath_mask more restrictive than
+				 * the acc_mode being requested?
+				 */
+				error = may_open_magiclink(opath_mask, nd->acc_mode);
+				if (error)
+					return ERR_PTR(error);
+				nd->opath_mask &= opath_mask;
+			}
+		}
 		if (IS_ERR_OR_NULL(res))
 			return res;
 	}
@@ -2142,7 +2227,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			return err;
 
 		if (err) {
-			const char *s = get_link(nd);
+			const char *s = get_link(nd, false);
 
 			if (IS_ERR(s))
 				return PTR_ERR(s);
@@ -2258,7 +2343,7 @@ static const char *trailing_symlink(struct nameidata *nd)
 		return ERR_PTR(error);
 	nd->flags |= LOOKUP_PARENT;
 	nd->stack[0].name = NULL;
-	s = get_link(nd);
+	s = get_link(nd, true);
 	return s ? s : "";
 }
 
@@ -3508,6 +3593,7 @@ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
 	if (!error) {
 		audit_inode(nd->name, path.dentry, 0);
 		error = vfs_open(&path, file);
+		file->f_mode |= nd->opath_mask;
 		path_put(&path);
 	}
 	return error;
@@ -3519,6 +3605,9 @@ static struct file *path_openat(struct nameidata *nd,
 	struct file *file;
 	int error;
 
+	nd->acc_mode = op->acc_mode;
+	nd->opath_mask = op->opath_mask;
+
 	file = alloc_empty_file(op->open_flag, current_cred());
 	if (IS_ERR(file))
 		return file;
@@ -3537,6 +3626,26 @@ static struct file *path_openat(struct nameidata *nd,
 		terminate_walk(nd);
 	}
 	if (likely(!error)) {
+		/*
+		 * In order to allow for re-opening of a read-only fds as
+		 * read-write (something that some userspace tools depend on,
+		 * including some kernel selftests), we give additional
+		 * permissions based on what permissions are available at
+		 * open-time. This must be scoped to opath_mask to avoid
+		 * obvious O_PATH attacks.
+		 */
+		if (likely(!(file->f_mode & FMODE_PATH))) {
+			fmode_t add_perms = 0;
+
+			if (!(nd->acc_mode & MAY_READ) &&
+			    !inode_permission(file->f_inode, MAY_READ))
+				add_perms |= FMODE_PATH_READ;
+			if (!(nd->acc_mode & MAY_WRITE) &&
+			    !inode_permission(file->f_inode, MAY_WRITE))
+				add_perms |= FMODE_PATH_WRITE;
+
+			file->f_mode |= add_perms & nd->opath_mask;
+		}
 		if (likely(file->f_mode & FMODE_OPENED))
 			return file;
 		WARN_ON(1);
diff --git a/fs/open.c b/fs/open.c
index a00350018a47..c1d4f343e85f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -981,8 +981,9 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		acc_mode |= MAY_APPEND;
 
 	op->acc_mode = acc_mode;
-
 	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+	/* For O_PATH backwards-compatibility we default to an all-set mask. */
+	op->opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE;
 
 	if (flags & O_CREAT) {
 		op->intent |= LOOKUP_CREATE;
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 81882a13212d..8c0ccf2b703b 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -104,11 +104,17 @@ static void tid_fd_update_inode(struct task_struct *task, struct inode *inode,
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 
 	if (S_ISLNK(inode->i_mode)) {
-		unsigned i_mode = S_IFLNK;
-		if (f_mode & FMODE_READ)
-			i_mode |= S_IRUSR | S_IXUSR;
-		if (f_mode & FMODE_WRITE)
-			i_mode |= S_IWUSR | S_IXUSR;
+		unsigned i_mode = S_IFLNK | S_IXUGO;
+		/*
+		 * Construct the mode bits based on the open-mode. Note that
+		 * giving o+rwx permissions here is not a problem since all
+		 * /proc/self/fd magic-link resolution is gated with
+		 * ptrace_may_access().
+		 */
+		if (f_mode & (FMODE_READ | FMODE_PATH_READ))
+			i_mode |= S_IRUGO;
+		if (f_mode & (FMODE_WRITE | FMODE_PATH_WRITE))
+			i_mode |= S_IWUGO;
 		inode->i_mode = i_mode;
 	}
 	security_task_to_inode(task, inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fe94c48481a6..0b2af61411cb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -173,6 +173,10 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File is an O_PATH descriptor which can be upgraded to (read, write). */
+#define FMODE_PATH_READ		((__force fmode_t)0x40000000)
+#define FMODE_PATH_WRITE	((__force fmode_t)0x80000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are
diff --git a/include/linux/types.h b/include/linux/types.h
index cc0dbbe551d5..45aaf711885c 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -157,7 +157,7 @@ typedef u32 dma_addr_t;
 
 typedef unsigned int __bitwise gfp_t;
 typedef unsigned int __bitwise slab_flags_t;
-typedef unsigned int __bitwise fmode_t;
+typedef unsigned long __bitwise fmode_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 02/10] procfs: switch magic-link modes to be more sane
From: Aleksa Sarai @ 2019-05-20 13:32 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Now that magic-link modes are obeyed for file re-opening purposes, some
of the pre-existing magic-link modes need to be adjusted to be more
semantically correct.

The most blatant example of this is /proc/self/exe, which had a mode of
a+rwx even though tautologically the file could never be opened for
writing (because it is the current->mm of a live process).

With the new O_PATH restrictions, changing the default mode of these
magic-links allows us to avoid delayed-access attacks such as we saw in
CVE-2019-5736.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/proc/base.c       | 20 ++++++++++----------
 fs/proc/namespaces.c |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..17fd447043ff 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -133,9 +133,9 @@ struct pid_entry {
 
 #define DIR(NAME, MODE, iops, fops)	\
 	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
-#define LNK(NAME, get_link)					\
-	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
-		&proc_pid_link_inode_operations, NULL,		\
+#define LNK(NAME, MODE, get_link)			\
+	NOD(NAME, (S_IFLNK|(MODE)),			\
+		&proc_pid_link_inode_operations, NULL,	\
 		{ .proc_get_link = get_link } )
 #define REG(NAME, MODE, fops)				\
 	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
@@ -2995,9 +2995,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
 #endif
 	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
-	LNK("cwd",        proc_cwd_link),
-	LNK("root",       proc_root_link),
-	LNK("exe",        proc_exe_link),
+	LNK("cwd",        S_IRWXUGO, proc_cwd_link),
+	LNK("root",       S_IRWXUGO, proc_root_link),
+	LNK("exe",        S_IRUGO|S_IXUGO, proc_exe_link),
 	REG("mounts",     S_IRUGO, proc_mounts_operations),
 	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 	REG("mountstats", S_IRUSR, proc_mountstats_operations),
@@ -3394,11 +3394,11 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations),
 #endif
 	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
-	LNK("cwd",       proc_cwd_link),
-	LNK("root",      proc_root_link),
-	LNK("exe",       proc_exe_link),
+	LNK("cwd",       S_IRWXUGO, proc_cwd_link),
+	LNK("root",      S_IRWXUGO, proc_root_link),
+	LNK("exe",       S_IRUGO|S_IXUGO, proc_exe_link),
 	REG("mounts",    S_IRUGO, proc_mounts_operations),
-	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+	REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",     S_IRUGO, proc_pid_smaps_operations),
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..cd1e130913f7 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -94,7 +94,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
 	struct inode *inode;
 	struct proc_inode *ei;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO);
+	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 03/10] open: O_EMPTYPATH: procfs-less file descriptor re-opening
From: Aleksa Sarai @ 2019-05-20 13:32 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Userspace has made use of /proc/self/fd very liberally to allow for
descriptors to be re-opened. There are a wide variety of uses for this
feature, but it has always required constructing a pathname and could
not be done without procfs mounted. The obvious solution for this is to
extend openat(2) to have an AT_EMPTY_PATH-equivalent -- O_EMPTYPATH.

Now that descriptor re-opening has been made safe through the new
magic-link resolution restrictions, we can replicate these restrictions
for O_EMPTYPATH. In particular, we only allow "upgrading" the file
descriptor if the corresponding FMODE_PATH_* bit is set (or the
FMODE_{READ,WRITE} cases for non-O_PATH file descriptors).

When doing openat(O_EMPTYPATH|O_PATH), O_PATH takes precedence and
O_EMPTYPATH is ignored. Very few users ever have a need to O_PATH
re-open an existing file descriptor, and so accommodating them at the
expense of further complicating O_PATH makes little sense. Ultimately,
if users ask for this we can always add RESOLVE_EMPTY_PATH to
resolveat(2) in the future.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/fcntl.c                       |  2 +-
 fs/namei.c                       | 27 +++++++++++++++++++++++++++
 fs/open.c                        |  7 ++++++-
 include/linux/fcntl.h            |  2 +-
 include/uapi/asm-generic/fcntl.h |  5 +++++
 5 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 083185174c6d..6c85c4d0c006 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1031,7 +1031,7 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+	BUILD_BUG_ON(22 - 1 /* for O_RDONLY being 0 */ !=
 		HWEIGHT32(
 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
 			__FMODE_EXEC | __FMODE_NONOTIFY));
diff --git a/fs/namei.c b/fs/namei.c
index 5ff61624b8f0..46c5302eea4a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3599,6 +3599,31 @@ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
 	return error;
 }
 
+static int do_o_emptypath(struct nameidata *nd, struct file *newfile)
+{
+	int error;
+	struct fd f;
+
+	/* We don't support AT_FDCWD since O_PATH cannot be set here. */
+	f = fdget_raw(nd->dfd);
+	if (!f.file)
+		return -EBADF;
+
+	/* Update opath_mask as though we went through trailing_symlink(). */
+	if (!(f.file->f_mode & (FMODE_READ | FMODE_PATH_READ)))
+		nd->opath_mask &= ~FMODE_PATH_READ;
+	if (!(f.file->f_mode & (FMODE_WRITE | FMODE_PATH_WRITE)))
+		nd->opath_mask &= ~FMODE_PATH_WRITE;
+
+	/* Obey the same restrictions as magic-links. */
+	error = may_open_magiclink(f.file->f_mode, nd->acc_mode);
+	if (!error)
+		error = vfs_open(&f.file->f_path, newfile);
+
+	fdput(f);
+	return error;
+}
+
 static struct file *path_openat(struct nameidata *nd,
 			const struct open_flags *op, unsigned flags)
 {
@@ -3614,6 +3639,8 @@ static struct file *path_openat(struct nameidata *nd,
 
 	if (unlikely(file->f_flags & __O_TMPFILE)) {
 		error = do_tmpfile(nd, flags, op, file);
+	} else if (unlikely(file->f_flags & O_EMPTYPATH)) {
+		error = do_o_emptypath(nd, file);
 	} else if (unlikely(file->f_flags & O_PATH)) {
 		error = do_o_path(nd, flags, file);
 	} else {
diff --git a/fs/open.c b/fs/open.c
index c1d4f343e85f..a6ccfea899b9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -995,6 +995,8 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		lookup_flags |= LOOKUP_DIRECTORY;
 	if (!(flags & O_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
+	if (flags & O_EMPTYPATH)
+		lookup_flags |= LOOKUP_EMPTY;
 	op->lookup_flags = lookup_flags;
 	return 0;
 }
@@ -1056,14 +1058,17 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
 	int fd = build_open_flags(flags, mode, &op);
+	int empty = 0;
 	struct filename *tmp;
 
 	if (fd)
 		return fd;
 
-	tmp = getname(filename);
+	tmp = getname_flags(filename, op.lookup_flags, &empty);
 	if (IS_ERR(tmp))
 		return PTR_ERR(tmp);
+	if (!empty)
+		op.open_flag &= ~O_EMPTYPATH;
 
 	fd = get_unused_fd_flags(flags);
 	if (fd >= 0) {
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index d019df946cb2..2868ae6c8fc1 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -9,7 +9,7 @@
 	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
 	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
-	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_EMPTYPATH)
 
 #ifndef force_o_largefile
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 9dc0bf0c5a6e..307f7c414a51 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -89,6 +89,11 @@
 #define __O_TMPFILE	020000000
 #endif
 
+#ifndef O_EMPTYPATH
+#define O_EMPTYPATH 040000000
+#endif
+
+
 /* a horrid kludge trying to make sure that this will fail on old kernels */
 #define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
 #define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)      
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 04/10] namei: split out nd->dfd handling to dirfd_path_init
From: Aleksa Sarai @ 2019-05-20 13:32 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Previously, path_init's handling of *at(dfd, ...) was only done once,
but with LOOKUP_BENEATH (and LOOKUP_IN_ROOT) we have to parse the
initial nd->path at different times (before or after absolute path
handling) depending on whether we have been asked to scope resolution
within a root.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c | 103 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 44 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 46c5302eea4a..dc323e25d4d2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2251,9 +2251,59 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	}
 }
 
+/*
+ * Configure nd->path based on the nd->dfd. This is only used as part of
+ * path_init().
+ */
+static inline int dirfd_path_init(struct nameidata *nd)
+{
+	if (nd->dfd == AT_FDCWD) {
+		if (nd->flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
+
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->inode = nd->path.dentry->d_inode;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+			nd->inode = nd->path.dentry->d_inode;
+		}
+	} else {
+		/* Caller must check execute permissions on the starting path component */
+		struct fd f = fdget_raw(nd->dfd);
+		struct dentry *dentry;
+
+		if (!f.file)
+			return -EBADF;
+
+		dentry = f.file->f_path.dentry;
+
+		if (*nd->name->name && unlikely(!d_can_lookup(dentry))) {
+			fdput(f);
+			return -ENOTDIR;
+		}
+
+		nd->path = f.file->f_path;
+		if (nd->flags & LOOKUP_RCU) {
+			nd->inode = nd->path.dentry->d_inode;
+			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		} else {
+			path_get(&nd->path);
+			nd->inode = nd->path.dentry->d_inode;
+		}
+		fdput(f);
+	}
+	return 0;
+}
+
 /* must be paired with terminate_walk() */
 static const char *path_init(struct nameidata *nd, unsigned flags)
 {
+	int error;
 	const char *s = nd->name->name;
 
 	if (!*s)
@@ -2287,52 +2337,17 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*s == '/') {
-		set_root(nd);
-		if (likely(!nd_jump_root(nd)))
-			return s;
-		return ERR_PTR(-ECHILD);
-	} else if (nd->dfd == AT_FDCWD) {
-		if (flags & LOOKUP_RCU) {
-			struct fs_struct *fs = current->fs;
-			unsigned seq;
-
-			do {
-				seq = read_seqcount_begin(&fs->seq);
-				nd->path = fs->pwd;
-				nd->inode = nd->path.dentry->d_inode;
-				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			} while (read_seqcount_retry(&fs->seq, seq));
-		} else {
-			get_fs_pwd(current->fs, &nd->path);
-			nd->inode = nd->path.dentry->d_inode;
-		}
-		return s;
-	} else {
-		/* Caller must check execute permissions on the starting path component */
-		struct fd f = fdget_raw(nd->dfd);
-		struct dentry *dentry;
-
-		if (!f.file)
-			return ERR_PTR(-EBADF);
-
-		dentry = f.file->f_path.dentry;
-
-		if (*s && unlikely(!d_can_lookup(dentry))) {
-			fdput(f);
-			return ERR_PTR(-ENOTDIR);
-		}
-
-		nd->path = f.file->f_path;
-		if (flags & LOOKUP_RCU) {
-			nd->inode = nd->path.dentry->d_inode;
-			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		} else {
-			path_get(&nd->path);
-			nd->inode = nd->path.dentry->d_inode;
-		}
-		fdput(f);
+		if (likely(!nd->root.mnt))
+			set_root(nd);
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			s = ERR_PTR(error);
 		return s;
 	}
+	error = dirfd_path_init(nd);
+	if (unlikely(error))
+		return ERR_PTR(error);
+	return s;
 }
 
 static const char *trailing_symlink(struct nameidata *nd)
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 05/10] namei: O_BENEATH-style path resolution flags
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Christian Brauner, David Drysdale, Andy Lutomirski,
	Linus Torvalds, Eric Biederman, Andrew Morton, Alexei Starovoitov,
	Kees Cook, Jann Horn, Tycho Andersen, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, containers, linux-kselftest, linux-fsdevel,
	linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Add the following flags to allow various restrictions on path
resolution (these affect the *entire* resolution, rather than just the
final path component -- as is the case with most other AT_* flags).

The primary justification for these flags is to allow for programs to be
far more strict about how they want path resolution to handle symlinks,
mountpoint crossings, and paths that escape the dirfd (through an
absolute path or ".." shenanigans).

This is of particular concern to container runtimes that want to be very
careful about malicious root filesystems that a container's init might
have screwed around with (and there is no real way to protect against
this in userspace if you consider potential races against a malicious
container's init). More classical applications (which have their own
potentially buggy userspace path sanitisation code) include web
servers, archive extraction tools, network file servers, and so on.

These flags are exposed to userspace in a later patchset.

* LOOKUP_XDEV: Disallow mount-point crossing (both *down* into one, or
  *up* from one). The primary "scoping" use is to blocking resolution
  that crosses a bind-mount, which has a similar property to a symlink
  (in the way that it allows for escape from the starting-point). Since
  it is not possible to differentiate bind-mounts However since
  bind-mounting requires privileges (in ways symlinks don't) this has
  been split from LOOKUP_BENEATH. The naming is based on "find -xdev" as
  well as -EXDEV (though find(1) doesn't walk upwards, the semantics
  seem obvious).

* LOOKUP_NO_MAGICLINKS: Disallows ->get_link "symlink" jumping. This is
  a very specific restriction, and it exists because /proc/$pid/fd/...
  "symlinks" allow for access outside nd->root and pose risk to
  container runtimes that don't want to be tricked into accessing a host
  path (but do want to allow no-funny-business symlink resolution).

* LOOKUP_NO_SYMLINKS: Disallows symlink jumping *of any kind*. Implies
  LOOKUP_NO_MAGICLINKS (obviously).

* LOOKUP_BENEATH: Disallow "escapes" from the starting point of the
  filesystem tree during resolution (you must stay "beneath" the
  starting point at all times). Currently this is done by disallowing
  ".." and absolute paths (either in the given path or found during
  symlink resolution) entirely, as well as all magic-link jumping.

  The wholesale banning of ".." is because it is currently not safe to
  allow ".." resolution (races can cause the path to be moved outside of
  the root -- this is conceptually similar to historical chroot(2)
  escape attacks). Future patches in this series will address this, and
  will re-enable ".." resolution once it is safe. With those patches,
  ".." resolution will only be allowed if it remains in the root
  throughout resolution (such as "a/../b" not "a/../../outside/b").

  The banning of magic-link jumping is done because it is not clear
  whether semantically they should be allowed -- while some magic-links
  are safe there are many that can cause escapes (and once a
  resolution is outside of the root, O_BENEATH will no longer detect
  it). Future patches may re-enable magic-link jumping when such jumps
  would remain inside the root.

The LOOKUP_NO_*LINK flags return -ELOOP if path resolution would
violates their requirement, while the others all return -EXDEV.

This is a refresh of Al's AT_NO_JUMPS patchset[1] (which was a variation
on David Drysdale's O_BENEATH patchset[2], which in turn was based on
the Capsicum project[3]). Input from Linus and Andy in the AT_NO_JUMPS
thread[4] determined most of the API changes made in this refresh.

[1]: https://lwn.net/Articles/721443/
[2]: https://lwn.net/Articles/619151/
[3]: https://lwn.net/Articles/603929/
[4]: https://lwn.net/Articles/723057/

Cc: Christian Brauner <christian@brauner.io>
Suggested-by: David Drysdale <drysdale@google.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c            | 74 ++++++++++++++++++++++++++++++++++++-------
 include/linux/namei.h |  7 ++++
 2 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index dc323e25d4d2..f997c82eb9c2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -852,6 +852,13 @@ static inline void path_to_nameidata(const struct path *path,
 
 static int nd_jump_root(struct nameidata *nd)
 {
+	if (unlikely(nd->flags & LOOKUP_BENEATH))
+		return -EXDEV;
+	if (unlikely(nd->flags & LOOKUP_XDEV)) {
+		/* Absolute path arguments to path_init() are allowed. */
+		if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
+			return -EXDEV;
+	}
 	if (nd->flags & LOOKUP_RCU) {
 		struct dentry *d;
 		nd->path = nd->root;
@@ -1092,6 +1099,9 @@ const char *get_link(struct nameidata *nd, bool trailing)
 	int error;
 	const char *res;
 
+	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+		return ERR_PTR(-ELOOP);
+
 	if (!(nd->flags & LOOKUP_RCU)) {
 		touch_atime(&last->link);
 		cond_resched();
@@ -1124,6 +1134,11 @@ const char *get_link(struct nameidata *nd, bool trailing)
 		}
 		/* If we just jumped it was because of a magic-link. */
 		if (unlikely(nd->flags & LOOKUP_JUMPED)) {
+			if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
+				return ERR_PTR(-ELOOP);
+			/* Not currently safe. */
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return ERR_PTR(-EXDEV);
 			/*
 			 * For trailing_symlink we check whether the symlink's
 			 * mode allows us to do what we want through acc_mode.
@@ -1172,8 +1187,9 @@ const char *get_link(struct nameidata *nd, bool trailing)
 	if (*res == '/') {
 		if (!nd->root.mnt)
 			set_root(nd);
-		if (unlikely(nd_jump_root(nd)))
-			return ERR_PTR(-ECHILD);
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
 		while (unlikely(*++res == '/'))
 			;
 	}
@@ -1354,12 +1370,16 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 		break;
 	}
 
-	if (need_mntput && path->mnt == mnt)
-		mntput(path->mnt);
+	if (need_mntput) {
+		if (path->mnt == mnt)
+			mntput(path->mnt);
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			ret = -EXDEV;
+		else
+			nd->flags |= LOOKUP_JUMPED;
+	}
 	if (ret == -EISDIR || !ret)
 		ret = 1;
-	if (need_mntput)
-		nd->flags |= LOOKUP_JUMPED;
 	if (unlikely(ret < 0))
 		path_put_conditional(path, nd);
 	return ret;
@@ -1416,6 +1436,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return false;
 		path->mnt = &mounted->mnt;
 		path->dentry = mounted->mnt.mnt_root;
 		nd->flags |= LOOKUP_JUMPED;
@@ -1436,8 +1458,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	struct inode *inode = nd->inode;
 
 	while (1) {
-		if (path_equal(&nd->path, &nd->root))
+		if (path_equal(&nd->path, &nd->root)) {
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return -EXDEV;
 			break;
+		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
 			struct dentry *parent = old->d_parent;
@@ -1462,6 +1487,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 				return -ECHILD;
 			if (&mparent->mnt == nd->path.mnt)
 				break;
+			if (unlikely(nd->flags & LOOKUP_XDEV))
+				return -EXDEV;
 			/* we know that mountpoint was pinned */
 			nd->path.dentry = mountpoint;
 			nd->path.mnt = &mparent->mnt;
@@ -1476,6 +1503,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			return -ECHILD;
 		if (!mounted)
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return -EXDEV;
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
 		inode = nd->path.dentry->d_inode;
@@ -1564,8 +1593,11 @@ static int path_parent_directory(struct path *path)
 static int follow_dotdot(struct nameidata *nd)
 {
 	while(1) {
-		if (path_equal(&nd->path, &nd->root))
+		if (path_equal(&nd->path, &nd->root)) {
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return -EXDEV;
 			break;
+		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			int ret = path_parent_directory(&nd->path);
 			if (ret)
@@ -1574,6 +1606,8 @@ static int follow_dotdot(struct nameidata *nd)
 		}
 		if (!follow_up(&nd->path))
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return -EXDEV;
 	}
 	follow_mount(&nd->path);
 	nd->inode = nd->path.dentry->d_inode;
@@ -1788,6 +1822,13 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
+		/*
+		 * LOOKUP_BENEATH resolving ".." is not currently safe -- races can
+		 * cause our parent to have moved outside of the root and us to skip
+		 * over it.
+		 */
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -EXDEV;
 		if (!nd->root.mnt)
 			set_root(nd);
 		if (nd->flags & LOOKUP_RCU) {
@@ -2336,6 +2377,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->path.dentry = NULL;
 
 	nd->m_seq = read_seqbegin(&mount_lock);
+
+	if (unlikely(nd->flags & LOOKUP_BENEATH)) {
+		error = dirfd_path_init(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+		nd->root = nd->path;
+		if (!(nd->flags & LOOKUP_RCU))
+			path_get(&nd->root);
+	}
 	if (*s == '/') {
 		if (likely(!nd->root.mnt))
 			set_root(nd);
@@ -2344,9 +2394,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			s = ERR_PTR(error);
 		return s;
 	}
-	error = dirfd_path_init(nd);
-	if (unlikely(error))
-		return ERR_PTR(error);
+	if (likely(!nd->path.mnt)) {
+		error = dirfd_path_init(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+	}
 	return s;
 }
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9138b4471dbf..7bc819ad0cd3 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -50,6 +50,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_EMPTY		0x4000
 #define LOOKUP_DOWN		0x8000
 
+/* Scoping flags for lookup. */
+#define LOOKUP_BENEATH		0x010000 /* No escaping from starting point. */
+#define LOOKUP_XDEV		0x020000 /* No mountpoint crossing. */
+#define LOOKUP_NO_MAGICLINKS	0x040000 /* No /proc/$pid/fd/ "symlink" crossing. */
+#define LOOKUP_NO_SYMLINKS	0x080000 /* No symlink crossing *at all*.
+					    Implies LOOKUP_NO_MAGICLINKS. */
+
 extern int path_pts(struct path *path);
 
 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 06/10] namei: LOOKUP_IN_ROOT: chroot-like path resolution
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Christian Brauner, Eric Biederman, Andy Lutomirski,
	Andrew Morton, Alexei Starovoitov, Kees Cook, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

The primary motivation for the need for this flag is container runtimes
which have to interact with malicious root filesystems in the host
namespaces. One of the first requirements for a container runtime to be
secure against a malicious rootfs is that they correctly scope symlinks
(that is, they should be scoped as though they are chroot(2)ed into the
container's rootfs) and ".."-style paths[*]. The already-existing
LOOKUP_XDEV and LOOKUP_NO_MAGICLINKS help defend against other potential
attacks in a malicious rootfs scenario.

Currently most container runtimes try to do this resolution in
userspace[1], causing many potential race conditions. In addition, the
"obvious" alternative (actually performing a {ch,pivot_}root(2))
requires a fork+exec (for some runtimes) which is *very* costly if
necessary for every filesystem operation involving a container.

[*] At the moment, ".." and magic-link jumping are disallowed for the
    same reason it is disabled for LOOKUP_BENEATH -- currently it is not
    safe to allow it. Future patches may enable it unconditionally once
    we have resolved the possible races (for "..") and semantics (for
    magic-link jumping).

The most significant *at(2) semantic change with LOOKUP_IN_ROOT is that
absolute pathnames no longer cause dirfd to be ignored completely. The
rationale is that LOOKUP_IN_ROOT must necessarily chroot-scope symlinks
with absolute paths to dirfd, and so doing it for the base path seems to
be the most consistent behaviour (and also avoids foot-gunning users who
want to scope paths that are absolute).

[1]: https://github.com/cyphar/filepath-securejoin

Co-developed-by: Christian Brauner <christian@brauner.io>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c            | 6 +++---
 include/linux/namei.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index f997c82eb9c2..d18671a06bdb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1137,7 +1137,7 @@ const char *get_link(struct nameidata *nd, bool trailing)
 			if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
 				return ERR_PTR(-ELOOP);
 			/* Not currently safe. */
-			if (unlikely(nd->flags & LOOKUP_BENEATH))
+			if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
 				return ERR_PTR(-EXDEV);
 			/*
 			 * For trailing_symlink we check whether the symlink's
@@ -1827,7 +1827,7 @@ static inline int handle_dots(struct nameidata *nd, int type)
 		 * cause our parent to have moved outside of the root and us to skip
 		 * over it.
 		 */
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
+		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
 			return -EXDEV;
 		if (!nd->root.mnt)
 			set_root(nd);
@@ -2378,7 +2378,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 
-	if (unlikely(nd->flags & LOOKUP_BENEATH)) {
+	if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
 		error = dirfd_path_init(nd);
 		if (unlikely(error))
 			return ERR_PTR(error);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 7bc819ad0cd3..4b1ee717cb14 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -56,6 +56,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_NO_MAGICLINKS	0x040000 /* No /proc/$pid/fd/ "symlink" crossing. */
 #define LOOKUP_NO_SYMLINKS	0x080000 /* No symlink crossing *at all*.
 					    Implies LOOKUP_NO_MAGICLINKS. */
+#define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as %current->fs->root. */
 
 extern int path_pts(struct path *path);
 
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 07/10] namei: aggressively check for nd->root escape on ".." resolution
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Jann Horn, Kees Cook, Eric Biederman,
	Andy Lutomirski, Andrew Morton, Alexei Starovoitov,
	Christian Brauner, Tycho Andersen, David Drysdale, Chanho Min,
	Oleg Nesterov, Aleksa Sarai, Linus Torvalds, containers,
	linux-kselftest, linux-fsdevel, linux-api, linux-kernel,
	linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

This patch allows for LOOKUP_BENEATH and LOOKUP_IN_ROOT to safely permit
".." resolution (in the case of LOOKUP_BENEATH the resolution will still
fail if ".." resolution would resolve a path outside of the root --
while LOOKUP_IN_ROOT will chroot(2)-style scope it). magic-link jumps
are still disallowed entirely because now they could result in
inconsistent behaviour if resolution encounters a subsequent "..".

The need for this patch is explained by observing there is a fairly
easy-to-exploit race condition with chroot(2) (and thus by extension
LOOKUP_IN_ROOT and LOOKUP_BENEATH if ".." is allowed) where a rename(2)
of a path can be used to "skip over" nd->root and thus escape to the
filesystem above nd->root.

  thread1 [attacker]:
    for (;;)
      renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
  thread2 [victim]:
    for (;;)
      resolveat(dirb, "b/c/../../etc/shadow", RESOLVE_IN_ROOT);

With fairly significant regularity, thread2 will resolve to
"/etc/shadow" rather than "/a/b/etc/shadow". There is also a similar
(though somewhat more privileged) attack using MS_MOVE.

With this patch, such cases will be detected *during* ".." resolution
(which is the weak point of chroot(2) -- since walking *into* a
subdirectory tautologically cannot result in you walking *outside*
nd->root -- except through a bind-mount or magic-link). By detecting
this at ".." resolution (rather than checking only at the end of the
entire resolution) we can both correct escapes by jumping back to the
root (in the case of LOOKUP_IN_ROOT), as well as avoid revealing to
attackers the structure of the filesystem outside of the root (through
timing attacks for instance).

In order to avoid a quadratic lookup with each ".." entry, we only
activate the slow path if a write through &rename_lock or &mount_lock
has occurred during path resolution (&rename_lock and &mount_lock are
re-taken to further optimise the lookup). Since the primary attack being
protected against is MS_MOVE or rename(2), not doing additional checks
unless a mount or rename have occurred avoids making the common case
slow.

The use of path_is_under() here might seem suspect, but on further
inspection of the most important race (a path was *inside* the root but
is now *outside*), there appears to be no attack potential:

  * If path_is_under() occurs before the rename, then the path will be
    resolved -- however the path was originally inside the root and thus
    there is no escape (and to userspace it'd look like the rename
    occurred after the path was resolved). If path_is_under() occurs
    afterwards, the resolution is blocked.

  * Subsequent ".." jumps are guaranteed to check path_is_under() -- by
    construction, &rename_lock or &mount_lock must have been taken by
    the attacker after path_is_under() returned in the victim. Thus ".."
    will not be able to escape from the previously-inside-root path.

  * Walking down in the moved path is still safe since the entire
    subtree was moved (either by rename(2) or MS_MOVE) and because (as
    discussed above) walking down is safe.

I have run a variant of the above attack in a loop on several machines
with this patch, and no instances of a breakout were detected. While
this is not concrete proof that this is safe, when combined with the
above argument it should lend some trustworthiness to this construction.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index d18671a06bdb..6c3bbe627965 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -491,7 +491,7 @@ struct nameidata {
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq, m_seq;
+	unsigned	seq, m_seq, r_seq;
 	int		last_type;
 	unsigned	depth;
 	int		total_link_count;
@@ -1822,19 +1822,35 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
-		/*
-		 * LOOKUP_BENEATH resolving ".." is not currently safe -- races can
-		 * cause our parent to have moved outside of the root and us to skip
-		 * over it.
-		 */
-		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
-			return -EXDEV;
+		int error = 0;
+
 		if (!nd->root.mnt)
 			set_root(nd);
-		if (nd->flags & LOOKUP_RCU) {
-			return follow_dotdot_rcu(nd);
-		} else
-			return follow_dotdot(nd);
+		if (nd->flags & LOOKUP_RCU)
+			error = follow_dotdot_rcu(nd);
+		else
+			error = follow_dotdot(nd);
+		if (error)
+			return error;
+
+		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
+			bool m_retry = read_seqretry(&mount_lock, nd->m_seq);
+			bool r_retry = read_seqretry(&rename_lock, nd->r_seq);
+
+			/*
+			 * Don't bother checking unless there's a racing
+			 * rename(2) or MS_MOVE.
+			 */
+			if (likely(!m_retry && !r_retry))
+				return 0;
+
+			if (m_retry && !(nd->flags & LOOKUP_RCU))
+				nd->m_seq = read_seqbegin(&mount_lock);
+			if (r_retry)
+				nd->r_seq = read_seqbegin(&rename_lock);
+			if (!path_is_under(&nd->path, &nd->root))
+				return -EXDEV;
+		}
 	}
 	return 0;
 }
@@ -2355,6 +2371,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
 	nd->depth = 0;
+
+	nd->m_seq = read_seqbegin(&mount_lock);
+	if (unlikely(flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
+		nd->r_seq = read_seqbegin(&rename_lock);
+
 	if (flags & LOOKUP_ROOT) {
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
@@ -2365,7 +2386,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		if (flags & LOOKUP_RCU) {
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			nd->root_seq = nd->seq;
-			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -2376,8 +2396,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
 
-	nd->m_seq = read_seqbegin(&mount_lock);
-
 	if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
 		error = dirfd_path_init(nd);
 		if (unlikely(error))
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 08/10] namei: resolveat(2) syscall
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Christian Brauner, Eric Biederman, Andy Lutomirski,
	Andrew Morton, Alexei Starovoitov, Kees Cook, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

The most obvious syscall to add support for the new LOOKUP_* scoping
flags would be openat(2) (along with the required execveat(2) change
included in this series). However, there are a few reasons to not do
this:

 * The new LOOKUP_* flags are intended to be security features, and
   openat(2) will silently ignore all unknown flags. This means that
   users would need to avoid foot-gunning themselves constantly when
   using this interface if it were part of openat(2).

 * Resolution scoping feels like a different operation to the existing
   O_* flags. And since openat(2) has limited flag space, it seems to be
   quite wasteful to clutter it with 5 flags that are all
   resolution-related. Arguably O_NOFOLLOw is also a resolution flag but
   its entire purpose is to error out if you encounter a trailing
   symlink not to scope resolution.

 * Other systems would be able to reimplement this syscall allowing for
   cross-OS standardisation rather than being hidden amongst O_* flags
   which may result in it not being used by all the parties that might
   want to use it (file servers, web servers, container runtimes, etc).

 * It gives us the opportunity to iterate on the O_PATH interface. In
   particular, the RESOLVE_UPGRADE_* flags are only possible because we
   have a clean slate without needing to re-use the ACC_MODE flag
   design.

To this end, we introduce the resolveat(2) syscall. At the moment it's
effectively another way of getting a bog-standard O_PATH descriptor but
with the ability to use the new LOOKUP_* flags. Following the example of
other new file-handle-creation syscalls (pidfds, seccomp notify fds) we
default to O_CLOEXEC and users can unset O_CLOEXEC using fcntl(2) if
they really want to.

Because resolveat(2) only provides the ability to get O_PATH
descriptors, users will need to get creative with /proc/self/fd in order
to get a usable file descriptor for other uses. However, the new
addition of O_EMPTYPATH shortcuts this problem and allows for easy usage
of the new LOOKUP_* flags with openat(2) without cluttering the open(2)
flag bits.

In order to allow for userspace to lock down their usage of file
descriptor re-opening, resolveat(2) has flags (RESOLVE_UPGRADE_*) which
allow users to disallow certain re-opening modes.

Co-developed-by: Christian Brauner <christian@brauner.io>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |  1 +
 arch/arm/tools/syscall.tbl                  |  1 +
 arch/arm64/include/asm/unistd.h             |  2 +-
 arch/arm64/include/asm/unistd32.h           |  3 ++
 arch/ia64/kernel/syscalls/syscall.tbl       |  1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |  1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |  1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |  1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |  1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |  1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |  1 +
 arch/s390/kernel/syscalls/syscall.tbl       |  1 +
 arch/sh/kernel/syscalls/syscall.tbl         |  1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |  1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |  1 +
 fs/namei.c                                  | 50 +++++++++++++++++++++
 include/linux/fcntl.h                       |  8 +++-
 include/uapi/asm-generic/unistd.h           |  5 ++-
 include/uapi/linux/fcntl.h                  | 10 +++++
 22 files changed, 91 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 63ed39cbd3bd..72f431b1dc9c 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -461,5 +461,6 @@
 530	common	getegid				sys_getegid
 531	common	geteuid				sys_geteuid
 532	common	getppid				sys_getppid
+533	common	resolveat			sys_resolveat
 # all other architectures have common numbers for new syscall, alpha
 # is the exception.
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 9016f4081bb9..11f19c78185e 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -437,3 +437,4 @@
 421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index d1dd93436e1e..d04eb26cfaeb 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -44,7 +44,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		424
+#define __NR_compat_syscalls		436
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 5590f2623690..9978234748a9 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -867,6 +867,9 @@ __SYSCALL(__NR_futex_time64, sys_futex)
 #define __NR_sched_rr_get_interval_time64 423
 __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval)
 
+#define __NR_resolveat 435
+__SYSCALL(__NR_resolveat, sys_sched_rr_get_interval)
+
 /*
  * Please add new compat syscalls above this comment and update
  * __NR_compat_syscalls in asm/unistd.h.
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index ab9cda5f6136..26a3daab202e 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -344,3 +344,4 @@
 332	common	pkey_free			sys_pkey_free
 333	common	rseq				sys_rseq
 # 334 through 423 are reserved to sync up with other architectures
+435	common	resolveat			sys_resolveat
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 125c14178979..3fdbd42077f1 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -423,3 +423,4 @@
 421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 8ee3a8c18498..6948436a939b 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -429,3 +429,4 @@
 421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 15f4117900ee..36795694e495 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -362,3 +362,4 @@
 421	n32	rt_sigtimedwait_time64		compat_sys_rt_sigtimedwait_time64
 422	n32	futex_time64			sys_futex
 423	n32	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	n32	resolveat			sys_resolveat
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index c85502e67b44..5fcde2d31fe7 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -338,3 +338,4 @@
 327	n64	rseq				sys_rseq
 328	n64	io_pgetevents			sys_io_pgetevents
 # 329 through 423 are reserved to sync up with other architectures
+435	n64	resolveat			sys_resolveat
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2e063d0f837e..71152b788e37 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -411,3 +411,4 @@
 421	o32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
 422	o32	futex_time64			sys_futex			sys_futex
 423	o32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
+435	o32	resolveat			sys_resolveat			sys_resolveat
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index b26766c6647d..68e415658235 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -420,3 +420,4 @@
 421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
 422	32	futex_time64			sys_futex			sys_futex
 423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat			sys_resolveat
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index b18abb0c3dae..69b0bf1df8b1 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -505,3 +505,4 @@
 421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
 422	32	futex_time64			sys_futex			sys_futex
 423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat			sys_resolveat
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 02579f95f391..9610f18eb2ea 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -426,3 +426,4 @@
 421	32	rt_sigtimedwait_time64	-				compat_sys_rt_sigtimedwait_time64
 422	32	futex_time64		-				sys_futex
 423	32	sched_rr_get_interval_time64	-			sys_sched_rr_get_interval
+435	common	resolveat		sys_resolveat			-
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index bfda678576e4..ed53ca7717cf 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -426,3 +426,4 @@
 421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index b9a5a04b2d2c..41778972430b 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -469,3 +469,4 @@
 421	32	rt_sigtimedwait_time64		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait_time64
 422	32	futex_time64			sys_futex			sys_futex
 423	32	sched_rr_get_interval_time64	sys_sched_rr_get_interval	sys_sched_rr_get_interval
+435 common	resolveat			sys_resolveat			sys_resolveat
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4cd5f982b1e5..f3cb515e52fb 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -438,3 +438,4 @@
 425	i386	io_uring_setup		sys_io_uring_setup		__ia32_sys_io_uring_setup
 426	i386	io_uring_enter		sys_io_uring_enter		__ia32_sys_io_uring_enter
 427	i386	io_uring_register	sys_io_uring_register		__ia32_sys_io_uring_register
+435	i386	resolveat		sys_resolveat			__ia32_sys_resolveat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 64ca0d06259a..5f4a988f3bc9 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -355,6 +355,7 @@
 425	common	io_uring_setup		__x64_sys_io_uring_setup
 426	common	io_uring_enter		__x64_sys_io_uring_enter
 427	common	io_uring_register	__x64_sys_io_uring_register
+435	common	resolveat		__x64_sys_resolveat
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 6af49929de85..2d5b57ebd6c1 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -394,3 +394,4 @@
 421	common	rt_sigtimedwait_time64		sys_rt_sigtimedwait
 422	common	futex_time64			sys_futex
 423	common	sched_rr_get_interval_time64	sys_sched_rr_get_interval
+435	common	resolveat			sys_resolveat
diff --git a/fs/namei.c b/fs/namei.c
index 6c3bbe627965..4b4eb6c624cb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3790,6 +3790,56 @@ struct file *do_filp_open(int dfd, struct filename *pathname,
 	return filp;
 }
 
+SYSCALL_DEFINE3(resolveat, int, dfd, const char __user *, path,
+		unsigned long, flags)
+{
+	int fd;
+	struct filename *tmp;
+	struct open_flags op = {
+		.open_flag = O_PATH | O_CLOEXEC,
+		.opath_mask = FMODE_PATH_READ | FMODE_PATH_WRITE,
+	};
+
+	if (flags & ~VALID_RESOLVE_FLAGS)
+		return -EINVAL;
+
+	if (flags & RESOLVE_UPGRADE_NOREAD)
+		op.opath_mask &= ~FMODE_PATH_READ;
+	if (flags & RESOLVE_UPGRADE_NOWRITE)
+		op.opath_mask &= ~FMODE_PATH_WRITE;
+
+	if (!(flags & RESOLVE_NO_FOLLOW))
+		op.lookup_flags |= LOOKUP_FOLLOW;
+	if (flags & RESOLVE_BENEATH)
+		op.lookup_flags |= LOOKUP_BENEATH;
+	if (flags & RESOLVE_XDEV)
+		op.lookup_flags |= LOOKUP_XDEV;
+	if (flags & RESOLVE_NO_MAGICLINKS)
+		op.lookup_flags |= LOOKUP_NO_MAGICLINKS;
+	if (flags & RESOLVE_NO_SYMLINKS)
+		op.lookup_flags |= LOOKUP_NO_SYMLINKS;
+	if (flags & RESOLVE_IN_ROOT)
+		op.lookup_flags |= LOOKUP_IN_ROOT;
+
+	tmp = getname(path);
+	if (IS_ERR(tmp))
+		return PTR_ERR(tmp);
+
+	fd = get_unused_fd_flags(op.open_flag);
+	if (fd >= 0) {
+		struct file *f = do_filp_open(dfd, tmp, &op);
+		if (IS_ERR(f)) {
+			put_unused_fd(fd);
+			fd = PTR_ERR(f);
+		} else {
+			fsnotify_open(f);
+			fd_install(fd, f);
+		}
+	}
+	putname(tmp);
+	return fd;
+}
+
 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 		const char *name, const struct open_flags *op)
 {
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 2868ae6c8fc1..6356eb479765 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -4,13 +4,19 @@
 
 #include <uapi/linux/fcntl.h>
 
-/* list of all valid flags for the open/openat flags argument: */
+/* List of all valid flags for the open/openat flags argument: */
 #define VALID_OPEN_FLAGS \
 	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
 	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
 	 FASYNC	| O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
 	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | O_EMPTYPATH)
 
+/* List of all valid flags for resolveat(2). */
+#define VALID_RESOLVE_FLAGS \
+	(RESOLVE_UPGRADE_NOWRITE | RESOLVE_UPGRADE_NOREAD | \
+	 RESOLVE_NO_FOLLOW | RESOLVE_BENEATH | RESOLVE_XDEV | \
+	 RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | RESOLVE_IN_ROOT)
+
 #ifndef force_o_largefile
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index dee7292e1df6..f788c9f7d374 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -833,8 +833,11 @@ __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
 #define __NR_io_uring_register 427
 __SYSCALL(__NR_io_uring_register, sys_io_uring_register)
 
+#define __NR_resolveat 435
+__SYSCALL(__NR_resolveat, sys_resolveat)
+
 #undef __NR_syscalls
-#define __NR_syscalls 428
+#define __NR_syscalls 436
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 1d338357df8a..185712fa9cc0 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -93,5 +93,15 @@
 
 #define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
 
+/* Bottom bit is reserved for RESOLVE_UPGRADE_NOEXEC. */
+#define RESOLVE_UPGRADE_NOWRITE	0x002 /* Disallow re-opening for write. */
+#define RESOLVE_UPGRADE_NOREAD	0x004 /* Disallow re-opening for read. */
+#define RESOLVE_NO_FOLLOW	0x008 /* Don't follow trailing symlinks. */
+
+#define RESOLVE_BENEATH		0x010 /* Block "lexical" trickery like "..", symlinks, absolute paths, etc. */
+#define RESOLVE_XDEV		0x020 /* Block mount-point crossings (includes bind-mounts). */
+#define RESOLVE_NO_MAGICLINKS	0x040 /* Block procfs-style "magic" symlinks. */
+#define RESOLVE_NO_SYMLINKS	0x080 /* Block all symlinks (implies AT_NO_MAGICLINKS). */
+#define RESOLVE_IN_ROOT		0x100 /* Scope ".." and "/" resolution to dirfd (like chroot(2)). */
 
 #endif /* _UAPI_LINUX_FCNTL_H */
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 09/10] kselftest: save-and-restore errno to allow for %m formatting
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Previously, using "%m" in a ksft_* format string can result in strange
output because the errno value wasn't saved before calling other libc
functions. The solution is to simply save and restore the errno before
we format the user-supplied format string.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tools/testing/selftests/kselftest.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
index 47e1d995c182..ce049af7415b 100644
--- a/tools/testing/selftests/kselftest.h
+++ b/tools/testing/selftests/kselftest.h
@@ -10,6 +10,7 @@
 #ifndef __KSELFTEST_H
 #define __KSELFTEST_H
 
+#include <errno.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <stdarg.h>
@@ -72,58 +73,68 @@ static inline void ksft_print_cnts(void)
 
 static inline void ksft_print_msg(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	va_start(args, msg);
 	printf("# ");
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 }
 
 static inline void ksft_test_result_pass(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	ksft_cnt.ksft_pass++;
 
 	va_start(args, msg);
 	printf("ok %d ", ksft_test_num());
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 }
 
 static inline void ksft_test_result_fail(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	ksft_cnt.ksft_fail++;
 
 	va_start(args, msg);
 	printf("not ok %d ", ksft_test_num());
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 }
 
 static inline void ksft_test_result_skip(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	ksft_cnt.ksft_xskip++;
 
 	va_start(args, msg);
 	printf("ok %d # skip ", ksft_test_num());
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 }
 
 static inline void ksft_test_result_error(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	ksft_cnt.ksft_error++;
 
 	va_start(args, msg);
 	printf("not ok %d # error ", ksft_test_num());
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 }
@@ -143,10 +154,12 @@ static inline int ksft_exit_fail(void)
 
 static inline int ksft_exit_fail_msg(const char *msg, ...)
 {
+	int saved_errno = errno;
 	va_list args;
 
 	va_start(args, msg);
 	printf("Bail out! ");
+	errno = saved_errno;
 	vprintf(msg, args);
 	va_end(args);
 
@@ -169,10 +182,12 @@ static inline int ksft_exit_xpass(void)
 static inline int ksft_exit_skip(const char *msg, ...)
 {
 	if (msg) {
+		int saved_errno = errno;
 		va_list args;
 
 		va_start(args, msg);
 		printf("1..%d # Skipped: ", ksft_test_num());
+		errno = saved_errno;
 		vprintf(msg, args);
 		va_end(args);
 	} else {
-- 
2.21.0

^ permalink raw reply related

* [PATCH RFC v8 10/10] selftests: add resolveat(2) selftests
From: Aleksa Sarai @ 2019-05-20 13:33 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells, Shuah Khan, Shuah Khan
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-kselftest,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190520133305.11925-1-cyphar@cyphar.com>

Test all of the various resolveat(2) flags, as well as how file
descriptor re-opening works. A small stress-test of a symlink-rename
attack is included to show that the protections against ".."-based
attacks are sufficient.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tools/testing/selftests/Makefile              |   1 +
 tools/testing/selftests/resolveat/.gitignore  |   1 +
 tools/testing/selftests/resolveat/Makefile    |   6 +
 tools/testing/selftests/resolveat/helpers.h   | 195 +++++++++
 .../selftests/resolveat/linkmode_test.c       | 306 ++++++++++++++
 .../selftests/resolveat/resolveat_test.c      | 400 ++++++++++++++++++
 6 files changed, 909 insertions(+)
 create mode 100644 tools/testing/selftests/resolveat/.gitignore
 create mode 100644 tools/testing/selftests/resolveat/Makefile
 create mode 100644 tools/testing/selftests/resolveat/helpers.h
 create mode 100644 tools/testing/selftests/resolveat/linkmode_test.c
 create mode 100644 tools/testing/selftests/resolveat/resolveat_test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 971fc8428117..f558d6f21c4b 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -37,6 +37,7 @@ TARGETS += powerpc
 TARGETS += proc
 TARGETS += pstore
 TARGETS += ptrace
+TARGETS += resolveat
 TARGETS += rseq
 TARGETS += rtc
 TARGETS += seccomp
diff --git a/tools/testing/selftests/resolveat/.gitignore b/tools/testing/selftests/resolveat/.gitignore
new file mode 100644
index 000000000000..bd68f6c3fd07
--- /dev/null
+++ b/tools/testing/selftests/resolveat/.gitignore
@@ -0,0 +1 @@
+/*_test
diff --git a/tools/testing/selftests/resolveat/Makefile b/tools/testing/selftests/resolveat/Makefile
new file mode 100644
index 000000000000..375eaf4a55e7
--- /dev/null
+++ b/tools/testing/selftests/resolveat/Makefile
@@ -0,0 +1,6 @@
+CFLAGS += -g -I../../../../usr/include/
+
+TEST_GEN_PROGS := linkmode_test resolveat_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/resolveat/helpers.h b/tools/testing/selftests/resolveat/helpers.h
new file mode 100644
index 000000000000..c765f606cdbc
--- /dev/null
+++ b/tools/testing/selftests/resolveat/helpers.h
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#ifndef __RESOLVEAT_H__
+#define __RESOLVEAT_H__
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <syscall.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define ARRAY_LEN(X) (sizeof (X) / sizeof (*(X)))
+
+#ifndef __NR_resolveat
+#define __NR_resolveat 435
+#define RESOLVE_UPGRADE_NOWRITE	0x002 /* Disallow re-opening for write. */
+#define RESOLVE_UPGRADE_NOREAD	0x004 /* Disallow re-opening for read. */
+#define RESOLVE_NO_FOLLOW	0x008 /* Don't follow trailing symlinks. */
+#define RESOLVE_BENEATH		0x010 /* Block "lexical" trickery like "..", symlinks, absolute paths, etc. */
+#define RESOLVE_XDEV		0x020 /* Block mount-point crossings (includes bind-mounts). */
+#define RESOLVE_NO_MAGICLINKS	0x040 /* Block procfs-style "magic" symlinks. */
+#define RESOLVE_NO_SYMLINKS	0x080 /* Block all symlinks (implies AT_NO_MAGICLINKS). */
+#define RESOLVE_IN_ROOT		0x100 /* Scope ".." and "/" resolution to dirfd (like chroot(2)). */
+#endif /* __NR_resolveat */
+
+#ifndef O_EMPTYPATH
+#define O_EMPTYPATH 040000000
+#endif /* O_EMPTYPATH */
+
+#define E_func(func, ...)						\
+	do {								\
+		if (func(__VA_ARGS__) < 0)				\
+			ksft_exit_fail_msg("%s:%d %s failed\n", \
+					   __FILE__, __LINE__, #func);\
+	} while (0)
+
+#define E_mkdirat(...)   E_func(mkdirat,   __VA_ARGS__)
+#define E_symlinkat(...) E_func(symlinkat, __VA_ARGS__)
+#define E_touchat(...)   E_func(touchat,   __VA_ARGS__)
+#define E_readlink(...)  E_func(readlink,  __VA_ARGS__)
+#define E_fstatat(...)   E_func(fstatat,   __VA_ARGS__)
+#define E_asprintf(...)  E_func(asprintf,  __VA_ARGS__)
+#define E_fchdir(...)    E_func(fchdir,    __VA_ARGS__)
+#define E_mount(...)     E_func(mount,     __VA_ARGS__)
+#define E_unshare(...)   E_func(unshare,   __VA_ARGS__)
+#define E_setresuid(...) E_func(setresuid, __VA_ARGS__)
+#define E_chmod(...)     E_func(chmod,     __VA_ARGS__)
+
+#define E_assert(expr, msg, ...)					\
+	do {								\
+		if (!(expr))						\
+			ksft_exit_fail_msg("ASSERT(%s:%d) failed (%s): " msg "\n", \
+					   __FILE__, __LINE__, #expr, ##__VA_ARGS__); \
+	} while (0)
+
+typedef int (*openfunc_t)(int dfd, const char *path, unsigned int flags);
+
+static int sys_resolveat(int dfd, const char *path, unsigned int flags)
+{
+	int ret = syscall(__NR_resolveat, dfd, path, flags);
+	return ret >= 0 ? ret : -errno;
+}
+
+static int sys_openat(int dfd, const char *path, unsigned int flags)
+{
+	int ret = openat(dfd, path, flags);
+	return ret >= 0 ? ret : -errno;
+}
+
+static int sys_execveat(int dfd, const char *path,
+			char *const argv[], char *const envp[], int flags)
+{
+	int ret = syscall(SYS_execveat, dfd, path, argv, envp, flags);
+	return ret >= 0 ? ret : -errno;
+}
+
+static char *resolveat_flags(unsigned int flags)
+{
+	char *flagset, *p;
+
+	E_asprintf(&flagset, "%s%s%s%s%s%s%s%s0",
+		   (flags & RESOLVE_UPGRADE_NOWRITE)	? "RESOLVE_UPGRADE_NOWRITE|" : "",
+		   (flags & RESOLVE_UPGRADE_NOREAD)	? "RESOLVE_UPGRADE_NOREAD|" : "",
+		   (flags & RESOLVE_NO_FOLLOW)		? "RESOLVE_NO_FOLLOW|" : "",
+		   (flags & RESOLVE_BENEATH)		? "RESOLVE_BENEATH|" : "",
+		   (flags & RESOLVE_XDEV)		? "RESOLVE_XDEV|" : "",
+		   (flags & RESOLVE_NO_MAGICLINKS)	? "RESOLVE_NO_MAGICLINKS|" : "",
+		   (flags & RESOLVE_NO_SYMLINKS)	? "RESOLVE_NO_SYMLINKS|" : "",
+		   (flags & RESOLVE_IN_ROOT)		? "RESOLVE_IN_ROOT|" : "");
+
+	/* Fix up the trailing |0. */
+	p = strstr(flagset, "|0");
+	if (p)
+		*p = '\0';
+	return flagset;
+}
+
+static char *openat_flags(unsigned int flags)
+{
+	char *flagset;
+	const char *modeflag = "(none)";
+
+	/* Handle the peculiarity of the ACC_MODE flags. */
+	switch (flags & 0x03) {
+		case O_RDWR:
+			modeflag = "O_RDWR";
+			break;
+		case O_RDONLY:
+			modeflag = "O_RDONLY";
+			break;
+		case O_WRONLY:
+			modeflag = "O_WRONLY";
+			break;
+	}
+
+	/* TODO: Add more open flags. */
+	E_asprintf(&flagset, "%s", modeflag);
+	return flagset;
+}
+
+static int touchat(int dfd, const char *path)
+{
+	int fd = openat(dfd, path, O_CREAT);
+	if (fd >= 0)
+		close(fd);
+	return fd;
+}
+
+static char *fdreadlink(int fd)
+{
+	char *target, *tmp;
+
+	E_asprintf(&tmp, "/proc/self/fd/%d", fd);
+
+	target = malloc(PATH_MAX);
+	if (!target)
+		ksft_exit_fail_msg("fdreadlink: malloc failed\n");
+	memset(target, 0, PATH_MAX);
+
+	E_readlink(tmp, target, PATH_MAX);
+	free(tmp);
+	return target;
+}
+
+static bool fdequal(int fd, int dfd, const char *path)
+{
+	char *fdpath, *dfdpath, *other;
+	bool cmp;
+
+	fdpath = fdreadlink(fd);
+	dfdpath = fdreadlink(dfd);
+
+	if (!path)
+		E_asprintf(&other, "%s", dfdpath);
+	else if (*path == '/')
+		E_asprintf(&other, "%s", path);
+	else
+		E_asprintf(&other, "%s/%s", dfdpath, path);
+
+	cmp = !strcmp(fdpath, other);
+	if (!cmp)
+		ksft_print_msg("fdequal: expected '%s' but got '%s'\n", other, fdpath);
+
+	free(fdpath);
+	free(dfdpath);
+	free(other);
+	return cmp;
+}
+
+static void test_resolveat_supported(void)
+{
+	int fd = sys_resolveat(AT_FDCWD, ".", 0);
+	if (fd == -ENOSYS)
+		ksft_exit_skip("resolveat(2) unsupported on this kernel\n");
+	if (fd < 0)
+		ksft_exit_fail_msg("resolveat(2) supported check failed: %s\n", strerror(-fd));
+	close(fd);
+}
+
+#endif /* __RESOLVEAT_H__ */
diff --git a/tools/testing/selftests/resolveat/linkmode_test.c b/tools/testing/selftests/resolveat/linkmode_test.c
new file mode 100644
index 000000000000..b60375099494
--- /dev/null
+++ b/tools/testing/selftests/resolveat/linkmode_test.c
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <libgen.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <syscall.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+static mode_t fdmode(int fd)
+{
+	char *fdpath;
+	struct stat statbuf;
+	mode_t mode;
+
+	E_asprintf(&fdpath, "/proc/self/fd/%d", fd);
+	E_fstatat(AT_FDCWD, fdpath, &statbuf, AT_SYMLINK_NOFOLLOW);
+	mode = (statbuf.st_mode & ~S_IFMT);
+	free(fdpath);
+
+	return mode;
+}
+
+static int reopen_proc(int fd, unsigned int flags)
+{
+	int ret, saved_errno;
+	char *fdpath;
+
+	E_asprintf(&fdpath, "/proc/self/fd/%d", fd);
+	ret = open(fdpath, flags);
+	saved_errno = errno;
+	free(fdpath);
+
+	return ret >= 0 ? ret : -saved_errno;
+}
+
+static int reopen_oemptypath(int fd, unsigned int flags)
+{
+	int ret = openat(fd, "", O_EMPTYPATH | flags);
+	return ret >= 0 ? ret : -errno;
+}
+
+struct reopen_test {
+	openfunc_t open;
+	mode_t chmod_mode;
+	struct {
+		unsigned int flags;
+		mode_t mode;
+		int err;
+	} orig, new;
+};
+
+static bool reopen(int fd, struct reopen_test *test)
+{
+	int newfd;
+	mode_t proc_mode;
+	bool failed = false;
+
+	/* Check that the proc mode is correct. */
+	proc_mode = fdmode(fd);
+	if (proc_mode != test->orig.mode) {
+		ksft_print_msg("incorrect fdmode (got[%o] != want[%o])\n",
+			       proc_mode, test->orig.mode);
+		failed = true;
+	}
+
+	/* Re-open through /proc. */
+	newfd = reopen_proc(fd, test->new.flags);
+	if (newfd != test->new.err && (newfd < 0 || test->new.err < 0)) {
+		ksft_print_msg("/proc failure (%d != %d [%s])\n",
+			       newfd, test->new.err, strerror(-test->new.err));
+		failed = true;
+	}
+	if (newfd >= 0) {
+		proc_mode = fdmode(newfd);
+		if (proc_mode != test->new.mode) {
+			ksft_print_msg("/proc wrong fdmode (got[%o] != want[%o])\n",
+				       proc_mode, test->new.mode);
+			failed = true;
+		}
+		close(newfd);
+	}
+
+	/* Re-open with O_EMPTYPATH. */
+	newfd = reopen_oemptypath(fd, test->new.flags);
+	if (newfd != test->new.err && (newfd < 0 || test->new.err < 0)) {
+		ksft_print_msg("O_EMPTYPATH failure (%d != %d [%s])\n",
+			       newfd, test->new.err, strerror(-test->new.err));
+		failed = true;
+	}
+	if (newfd >= 0) {
+		proc_mode = fdmode(newfd);
+		if (proc_mode != test->new.mode) {
+			ksft_print_msg("O_EMPTYPATH wrong fdmode (got[%o] != want[%o])\n",
+				       proc_mode, test->new.mode);
+			failed = true;
+		}
+		close(newfd);
+	}
+
+	return failed;
+}
+
+void test_reopen_ordinary(bool privileged)
+{
+	int fd;
+	int err_access = privileged ? 0 : -EACCES;
+	char tmpfile[] = "/tmp/reopen-testfile.XXXXXX";
+
+	fd = mkstemp(tmpfile);
+	E_assert(fd >= 0, "mkstemp failed: %m\n");
+	close(fd);
+
+	struct reopen_test tests[] = {
+		/* Re-opening with the same mode should succeed. */
+		{ .open = sys_openat,	  .chmod_mode = 0400,
+		  .orig.flags = O_RDONLY, .orig.mode  = privileged ? 0777 : 0555,
+		  .new.flags  = O_RDONLY, .new.mode   = privileged ? 0777 : 0555},
+		{ .open = sys_openat,	  .chmod_mode = 0200,
+		  .orig.flags = O_WRONLY, .orig.mode  = privileged ? 0777 : 0333,
+		  .new.flags  = O_WRONLY, .new.mode   = privileged ? 0777 : 0333 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags =   O_RDWR, .orig.mode  = 0777,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags =   O_RDWR, .orig.mode  = 0777,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags =   O_RDWR, .orig.mode  = 0777,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777 },
+
+		/*
+		 * Upgrading the mode of a normal file works if the user had the
+		 * required access at original-open time.
+		 */
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags = O_RDONLY, .orig.mode  = 0777,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags = O_WRONLY, .orig.mode  = 0777,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags = O_RDONLY, .orig.mode  = 0777,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0600,
+		  .orig.flags = O_WRONLY, .orig.mode  = 0777,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777 },
+
+		/* However, re-open will be blocked given insufficient permissions. */
+		{ .open = sys_openat,	  .chmod_mode = 0400,
+		  .orig.flags = O_RDONLY, .orig.mode  = privileged ? 0777 : 0555,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777, .new.err = err_access },
+		{ .open = sys_openat,	  .chmod_mode = 0200,
+		  .orig.flags = O_WRONLY, .orig.mode  = privileged ? 0777 : 0333,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777, .new.err = err_access },
+		{ .open = sys_openat,	  .chmod_mode = 0400,
+		  .orig.flags = O_RDONLY, .orig.mode  = privileged ? 0777 : 0555,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777, .new.err = err_access },
+		{ .open = sys_openat,	  .chmod_mode = 0200,
+		  .orig.flags = O_WRONLY, .orig.mode  = privileged ? 0777 : 0333,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777, .new.err = err_access },
+
+		/* O_PATH re-opens (of ordinary files) will always work. */
+		{ .open = sys_openat,	  .chmod_mode = 0000,
+		  .orig.flags =   O_PATH, .orig.mode  = 0777,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags =        0, .orig.mode  = 0777,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0000,
+		  .orig.flags =   O_PATH, .orig.mode  = 0777,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags =        0, .orig.mode  = 0777,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777 },
+		{ .open = sys_openat,	  .chmod_mode = 0000,
+		  .orig.flags =   O_PATH, .orig.mode  = 0777,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags =        0, .orig.mode  = 0777,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777 },
+
+		/*
+		 * resolveat(2) RESOLVE_UPGRADE_NO* flags. In the privileged case, the
+		 * re-open will work but the mode will still be scoped to the mode
+		 * (or'd with the open acc_mode).
+		 */
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOREAD | RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0111,
+		  .new.flags  = O_RDONLY, .new.mode   = 0555, .new.err = err_access },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOREAD | RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0111,
+		  .new.flags  = O_WRONLY, .new.mode   = 0333, .new.err = err_access },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOREAD | RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0111,
+		  .new.flags  =   O_RDWR, .new.mode   = 0777, .new.err = err_access },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0555,
+		  .new.flags  = O_RDONLY, .new.mode   = 0555 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOREAD,
+		                          .orig.mode  = 0333,
+		  .new.flags  = O_WRONLY, .new.mode   = 0333 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0555,
+		  .new.flags  = O_RDONLY, .new.mode   = 0555 },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOREAD,
+		                          .orig.mode  = 0333,
+		  .new.flags  = O_RDONLY, .new.mode   = 0777, .new.err = err_access },
+		{ .open = sys_resolveat,  .chmod_mode = 0000,
+		  .orig.flags = RESOLVE_UPGRADE_NOWRITE,
+		                          .orig.mode  = 0555,
+		  .new.flags  = O_WRONLY, .new.mode   = 0777, .new.err = err_access },
+	};
+
+	for (int i = 0; i < ARRAY_LEN(tests); i++) {
+		int fd;
+		char *orig_flagset, *new_flagset;
+		struct reopen_test *test = &tests[i];
+		void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+
+		E_chmod(tmpfile, test->chmod_mode);
+
+		fd = test->open(AT_FDCWD, tmpfile, test->orig.flags);
+		E_assert(fd >= 0, "open '%s' failed: %m\n", tmpfile);
+
+		/* Make sure that any EACCES we see is not from inode permissions. */
+		E_chmod(tmpfile, 0777);
+
+		if (reopen(fd, test))
+			resultfn = ksft_test_result_fail;
+
+		close(fd);
+
+		new_flagset = openat_flags(test->new.flags);
+		if (test->open == sys_openat)
+			orig_flagset = openat_flags(test->orig.flags);
+		else if (test->open == sys_resolveat)
+			orig_flagset = resolveat_flags(test->orig.flags);
+		else
+			ksft_exit_fail_msg("unknown test->open\n");
+
+		resultfn("%sordinary reopen of (orig[%s]=%s, new=%s) chmod=%.3o %s\n",
+			 privileged ? "privileged " : "",
+			 test->open == sys_openat ? "openat" : "resolveat",
+			 orig_flagset, new_flagset, test->chmod_mode,
+			 test->new.err < 0 ? strerror(-test->new.err) : "works");
+
+		free(new_flagset);
+		free(orig_flagset);
+	}
+
+	unlink(tmpfile);
+}
+
+
+int main(int argc, char **argv)
+{
+	bool privileged;
+
+	ksft_print_header();
+	test_resolveat_supported();
+
+	/*
+	 * Technically we should be checking CAP_DAC_OVERRIDE, but it's easier to
+	 * just assume that euid=0 has the full capability set.
+	 */
+	privileged = (geteuid() == 0);
+	if (!privileged)
+		ksft_test_result_skip("privileged tests require euid == 0\n");
+	else {
+		test_reopen_ordinary(privileged);
+
+		E_setresuid(65534, 65534, 65534);
+		privileged = (geteuid() == 0);
+	}
+
+	test_reopen_ordinary(privileged);
+
+	if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/resolveat/resolveat_test.c b/tools/testing/selftests/resolveat/resolveat_test.c
new file mode 100644
index 000000000000..72f2e8c5dfe0
--- /dev/null
+++ b/tools/testing/selftests/resolveat/resolveat_test.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <libgen.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <syscall.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+/*
+ * Construct a test directory with the following structure:
+ *
+ * root/
+ * |-- procexe -> /proc/self/exe
+ * |-- procroot -> /proc/self/root
+ * |-- root/
+ * |-- mnt/ [mountpoint]
+ * |   |-- self -> ../mnt/
+ * |   `-- absself -> /mnt/
+ * |-- etc/
+ * |   `-- passwd
+ * |-- relsym -> etc/passwd
+ * |-- abssym -> /etc/passwd
+ * |-- abscheeky -> /cheeky
+ * |-- abscheeky -> /cheeky
+ * `-- cheeky/
+ *     |-- absself -> /
+ *     |-- self -> ../../root/
+ *     |-- garbageself -> /../../root/
+ *     |-- passwd -> ../cheeky/../cheeky/../etc/../etc/passwd
+ *     |-- abspasswd -> /../cheeky/../cheeky/../etc/../etc/passwd
+ *     |-- dotdotlink -> ../../../../../../../../../../../../../../etc/passwd
+ *     `-- garbagelink -> /../../../../../../../../../../../../../../etc/passwd
+ */
+int setup_testdir(void)
+{
+	int dfd, tmpfd;
+	char dirname[] = "/tmp/resolveat-testdir.XXXXXX";
+
+	/* Unshare and make /tmp a new directory. */
+	E_unshare(CLONE_NEWNS);
+	E_mount("", "/tmp", "", MS_PRIVATE, "");
+
+	/* Make the top-level directory. */
+	if (!mkdtemp(dirname))
+		ksft_exit_fail_msg("setup_testdir: failed to create tmpdir\n");
+	dfd = open(dirname, O_PATH | O_DIRECTORY);
+	if (dfd < 0)
+		ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n");
+
+	/* A sub-directory which is actually used for tests. */
+	E_mkdirat(dfd, "root", 0755);
+	tmpfd = openat(dfd, "root", O_PATH | O_DIRECTORY);
+	if (tmpfd < 0)
+		ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n");
+	close(dfd);
+	dfd = tmpfd;
+
+	E_symlinkat("/proc/self/exe", dfd, "procexe");
+	E_symlinkat("/proc/self/root", dfd, "procroot");
+	E_mkdirat(dfd, "root", 0755);
+
+	/* There is no mountat(2), so use chdir. */
+	E_mkdirat(dfd, "mnt", 0755);
+	E_fchdir(dfd);
+	E_mount("tmpfs", "./mnt", "tmpfs", MS_NOSUID | MS_NODEV, "");
+	E_symlinkat("../mnt/", dfd, "mnt/self");
+	E_symlinkat("/mnt/", dfd, "mnt/absself");
+
+	E_mkdirat(dfd, "etc", 0755);
+	E_touchat(dfd, "etc/passwd");
+
+	E_symlinkat("etc/passwd", dfd, "relsym");
+	E_symlinkat("/etc/passwd", dfd, "abssym");
+	E_symlinkat("/cheeky", dfd, "abscheeky");
+
+	E_mkdirat(dfd, "cheeky", 0755);
+
+	E_symlinkat("/", dfd, "cheeky/absself");
+	E_symlinkat("../../root/", dfd, "cheeky/self");
+	E_symlinkat("/../../root/", dfd, "cheeky/garbageself");
+
+	E_symlinkat("../cheeky/../etc/../etc/passwd", dfd, "cheeky/passwd");
+	E_symlinkat("/../cheeky/../etc/../etc/passwd", dfd, "cheeky/abspasswd");
+
+	E_symlinkat("../../../../../../../../../../../../../../etc/passwd",
+		    dfd, "cheeky/dotdotlink");
+	E_symlinkat("/../../../../../../../../../../../../../../etc/passwd",
+		    dfd, "cheeky/garbagelink");
+
+	return dfd;
+}
+
+struct basic_test {
+	const char *dir;
+	const char *path;
+	unsigned int flags;
+	bool pass;
+	union {
+		int err;
+		const char *path;
+	} out;
+};
+
+void test_resolveat_basic_tests(void)
+{
+	int rootfd;
+	char *procselfexe;
+
+	E_asprintf(&procselfexe, "/proc/%d/exe", getpid());
+	rootfd = setup_testdir();
+
+	struct basic_test tests[] = {
+		/** RESOLVE_BENEATH **/
+		/* Attempts to cross dirfd should be blocked. */
+		{ .path = "/",			.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "cheeky/absself",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/absself",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "..",			.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "../root/",		.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "cheeky/self",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/self",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "cheeky/garbageself",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/garbageself", .flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		/* Only relative paths that stay inside dirfd should work. */
+		{ .path = "root",		.flags = RESOLVE_BENEATH,
+		  .out.path = "root",		.pass = true },
+		{ .path = "etc",		.flags = RESOLVE_BENEATH,
+		  .out.path = "etc",		.pass = true },
+		{ .path = "etc/passwd",		.flags = RESOLVE_BENEATH,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "relsym",		.flags = RESOLVE_BENEATH,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "cheeky/passwd",	.flags = RESOLVE_BENEATH,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abscheeky/passwd",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abssym",		.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "/etc/passwd",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "cheeky/abspasswd",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/abspasswd", .flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		/* Tricky paths should fail. */
+		{ .path = "cheeky/dotdotlink",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/dotdotlink", .flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "cheeky/garbagelink",	.flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "abscheeky/garbagelink", .flags = RESOLVE_BENEATH,
+		  .out.err = -EXDEV,		.pass = false },
+
+		/** RESOLVE_IN_ROOT **/
+		/* All attempts to cross the dirfd will be scoped-to-root. */
+		{ .path = "/",			.flags = RESOLVE_IN_ROOT,
+		  .out.path = NULL,		.pass = true },
+		{ .path = "cheeky/absself",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = NULL,		.pass = true },
+		{ .path = "abscheeky/absself",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = NULL,		.pass = true },
+		{ .path = "..",			.flags = RESOLVE_IN_ROOT,
+		  .out.path = NULL,		.pass = true },
+		{ .path = "../root/",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "../root/",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "cheeky/self",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "cheeky/garbageself",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "abscheeky/garbageself", .flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "root",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "root",		.pass = true },
+		{ .path = "etc",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc",		.pass = true },
+		{ .path = "etc/passwd",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "relsym",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "cheeky/passwd",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abscheeky/passwd",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abssym",		.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "/etc/passwd",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "cheeky/abspasswd",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abscheeky/abspasswd",.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "cheeky/dotdotlink",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abscheeky/dotdotlink", .flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "/../../../../abscheeky/dotdotlink", .flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "cheeky/garbagelink",	.flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "abscheeky/garbagelink", .flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+		{ .path = "/../../../../abscheeky/garbagelink", .flags = RESOLVE_IN_ROOT,
+		  .out.path = "etc/passwd",	.pass = true },
+
+		/** RESOLVE_XDEV **/
+		/* Crossing *down* into a mountpoint is disallowed. */
+		{ .path = "mnt",		.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "mnt/",		.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "mnt/.",		.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		/* Crossing *up* out of a mountpoint is disallowed. */
+		{ .dir = "mnt", .path = ".",	.flags = RESOLVE_XDEV,
+		  .out.path = "mnt",		.pass = true },
+		{ .dir = "mnt", .path = "..",	.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .dir = "mnt", .path = "../mnt", .flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .dir = "mnt", .path = "self",	.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .dir = "mnt", .path = "absself", .flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		/* Jumping to "/" is ok, but later components cannot cross. */
+		{ .dir = "mnt", .path = "/",	.flags = RESOLVE_XDEV,
+		  .out.path = "/",		.pass = true },
+		{ .dir = "/", .path = "/",	.flags = RESOLVE_XDEV,
+		  .out.path = "/",		.pass = true },
+		{ .path = "/proc/1",		.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+		{ .path = "/tmp",		.flags = RESOLVE_XDEV,
+		  .out.err = -EXDEV,		.pass = false },
+
+		/** RESOLVE_NO_MAGICLINKS **/
+		/* Regular symlinks should work. */
+		{ .path = "relsym",		.flags = RESOLVE_NO_MAGICLINKS,
+		  .out.path = "etc/passwd",	.pass = true },
+		/* Magic-links should not work. */
+		{ .path = "procexe",		.flags = RESOLVE_NO_MAGICLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "/proc/self/exe",	.flags = RESOLVE_NO_MAGICLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "procroot/etc",	.flags = RESOLVE_NO_MAGICLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "/proc/self/root/etc", .flags = RESOLVE_NO_MAGICLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "/proc/self/root/etc", .flags = RESOLVE_NO_MAGICLINKS | RESOLVE_NO_FOLLOW,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "/proc/self/exe",	.flags = RESOLVE_NO_MAGICLINKS | RESOLVE_NO_FOLLOW,
+		  .out.path = procselfexe,	.pass = true },
+
+		/** RESOLVE_NO_SYMLINKS **/
+		/* Normal paths should work. */
+		{ .path = ".",			.flags = RESOLVE_NO_SYMLINKS,
+		  .out.path = NULL,		.pass = true },
+		{ .path = "root",		.flags = RESOLVE_NO_SYMLINKS,
+		  .out.path = "root",		.pass = true },
+		{ .path = "etc",		.flags = RESOLVE_NO_SYMLINKS,
+		  .out.path = "etc",		.pass = true },
+		{ .path = "etc/passwd",		.flags = RESOLVE_NO_SYMLINKS,
+		  .out.path = "etc/passwd",	.pass = true },
+		/* Regular symlinks are blocked. */
+		{ .path = "relsym",		.flags = RESOLVE_NO_SYMLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "abssym",		.flags = RESOLVE_NO_SYMLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "cheeky/garbagelink",	.flags = RESOLVE_NO_SYMLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "abscheeky/garbagelink", .flags = RESOLVE_NO_SYMLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "abscheeky/absself",	.flags = RESOLVE_NO_SYMLINKS,
+		  .out.err = -ELOOP,		.pass = false },
+		/* Trailing symlinks with NO_FOLLOW. */
+		{ .path = "relsym",		.flags = RESOLVE_NO_SYMLINKS | RESOLVE_NO_FOLLOW,
+		  .out.path = "relsym",		.pass = true },
+		{ .path = "abssym",		.flags = RESOLVE_NO_SYMLINKS | RESOLVE_NO_FOLLOW,
+		  .out.path = "abssym",		.pass = true },
+		{ .path = "cheeky/garbagelink",	.flags = RESOLVE_NO_SYMLINKS | RESOLVE_NO_FOLLOW,
+		  .out.path = "cheeky/garbagelink", .pass = true },
+		{ .path = "abscheeky/garbagelink", .flags = RESOLVE_NO_SYMLINKS | RESOLVE_NO_FOLLOW,
+		  .out.err = -ELOOP,		.pass = false },
+		{ .path = "abscheeky/absself",	.flags = RESOLVE_NO_SYMLINKS | RESOLVE_NO_FOLLOW,
+		  .out.err = -ELOOP,		.pass = false },
+	};
+
+	for (int i = 0; i < ARRAY_LEN(tests); i++) {
+		int dfd, fd;
+		bool failed;
+		void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+
+		struct basic_test *test = &tests[i];
+		char *flagstr = resolveat_flags(test->flags);
+
+		if (test->dir)
+			dfd = openat(rootfd, test->dir, O_PATH | O_DIRECTORY);
+		else
+			dfd = dup(rootfd);
+		if (dfd < 0) {
+			resultfn = ksft_test_result_error;
+			goto next;
+		}
+
+		fd = sys_resolveat(dfd, test->path, test->flags);
+		if (test->pass)
+			failed = (fd < 0 || !fdequal(fd, rootfd, test->out.path));
+		else
+			failed = (fd != test->out.err);
+		if (fd >= 0)
+			close(fd);
+		close(dfd);
+
+		if (failed)
+			resultfn = ksft_test_result_fail;
+
+next:
+		if (test->pass)
+			resultfn("resolveat(root[%s], %s, %s) ==> %s\n",
+				 test->dir ?: ".", test->path, flagstr,
+				 test->out.path ?: ".");
+		else
+			resultfn("resolveat(root[%s], %s, %s) ==> %d (%s)\n",
+				 test->dir ?: ".", test->path, flagstr,
+				 test->out.err, strerror(-test->out.err));
+		free(flagstr);
+	}
+
+	free(procselfexe);
+	close(rootfd);
+}
+
+
+static int proc_exec(int fd)
+{
+	int err, saved_errno;
+	char *procpath;
+	char *argv[] = {"foo", NULL};
+	char *envp[] = {"bar", NULL};
+
+	E_asprintf(&procpath, "/proc/self/fd/%d", fd);
+	err = execve(procpath, argv, envp);
+	saved_errno = errno;
+	free(procpath);
+
+	return err >= 0 ? err : -saved_errno;
+}
+
+static int fd_exec(int fd)
+{
+	char *argv[] = {"foo", NULL};
+	char *envp[] = {"bar", NULL};
+
+	return sys_execveat(fd, "", argv, envp, AT_EMPTY_PATH);
+}
+
+int main(int argc, char **argv)
+{
+	ksft_print_header();
+	test_resolveat_supported();
+
+	/* NOTE: We should be checking for CAP_SYS_ADMIN here... */
+	if (geteuid() != 0)
+		ksft_exit_skip("resolveat(2) tests require euid == 0\n");
+
+	test_resolveat_basic_tests();
+
+	if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+}
-- 
2.21.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox