Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH v6 4/6] namei: aggressively check for nd->root escape on ".." resolution
From: Aleksa Sarai @ 2019-05-06 16:54 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells
  Cc: Aleksa Sarai, Jann Horn, Kees Cook, Eric Biederman,
	Andy Lutomirski, Andrew Morton, Alexei Starovoitov,
	Christian Brauner, Tycho Andersen, David Drysdale, Chanho Min,
	Oleg Nesterov, Aleksa Sarai, Linus Torvalds, containers,
	linux-fsdevel, linux-api, linux-kernel, linux-arch
In-Reply-To: <20190506165439.9155-1-cyphar@cyphar.com>

This patch allows for O_BENEATH and O_THISROOT to safely permit ".."
resolution (in the case of O_BENEATH the resolution will still fail if
".." resolution would resolve a path outside of the root -- while
O_THISROOT will chroot(2)-style scope it). "magic link" jumps are still
disallowed entirely because now they could result in inconsistent
behaviour if resolution encounters a subsequent "..".

The need for this patch is explained by observing there is a fairly
easy-to-exploit race condition with chroot(2) (and thus by extension
O_THISROOT and O_BENEATH) where a rename(2) of a path can be used to
"skip over" nd->root and thus escape to the filesystem above nd->root.

  thread1 [attacker]:
    for (;;)
      renameat2(AT_FDCWD, "/a/b/c", AT_FDCWD, "/a/d", RENAME_EXCHANGE);
  thread2 [victim]:
    for (;;)
      openat(dirb, "b/c/../../etc/shadow", O_THISROOT);

With fairly significant regularity, thread2 will resolve to
"/etc/shadow" rather than "/a/b/etc/shadow". There is also a similar
(though somewhat more privileged) attack using MS_MOVE.

With this patch, such cases will be detected *during* ".." resolution
(which is the weak point of chroot(2) -- since walking *into* a
subdirectory tautologically cannot result in you walking *outside*
nd->root -- except through a bind-mount or "magic link"). By detecting
this at ".." resolution (rather than checking only at the end of the
entire resolution) we can both correct escapes by jumping back to the
root (in the case of O_THISROOT), as well as avoid revealing to
attackers the structure of the filesystem outside of the root (through
timing attacks for instance).

In order to avoid a quadratic lookup with each ".." entry, we only
activate the slow path if a write through &rename_lock or &mount_lock
has occurred during path resolution (&rename_lock and &mount_lock are
re-taken to further optimise the lookup). Since the primary attack being
protected against is MS_MOVE or rename(2), not doing additional checks
unless a mount or rename has occurred avoids making the common case
slow.

The use of path_is_under() here might seem suspect, but on further
inspection of the most important race (a path was *inside* the root but
is now *outside*), there appears to be no attack potential. If
path_is_under() occurs before the rename, then the path will be resolved
but since the path was originally inside the root there is no escape.
Subsequent ".." jumps are guaranteed to check path_is_under() (by
construction, &rename_lock or &mount_lock must have been taken by the
attacker after path_is_under() returned in the victim), and thus will
not be able to escape from the previously-inside-root path. Walking down
is still safe since the entire subtree was moved (either by rename(2) or
MS_MOVE) and because (as discussed above) walking down is safe.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jann Horn <jannh@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 3a3cba593b85..2b6a1bf4e745 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -491,7 +491,7 @@ struct nameidata {
 	struct path	root;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
-	unsigned	seq, m_seq;
+	unsigned	seq, m_seq, r_seq;
 	int		last_type;
 	unsigned	depth;
 	int		total_link_count;
@@ -1739,19 +1739,35 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
-		/*
-		 * LOOKUP_BENEATH resolving ".." is not currently safe -- races can
-		 * cause our parent to have moved outside of the root and us to skip
-		 * over it.
-		 */
-		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
-			return -EXDEV;
+		int error = 0;
+
 		if (!nd->root.mnt)
 			set_root(nd);
-		if (nd->flags & LOOKUP_RCU) {
-			return follow_dotdot_rcu(nd);
-		} else
-			return follow_dotdot(nd);
+		if (nd->flags & LOOKUP_RCU)
+			error = follow_dotdot_rcu(nd);
+		else
+			error = follow_dotdot(nd);
+		if (error)
+			return error;
+
+		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
+			bool m_retry = read_seqretry(&mount_lock, nd->m_seq);
+			bool r_retry = read_seqretry(&rename_lock, nd->r_seq);
+
+			/*
+			 * Don't bother checking unless there's a racing
+			 * rename(2) or MS_MOVE.
+			 */
+			if (likely(!m_retry && !r_retry))
+				return 0;
+
+			if (m_retry && !(nd->flags & LOOKUP_RCU))
+				nd->m_seq = read_seqbegin(&mount_lock);
+			if (r_retry)
+				nd->r_seq = read_seqbegin(&rename_lock);
+			if (!path_is_under(&nd->path, &nd->root))
+				return -EXDEV;
+		}
 	}
 	return 0;
 }
@@ -2272,6 +2288,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
 	nd->depth = 0;
+
+	nd->m_seq = read_seqbegin(&mount_lock);
+	if (unlikely(flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
+		nd->r_seq = read_seqbegin(&rename_lock);
+
 	if (flags & LOOKUP_ROOT) {
 		struct dentry *root = nd->root.dentry;
 		struct inode *inode = root->d_inode;
@@ -2282,7 +2303,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		if (flags & LOOKUP_RCU) {
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			nd->root_seq = nd->seq;
-			nd->m_seq = read_seqbegin(&mount_lock);
 		} else {
 			path_get(&nd->path);
 		}
@@ -2293,8 +2313,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->path.mnt = NULL;
 	nd->path.dentry = NULL;
 
-	nd->m_seq = read_seqbegin(&mount_lock);
-
 	if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
 		error = dirfd_path_init(nd);
 		if (unlikely(error))
-- 
2.21.0

^ permalink raw reply related

* [PATCH v6 3/6] namei: LOOKUP_IN_ROOT: chroot-like path resolution
From: Aleksa Sarai @ 2019-05-06 16:54 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells
  Cc: Aleksa Sarai, Eric Biederman, Christian Brauner, Kees Cook,
	Andy Lutomirski, Andrew Morton, Alexei Starovoitov, Jann Horn,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-fsdevel,
	linux-api, linux-kernel, linux-arch
In-Reply-To: <20190506165439.9155-1-cyphar@cyphar.com>

The primary motivation for the need for this flag is container runtimes
which have to interact with malicious root filesystems in the host
namespaces. One of the first requirements for a container runtime to be
secure against a malicious rootfs is that they correctly scope symlinks
(that is, they should be scoped as though they are chroot(2)ed into the
container's rootfs) and ".."-style paths[*]. The already-existing O_XDEV
and O_NOMAGICLINKS[**] help defend against other potential attacks in a
malicious rootfs scenario.

Currently most container runtimes try to do this resolution in
userspace[1], causing many potential race conditions. In addition, the
"obvious" alternative (actually performing a {ch,pivot_}root(2))
requires a fork+exec (for some runtimes) which is *very* costly if
necessary for every filesystem operation involving a container.

[*] At the moment, ".." and "magic link" jumping are disallowed for the
    same reason it is disabled for LOOKUP_BENEATH -- currently it is not
    safe to allow it. Future patches may enable it unconditionally once
    we have resolved the possible races (for "..") and semantics (for
    "magic link" jumping).

The most significant openat(2) semantic change with LOOKUP_IN_ROOT is
that absolute pathnames no longer cause dirfd to be ignored completely.
The rationale is that LOOKUP_IN_ROOT must necessarily chroot-scope
symlinks with absolute paths to dirfd, and so doing it for the base path
seems to be the most consistent behaviour (and also avoids foot-gunning
users who want to scope paths that are absolute).

[1]: https://github.com/cyphar/filepath-securejoin

Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c            | 6 +++---
 include/linux/namei.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index e13a02720a9d..3a3cba593b85 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1095,7 +1095,7 @@ const char *get_link(struct nameidata *nd)
 			if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
 				return ERR_PTR(-ELOOP);
 			/* Not currently safe. */
-			if (unlikely(nd->flags & LOOKUP_BENEATH))
+			if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
 				return ERR_PTR(-EXDEV);
 		}
 		if (IS_ERR_OR_NULL(res))
@@ -1744,7 +1744,7 @@ static inline int handle_dots(struct nameidata *nd, int type)
 		 * cause our parent to have moved outside of the root and us to skip
 		 * over it.
 		 */
-		if (unlikely(nd->flags & LOOKUP_BENEATH))
+		if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT)))
 			return -EXDEV;
 		if (!nd->root.mnt)
 			set_root(nd);
@@ -2295,7 +2295,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 
-	if (unlikely(nd->flags & LOOKUP_BENEATH)) {
+	if (unlikely(nd->flags & (LOOKUP_BENEATH | LOOKUP_IN_ROOT))) {
 		error = dirfd_path_init(nd);
 		if (unlikely(error))
 			return ERR_PTR(error);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 7bc819ad0cd3..4b1ee717cb14 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -56,6 +56,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_NO_MAGICLINKS	0x040000 /* No /proc/$pid/fd/ "symlink" crossing. */
 #define LOOKUP_NO_SYMLINKS	0x080000 /* No symlink crossing *at all*.
 					    Implies LOOKUP_NO_MAGICLINKS. */
+#define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as %current->fs->root. */
 
 extern int path_pts(struct path *path);
 
-- 
2.21.0

^ permalink raw reply related

* [PATCH v6 2/6] namei: O_BENEATH-style path resolution flags
From: Aleksa Sarai @ 2019-05-06 16:54 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells
  Cc: Aleksa Sarai, Eric Biederman, Christian Brauner, Kees Cook,
	David Drysdale, Andy Lutomirski, Linus Torvalds, Andrew Morton,
	Alexei Starovoitov, Jann Horn, Tycho Andersen, Chanho Min,
	Oleg Nesterov, Aleksa Sarai, containers, linux-fsdevel, linux-api,
	linux-kernel, linux-arch
In-Reply-To: <20190506165439.9155-1-cyphar@cyphar.com>

Add the following flags to allow various restrictions on path
resolution (these affect the *entire* resolution, rather than just the
final path component -- as is the case with most other AT_* flags).

The primary justification for these flags is to allow for programs to be
far more strict about how they want path resolution to handle symlinks,
mountpoint crossings, and paths that escape the dirfd (through an
absolute path or ".." shenanigans).

This is of particular concern to container runtimes that want to be very
careful about malicious root filesystems that a container's init might
have screwed around with (and there is no real way to protect against
this in userspace if you consider potential races against a malicious
container's init). More classical applications (which have their own
potentially buggy userspace path sanitisation code) include web
servers, archive extraction tools, network file servers, and so on.

These flags are exposed to userspace in a later patchset.

* LOOKUP_XDEV: Disallow mount-point crossing (both *down* into one, or
  *up* from one). The primary "scoping" use is to block resolution that
  crosses a bind-mount, which has a similar property to a symlink (in
  the way that it allows for escape from the starting-point). Since it
  is not possible to differentiate bind-mounts from other mounts, all
  mount-point crossings are blocked. However, since bind-mounting
  requires privileges (in ways symlinks don't), this has been split from
  LOOKUP_BENEATH. The naming is based on "find -xdev" as well as -EXDEV
  (though find(1) doesn't walk upwards, the semantics seem obvious).

* LOOKUP_NO_MAGICLINKS: Disallows ->get_link "symlink" jumping. This is
  a very specific restriction, and it exists because /proc/$pid/fd/...
  "symlinks" allow for access outside nd->root and pose risk to
  container runtimes that don't want to be tricked into accessing a host
  path (but do want to allow no-funny-business symlink resolution).

* LOOKUP_NO_SYMLINKS: Disallows symlink jumping *of any kind*. Implies
  LOOKUP_NO_MAGICLINKS (obviously).

* LOOKUP_BENEATH: Disallow "escapes" from the starting point of the
  filesystem tree during resolution (you must stay "beneath" the
  starting point at all times). Currently this is done by disallowing
  ".." and absolute paths (either in the given path or found during
  symlink resolution) entirely, as well as all "magic link" jumping.

  The wholesale banning of ".." is because it is currently not safe to
  allow ".." resolution (races can cause the path to be moved outside of
  the root -- this is conceptually similar to historical chroot(2)
  escape attacks). Future patches in this series will address this, and
  will re-enable ".." resolution once it is safe. With those patches,
  ".." resolution will only be allowed if it remains in the root
  throughout resolution (such as "a/../b" not "a/../../outside/b").

  The banning of "magic link" jumping is done because it is not clear
  whether semantically they should be allowed -- while some "magic
  links" are safe there are many that can cause escapes (and once a
  resolution is outside of the root, O_BENEATH will no longer detect
  it). Future patches may re-enable "magic link" jumping when such jumps
  would remain inside the root.

The LOOKUP_NO_*LINKS flags return -ELOOP if path resolution would
violate their requirements, while the others all return -EXDEV.

This is a refresh of Al's AT_NO_JUMPS patchset[1] (which was a variation
on David Drysdale's O_BENEATH patchset[2], which in turn was based on
the Capsicum project[3]). Input from Linus and Andy in the AT_NO_JUMPS
thread[4] determined most of the API changes made in this refresh.

[1]: https://lwn.net/Articles/721443/
[2]: https://lwn.net/Articles/619151/
[3]: https://lwn.net/Articles/603929/
[4]: https://lwn.net/Articles/723057/

Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: Kees Cook <keescook@chromium.org>
Suggested-by: David Drysdale <drysdale@google.com>
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c            | 76 ++++++++++++++++++++++++++++++++++++-------
 include/linux/namei.h |  7 ++++
 2 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 2a91b72aa5e9..e13a02720a9d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -843,6 +843,12 @@ static inline void path_to_nameidata(const struct path *path,
 
 static int nd_jump_root(struct nameidata *nd)
 {
+	if (unlikely(nd->flags & LOOKUP_BENEATH))
+		return -EXDEV;
+	if (unlikely(nd->flags & LOOKUP_XDEV)) {
+		if (nd->path.mnt != nd->root.mnt)
+			return -EXDEV;
+	}
 	if (nd->flags & LOOKUP_RCU) {
 		struct dentry *d;
 		nd->path = nd->root;
@@ -1051,6 +1057,9 @@ const char *get_link(struct nameidata *nd)
 	int error;
 	const char *res;
 
+	if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS))
+		return ERR_PTR(-ELOOP);
+
 	if (!(nd->flags & LOOKUP_RCU)) {
 		touch_atime(&last->link);
 		cond_resched();
@@ -1081,14 +1090,23 @@ const char *get_link(struct nameidata *nd)
 		} else {
 			res = get(dentry, inode, &last->done);
 		}
+		/* If we just jumped it was because of a procfs-style link. */
+		if (unlikely(nd->flags & LOOKUP_JUMPED)) {
+			if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
+				return ERR_PTR(-ELOOP);
+			/* Not currently safe. */
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return ERR_PTR(-EXDEV);
+		}
 		if (IS_ERR_OR_NULL(res))
 			return res;
 	}
 	if (*res == '/') {
 		if (!nd->root.mnt)
 			set_root(nd);
-		if (unlikely(nd_jump_root(nd)))
-			return ERR_PTR(-ECHILD);
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
 		while (unlikely(*++res == '/'))
 			;
 	}
@@ -1269,12 +1287,16 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 		break;
 	}
 
-	if (need_mntput && path->mnt == mnt)
-		mntput(path->mnt);
+	if (need_mntput) {
+		if (path->mnt == mnt)
+			mntput(path->mnt);
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			ret = -EXDEV;
+		else
+			nd->flags |= LOOKUP_JUMPED;
+	}
 	if (ret == -EISDIR || !ret)
 		ret = 1;
-	if (need_mntput)
-		nd->flags |= LOOKUP_JUMPED;
 	if (unlikely(ret < 0))
 		path_put_conditional(path, nd);
 	return ret;
@@ -1331,6 +1353,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		mounted = __lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return false;
 		path->mnt = &mounted->mnt;
 		path->dentry = mounted->mnt.mnt_root;
 		nd->flags |= LOOKUP_JUMPED;
@@ -1351,8 +1375,11 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	struct inode *inode = nd->inode;
 
 	while (1) {
-		if (path_equal(&nd->path, &nd->root))
+		if (path_equal(&nd->path, &nd->root)) {
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return -EXDEV;
 			break;
+		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			struct dentry *old = nd->path.dentry;
 			struct dentry *parent = old->d_parent;
@@ -1377,6 +1404,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 				return -ECHILD;
 			if (&mparent->mnt == nd->path.mnt)
 				break;
+			if (unlikely(nd->flags & LOOKUP_XDEV))
+				return -EXDEV;
 			/* we know that mountpoint was pinned */
 			nd->path.dentry = mountpoint;
 			nd->path.mnt = &mparent->mnt;
@@ -1391,6 +1420,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			return -ECHILD;
 		if (!mounted)
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return -EXDEV;
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
 		inode = nd->path.dentry->d_inode;
@@ -1479,8 +1510,11 @@ static int path_parent_directory(struct path *path)
 static int follow_dotdot(struct nameidata *nd)
 {
 	while(1) {
-		if (path_equal(&nd->path, &nd->root))
+		if (path_equal(&nd->path, &nd->root)) {
+			if (unlikely(nd->flags & LOOKUP_BENEATH))
+				return -EXDEV;
 			break;
+		}
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			int ret = path_parent_directory(&nd->path);
 			if (ret)
@@ -1489,6 +1523,8 @@ static int follow_dotdot(struct nameidata *nd)
 		}
 		if (!follow_up(&nd->path))
 			break;
+		if (unlikely(nd->flags & LOOKUP_XDEV))
+			return -EXDEV;
 	}
 	follow_mount(&nd->path);
 	nd->inode = nd->path.dentry->d_inode;
@@ -1703,6 +1739,13 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
+		/*
+		 * LOOKUP_BENEATH resolving ".." is not currently safe -- races can
+		 * cause our parent to have moved outside of the root and us to skip
+		 * over it.
+		 */
+		if (unlikely(nd->flags & LOOKUP_BENEATH))
+			return -EXDEV;
 		if (!nd->root.mnt)
 			set_root(nd);
 		if (nd->flags & LOOKUP_RCU) {
@@ -2251,6 +2294,15 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	nd->path.dentry = NULL;
 
 	nd->m_seq = read_seqbegin(&mount_lock);
+
+	if (unlikely(nd->flags & LOOKUP_BENEATH)) {
+		error = dirfd_path_init(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+		nd->root = nd->path;
+		if (!(nd->flags & LOOKUP_RCU))
+			path_get(&nd->root);
+	}
 	if (*s == '/') {
 		if (likely(!nd->root.mnt))
 			set_root(nd);
@@ -2259,9 +2311,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			s = ERR_PTR(error);
 		return s;
 	}
-	error = dirfd_path_init(nd);
-	if (unlikely(error))
-		return ERR_PTR(error);
+	if (likely(!nd->path.mnt)) {
+		error = dirfd_path_init(nd);
+		if (unlikely(error))
+			return ERR_PTR(error);
+	}
 	return s;
 }
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9138b4471dbf..7bc819ad0cd3 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -50,6 +50,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_EMPTY		0x4000
 #define LOOKUP_DOWN		0x8000
 
+/* Scoping flags for lookup. */
+#define LOOKUP_BENEATH		0x010000 /* No escaping from starting point. */
+#define LOOKUP_XDEV		0x020000 /* No mountpoint crossing. */
+#define LOOKUP_NO_MAGICLINKS	0x040000 /* No /proc/$pid/fd/ "symlink" crossing. */
+#define LOOKUP_NO_SYMLINKS	0x080000 /* No symlink crossing *at all*.
+					    Implies LOOKUP_NO_MAGICLINKS. */
+
 extern int path_pts(struct path *path);
 
 extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty);
-- 
2.21.0

^ permalink raw reply related

* [PATCH v6 1/6] namei: split out nd->dfd handling to dirfd_path_init
From: Aleksa Sarai @ 2019-05-06 16:54 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Andrew Morton,
	Alexei Starovoitov, Kees Cook, Jann Horn, Christian Brauner,
	Tycho Andersen, David Drysdale, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, Linus Torvalds, containers, linux-fsdevel,
	linux-api, linux-kernel, linux-arch
In-Reply-To: <20190506165439.9155-1-cyphar@cyphar.com>

Previously, path_init's handling of *at(dfd, ...) was only done once,
but with O_BENEATH (and O_THISROOT) we have to parse the initial
nd->path at different times (before or after absolute path handling)
depending on whether we have been asked to scope resolution within a
root.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 fs/namei.c | 103 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 44 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 5ebd64b21970..2a91b72aa5e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2166,9 +2166,59 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	}
 }
 
+/*
+ * Configure nd->path based on the nd->dfd. This is only used as part of
+ * path_init().
+ */
+static inline int dirfd_path_init(struct nameidata *nd)
+{
+	if (nd->dfd == AT_FDCWD) {
+		if (nd->flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
+
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->inode = nd->path.dentry->d_inode;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+			nd->inode = nd->path.dentry->d_inode;
+		}
+	} else {
+		/* Caller must check execute permissions on the starting path component */
+		struct fd f = fdget_raw(nd->dfd);
+		struct dentry *dentry;
+
+		if (!f.file)
+			return -EBADF;
+
+		dentry = f.file->f_path.dentry;
+
+		if (*nd->name->name && unlikely(!d_can_lookup(dentry))) {
+			fdput(f);
+			return -ENOTDIR;
+		}
+
+		nd->path = f.file->f_path;
+		if (nd->flags & LOOKUP_RCU) {
+			nd->inode = nd->path.dentry->d_inode;
+			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+		} else {
+			path_get(&nd->path);
+			nd->inode = nd->path.dentry->d_inode;
+		}
+		fdput(f);
+	}
+	return 0;
+}
+
 /* must be paired with terminate_walk() */
 static const char *path_init(struct nameidata *nd, unsigned flags)
 {
+	int error;
 	const char *s = nd->name->name;
 
 	if (!*s)
@@ -2202,52 +2252,17 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*s == '/') {
-		set_root(nd);
-		if (likely(!nd_jump_root(nd)))
-			return s;
-		return ERR_PTR(-ECHILD);
-	} else if (nd->dfd == AT_FDCWD) {
-		if (flags & LOOKUP_RCU) {
-			struct fs_struct *fs = current->fs;
-			unsigned seq;
-
-			do {
-				seq = read_seqcount_begin(&fs->seq);
-				nd->path = fs->pwd;
-				nd->inode = nd->path.dentry->d_inode;
-				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			} while (read_seqcount_retry(&fs->seq, seq));
-		} else {
-			get_fs_pwd(current->fs, &nd->path);
-			nd->inode = nd->path.dentry->d_inode;
-		}
-		return s;
-	} else {
-		/* Caller must check execute permissions on the starting path component */
-		struct fd f = fdget_raw(nd->dfd);
-		struct dentry *dentry;
-
-		if (!f.file)
-			return ERR_PTR(-EBADF);
-
-		dentry = f.file->f_path.dentry;
-
-		if (*s && unlikely(!d_can_lookup(dentry))) {
-			fdput(f);
-			return ERR_PTR(-ENOTDIR);
-		}
-
-		nd->path = f.file->f_path;
-		if (flags & LOOKUP_RCU) {
-			nd->inode = nd->path.dentry->d_inode;
-			nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		} else {
-			path_get(&nd->path);
-			nd->inode = nd->path.dentry->d_inode;
-		}
-		fdput(f);
+		if (likely(!nd->root.mnt))
+			set_root(nd);
+		error = nd_jump_root(nd);
+		if (unlikely(error))
+			s = ERR_PTR(error);
 		return s;
 	}
+	error = dirfd_path_init(nd);
+	if (unlikely(error))
+		return ERR_PTR(error);
+	return s;
 }
 
 static const char *trailing_symlink(struct nameidata *nd)
-- 
2.21.0

^ permalink raw reply related

* [PATCH v6 0/6] namei: resolveat(2) path resolution restriction API
From: Aleksa Sarai @ 2019-05-06 16:54 UTC (permalink / raw)
  To: Al Viro, Jeff Layton, J. Bruce Fields, Arnd Bergmann,
	David Howells
  Cc: Aleksa Sarai, Eric Biederman, Andy Lutomirski, Jann Horn,
	Christian Brauner, David Drysdale, Tycho Andersen, Kees Cook,
	Linus Torvalds, containers, linux-fsdevel, linux-api,
	Andrew Morton, Alexei Starovoitov, Chanho Min, Oleg Nesterov,
	Aleksa Sarai, linux-kernel, linux-arch

Patch changelog:
  v6:
    * Drop O_* flags API to the new LOOKUP_ path scoping bits and
      instead introduce resolveat(2) as an alternative method of
      obtaining an O_PATH. The justification for this is included in
      patch 6 (though switching back to O_* flags is trivial).
  v5:
    * In response to CVE-2019-5736 (one of the vectors showed that
      open(2)+fexecve(3) cannot be used to scope binfmt_script's implicit
      open_exec()), AT_* flags have been re-added and are now piped
      through to binfmt_script (and other binfmt_* that use open_exec)
      but are only supported for execveat(2) for now.
  v4:
    * Remove AT_* flag reservations, as they require more discussion.
    * Switch to path_is_under() over __d_path() for breakout checking.
    * Make O_XDEV no longer block openat("/tmp", "/", O_XDEV) -- dirfd
      is now ignored for absolute paths to match other flags.
    * Improve the dirfd_path_init() refactor and move it to a separate
      commit.
    * Remove reference to Linux-capsicum.
    * Switch "proclink" name to "magic link".
  v3: [resend]
  v2:
    * Made ".." resolution with AT_THIS_ROOT and AT_BENEATH safe(r) with
      some semi-aggressive __d_path checking (see patch 3).
    * Disallowed "proclinks" with AT_THIS_ROOT and AT_BENEATH, in the
      hopes they can be re-enabled once safe.
    * Removed the selftests as they will be reimplemented as xfstests.
    * Removed stat(2) support, since you can already get it through
      O_PATH and fstatat(2).

The need for some sort of control over VFS's path resolution (to avoid
malicious paths resulting in inadvertent breakouts) has been a very
long-standing desire of many userspace applications. This patchset is a
revival of Al Viro's old AT_NO_JUMPS[1,2] patchset (which was a variant
of David Drysdale's O_BENEATH patchset[3] which was a spin-off of the
Capsicum project[4]) with a few additions and changes made based on the
previous discussion within [5] as well as others I felt were useful.

In line with the conclusions of the original discussion of AT_NO_JUMPS,
the flag has been split up into separate flags. However, instead of
being an openat(2) flag it is provided through a new syscall
resolveat(2) which provides an alternative way to get an O_PATH file
descriptor (the reasoning for doing this is included in patch 6). The
following new LOOKUP_ (and corresponding uapi) flags are added:

  * LOOKUP_XDEV blocks all mountpoint crossings (upwards, downwards, or
    through absolute links). Absolute pathnames alone in openat(2) do
    not trigger this.

  * LOOKUP_NO_MAGICLINKS blocks resolution through /proc/$pid/fd-style
    links. This is done by blocking the usage of nd_jump_link() during
    resolution in a filesystem. The term "magic links" is used to match
    with the only reference to these links in Documentation/, but I'm
    happy to change the name.

    It should be noted that this is different to the scope of
    ~LOOKUP_FOLLOW in that it applies to all path components. However,
    you can do resolveat(NOFOLLOW|NO_MAGICLINKS) on a "magic link" and
    it will *not* fail (assuming that no parent component was a "magic
    link"), and you will have an fd for the "magic link".

  * LOOKUP_BENEATH disallows escapes to outside the starting dirfd's
    tree, using techniques such as ".." or absolute links. Absolute
    paths in openat(2) are also disallowed. Conceptually this flag is to
    ensure you "stay below" a certain point in the filesystem tree --
    but this requires some additional checks to protect against various
    races that would allow escape using ".." (see patch 4 for more
    detail).

    Currently LOOKUP_BENEATH implies LOOKUP_NO_MAGICLINKS, because
    "magic links" can trivially beam you around the filesystem (breaking
    the protection). In future, there might be similar safety checks as
    in patch 4, but that requires more discussion.

In addition, two new flags were added that expand on the above ideas:

  * LOOKUP_NO_SYMLINKS does what it says on the tin. No symlink
    resolution is allowed at all, including "magic links". Just as with
    LOOKUP_NO_MAGICLINKS this can still be used with NOFOLLOW to open an
    fd for the symlink as long as no parent path had a symlink
    component.

  * LOOKUP_IN_ROOT is an extension of LOOKUP_BENEATH that, rather than
    blocking attempts to move past the root, forces all such movements
    to be scoped to the starting point. This provides chroot(2)-like
    protection but without the cost of a chroot(2) for each filesystem
    operation, as well as being safe against race attacks that chroot(2)
    is not.

    If a race is detected (as with LOOKUP_BENEATH) then an error is
    generated, and similar to LOOKUP_BENEATH it is not permitted to cross
    "magic links" with LOOKUP_IN_ROOT.

    The primary need for this is from container runtimes, which
    currently need to do symlink scoping in userspace[6] when opening
    paths in a potentially malicious container. There is a long list of
    CVEs that could have been mitigated by having O_THISROOT (such as
    CVE-2017-1002101, CVE-2017-1002102, CVE-2018-15664, and
    CVE-2019-5736, just to name a few).

In addition, a mirror set of AT_* flags has been added (though
currently these are only supported for execveat(2) -- and not for any
other syscall). The need for these is explained in patch 5 (it's
motivated by CVE-2019-5736).

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Christian Brauner <christian@brauner.io>
Cc: David Drysdale <drysdale@google.com>
Cc: Tycho Andersen <tycho@tycho.ws>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <containers@lists.linux-foundation.org>
Cc: <linux-fsdevel@vger.kernel.org>
Cc: <linux-api@vger.kernel.org>

[1]: https://lwn.net/Articles/721443/
[2]: https://lore.kernel.org/patchwork/patch/784221/
[3]: https://lwn.net/Articles/619151/
[4]: https://lwn.net/Articles/603929/
[5]: https://lwn.net/Articles/723057/
[6]: https://github.com/cyphar/filepath-securejoin

Aleksa Sarai (6):
  namei: split out nd->dfd handling to dirfd_path_init
  namei: O_BENEATH-style path resolution flags
  namei: LOOKUP_IN_ROOT: chroot-like path resolution
  namei: aggressively check for nd->root escape on ".." resolution
  binfmt_*: scope path resolution of interpreters
  namei: resolveat(2) syscall

 arch/alpha/kernel/syscalls/syscall.tbl      |   1 +
 arch/arm/tools/syscall.tbl                  |   1 +
 arch/ia64/kernel/syscalls/syscall.tbl       |   1 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   1 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   1 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   1 +
 arch/s390/kernel/syscalls/syscall.tbl       |   1 +
 arch/sh/kernel/syscalls/syscall.tbl         |   1 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   1 +
 fs/binfmt_elf.c                             |   2 +-
 fs/binfmt_elf_fdpic.c                       |   2 +-
 fs/binfmt_em86.c                            |   4 +-
 fs/binfmt_misc.c                            |   2 +-
 fs/binfmt_script.c                          |   2 +-
 fs/exec.c                                   |  26 +-
 fs/namei.c                                  | 251 +++++++++++++++-----
 include/linux/binfmts.h                     |   1 +
 include/linux/fs.h                          |   9 +-
 include/linux/namei.h                       |   8 +
 include/uapi/linux/fcntl.h                  |  18 ++
 27 files changed, 270 insertions(+), 71 deletions(-)

-- 
2.21.0

^ permalink raw reply

* Re: RFC: on adding new CLONE_* flags [WAS Re: [PATCH 0/4] clone: add CLONE_PIDFD]
From: Serge E. Hallyn @ 2019-05-05  2:32 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: Serge E. Hallyn, Christian Brauner, torvalds, viro, jannh,
	dhowells, linux-api, linux-kernel, luto, arnd, ebiederm, keescook,
	tglx, mtk.manpages, akpm, oleg, cyphar, joel, dancol
In-Reply-To: <c95fbdbb-a62b-4ad1-f4be-7d1a8f96f508@metux.net>

On Mon, Apr 29, 2019 at 07:31:43PM +0200, Enrico Weigelt, metux IT consult wrote:

Argh.  Sorry, it seems your emails aren't making it into my inbox, only
my once-in-a-long-while-checked lkml folder.  Sorry again.

> On 29.04.19 17:49, Serge E. Hallyn wrote:
> 
> >> * all users are equal - no root at all. the only exception is the>>   initial process, which gets the kernel devices mounted into his>>
>  namespace.> > This does not match my understanding, but I'm most likely
> wrong.  (I thought> there was an actual 'host owner' uid, which mostly
> is only used for initial> process, but is basically root with a
> different name, and used far less.  No> uid transitions without factotem
> so that it *looked* like no root user).
> Not quite (IIRC). The hostowner is just the user who booted the machine,
> the initial process runs under this uname and gets the kernel devices
> bound into his namespace, so he can start fileservers on them.
> 
> Also the caphash device (the one you can create capabilities, eg. for
> user change, which then can be used via capuse device) can only be
> opened once - usually by the host factotum.
> 
> There really is no such thing like root user.
> 
> >> What I'd like to achieve on Linux:>>>> * unprivileged users can have their own mount namespace, where
> they>>   can mount at will (maybe just 9P).> > No problem, you can do
> that now.
>
> But only within separate userns, IMHO. (and, when I last tried, plain

"Only within a separate userns" - but why does that matter?  It's just
a different uid mapping.

> users couldn't directly create their userns).

Plain users can definitely create their own userns, directly.  On some
distros there is a kernel knob like

#cat /proc/sys/kernel/unprivileged_userns_clone
1

which when unset prevents unprivileged users creating a namespace.

> >> * but they still appear as the same normal users to the rest of the
> >>   system
> > 
> > No problem, you can do that now.
> 
> How exactly ? Did I miss something vital ?

By unsharing your namespace and writing the new uid mapping.  You can of
course only map your own uid without using any privileged helpers at all.
And it requires help from a second process, which does the writing to
the uid map file after the first process has unshared.  But you can do it.
For instance, using the nsexec.c at

	https://github.com/fcicq/nsexec

You can:

Terminal 1:
	shallyn@stp:~/src/nsexec$ ./nsexec -UWm
	about to unshare with 10020000
	Press any key to exec (I am 31157)

Now in terminal 2:

Terminal 2:
	shallyn@stp:~/src/nsexec$ echo "0 1000 1" > /proc/31157/uid_map
	shallyn@stp:~/src/nsexec$ echo deny > /proc/31157/setgroups
	shallyn@stp:~/src/nsexec$ echo "0 1000 1" > /proc/31157/gid_map

Then back in terminal 1:
	# id
	uid=0(root) gid=0(root) groups=0(root),65534(nogroup)
	# mount --bind /etc /mnt
	# echo $?
	0
	# ls /root
	ls: cannot open directory '/root': Permission denied

To the rest of the system you look like uid 1000.  You could have
chosen uid 1000 in your new namespace, but then you couldn't mount.
Of course you can nest user namespaces so you could create another,
this time mapping uid 1000 so you look like 1000 to yourself as well.

> >> * 9p programs (compiled for Linux ABI) can run parallel to traditional
> >>   linux programs within the same user and sessions (eg. from a terminal,
> >>   i can call both the same way)
> >> * namespace modifications affect both equally (eg. I could run ff in
> >>   an own ns)
> > 
> > affect both of what equally?
> 
> mount / bind.
> 
> > That's exactly what user namespaces are for.  You can create a new
> > user namespace, using no privilege at all, with your current uid (i.e.
> > 1000) mapped to whatever uid you like; if you pick 0, then you can unshare all
> > the namespaces you like.  
> 
> But I don't like to appear as 'root' in here. I just wanna have my own
> filesystem namespace, nothing more.

Right.  As you know setuid makes that impossible, unfortunately.  That's
where nonewprivs shows promise.

> > Once you unshare mnt_ns, you can mount to your
> > heart's content.  To other processes on the host, your process is
> > uid 1000.
> 
> Is that the uid, I'm appearing to filesystems ?

Yes.

> > Regarding factotum, I agree that with the pidfd work going on etc, it's getting
> > more and more tempting to attempt a switch to that.  Looking back at my folder,
> > I see you posted a kernel patch for it.  I had done the same long ago.  Happy to
> > work with you again on that, and put a simple daemon into shadow package, if
> > util-linux isn't deemed the far better place.
> 
> Yeah :)
> 
> 
> --mtx
> 
> -- 
> Enrico Weigelt, metux IT consult
> Free software and Linux embedded engineering
> info@metux.net -- +49-151-27565287

^ permalink raw reply

* Re: [PATCH for 5.2 00/12] Restartable Sequences selftests updates
From: shuah @ 2019-05-03 22:59 UTC (permalink / raw)
  To: Mathieu Desnoyers, Andy Whitcroft, Joe Perches
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <1137649333.995.1556911352713.JavaMail.zimbra@efficios.com>

On 5/3/19 1:22 PM, Mathieu Desnoyers wrote:
> ----- On May 3, 2019, at 2:53 PM, shuah shuah@kernel.org wrote:
> 
>> On 5/3/19 12:36 PM, Mathieu Desnoyers wrote:
>>> ----- On Apr 29, 2019, at 11:27 AM, Mathieu Desnoyers
>>> mathieu.desnoyers@efficios.com wrote:
>>>
>>>> Those rseq selftests updates are hereby submitted to Shuah Khan,
>>>> maintainer of kernel selftests, for the next merge window (5.2).
>>>>
>>>> They change the per-architecture pre-abort signatures to ensure those
>>>> are valid trap instructions.
>>>>
>>>> The way exit points are presented to debuggers is enhanced, ensuring
>>>> all exit points are present, so debuggers don't have to disassemble
>>>> rseq critical section to properly skip over them.
>>>>
>>>> Discussions with the glibc community are reaching a consensus of exposing
>>>> a __rseq_handled symbol from glibc to coexist with rseq early adopters.
>>>> Update the rseq selftest code to expose and use this symbol.
>>>>
>>>> Support for compiling asm goto with clang is added with the
>>>> "-no-integrated-as" compiler switch, similarly to the toplevel kernel
>>>> Makefile.
>>>
>>> Hi Shuah,
>>>
>>> Is there anything else you need before you can pick up those patches ?
>>>
>>
>> I was going to say "no more work needed" and noticed that the series has
>> checkpatch errors and warns as I was running the series through
>> pre-commit tests.
>>
>> Patches 1,2,3,8 have errors/warns based
>> on quick look at the log.
>>
>>
>> ERROR: need consistent spacing around '%' (ctx:WxV)
>> #227: FILE: tools/testing/selftests/rseq/rseq-x86.h:104:
>> +		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
>>
>>
>> Will you be able to fix them and resend?
> 
> (CCing the checkpatch maintainers)
> 
> checkpatch appears to be wrong for these errors. I suspect it thinks those are
> '%' modulo operators (for which the style requires space before/after),
> but those are actually part of the asm input and goto target operands.
> 
> Most warnings are about some lines over 80 cols. However, the areas where
> this happens is due to following the style of already upstream code which
> has the final "\" at the end of line sometimes beyond 80 col to accommodate
> macros that take a bit of horizontal real estate.
> 
> For patch 8, the warning about "availble" being a typo is right. The
> style error about space after "asm (" is right as well. Should I send only
> this updated patch to you or should I send the whole patchset again ?
> 

No need to send all patches. This is good.

thanks,
-- Shuah

^ permalink raw reply

* Re: [PATCH for 5.2 00/12] Restartable Sequences selftests updates
From: Joe Perches @ 2019-05-03 21:46 UTC (permalink / raw)
  To: Mathieu Desnoyers, shuah, Andy Whitcroft
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <1137649333.995.1556911352713.JavaMail.zimbra@efficios.com>

On Fri, 2019-05-03 at 15:22 -0400, Mathieu Desnoyers wrote:
> ----- On May 3, 2019, at 2:53 PM, shuah shuah@kernel.org wrote:
> > ERROR: need consistent spacing around '%' (ctx:WxV)
> > #227: FILE: tools/testing/selftests/rseq/rseq-x86.h:104:
> > +		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
> > 
> > Will you be able to fix them and resend?
[]
> (CCing the checkpatch maintainers)
> 
> checkpatch appears to be wrong for these errors. I suspect it thinks those are
> '%' modulo operators (for which the style requires space before/after),
> but those are actually part of the asm input and goto target operands.

checkpatch doesn't really understand asm.
Ignore checkpatch when it's silly.

^ permalink raw reply

* [PATCH v2 for 5.2 08/12] rseq/selftests: arm: use udf instruction for RSEQ_SIG
From: Mathieu Desnoyers @ 2019-05-03 19:38 UTC (permalink / raw)
  To: Shuah Khan
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H . Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer,
	Steven Rostedt, Josh Triplett, Linus Torvalds, Catalin Marinas,
	Will Deacon
In-Reply-To: <20190429152803.7719-9-mathieu.desnoyers@efficios.com>

Use udf as the guard instruction for the restartable sequence abort
handler.

Previously, the chosen signature was not a valid instruction, based
on the assumption that it could always sit in a literal pool. However,
there are compilation environments in which literal pools are not
available, for instance execute-only code. Therefore, we need to
choose a signature value that is also a valid instruction.

Handle compiling with -mbig-endian on ARMv6+, which generates binaries
with mixed code vs data endianness (little endian code, big endian
data).

Else mismatch between code endianness for the generated signatures and
data endianness for the RSEQ_SIG parameter passed to the rseq
registration will trigger application segmentation faults when the
kernel tries to abort rseq critical sections.

Prior to ARMv6, -mbig-endian generates big-endian code and data, so
endianness should not be reversed in that case.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Joel Fernandes <joelaf@google.com>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Shuah Khan <shuah@kernel.org>
CC: Andi Kleen <andi@firstfloor.org>
CC: linux-kselftest@vger.kernel.org
CC: "H . Peter Anvin" <hpa@zytor.com>
CC: Chris Lameter <cl@linux.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Michael Kerrisk <mtk.manpages@gmail.com>
CC: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
CC: Paul Turner <pjt@google.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Ben Maurer <bmaurer@fb.com>
CC: linux-api@vger.kernel.org
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
---
Changes since v1:
- Fix checkpatch error and warning.

---
 tools/testing/selftests/rseq/rseq-arm.h | 52 +++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
index 5f262c54364f..84f28f147fb6 100644
--- a/tools/testing/selftests/rseq/rseq-arm.h
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -5,7 +5,54 @@
  * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  */
 
-#define RSEQ_SIG	0x53053053
+/*
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5de3. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern in the A32 instruction set is:
+ *
+ * e7f5def3    udf    #24035    ; 0x5de3
+ *
+ * This translates to the following instruction pattern in the T16 instruction
+ * set:
+ *
+ * little endian:
+ * def3        udf    #243      ; 0xf3
+ * e7f5        b.n    <7f5>
+ *
+ * pre-ARMv6 big endian code:
+ * e7f5        b.n    <7f5>
+ * def3        udf    #243      ; 0xf3
+ *
+ * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
+ * code and big-endian data. Ensure the RSEQ_SIG data signature matches code
+ * endianness. Prior to ARMv6, -mbig-endian generates big-endian code and data
+ * (which match), so there is no need to reverse the endianness of the data
+ * representation of the signature. However, the choice between BE32 and BE8
+ * is done by the linker, so we cannot know whether code and data endianness
+ * will be mixed before the linker is invoked.
+ */
+
+#define RSEQ_SIG_CODE	0xe7f5def3
+
+#ifndef __ASSEMBLER__
+
+#define RSEQ_SIG_DATA							\
+	({								\
+		int sig;						\
+		asm volatile ("b 2f\n\t"				\
+			      "1: .inst " __rseq_str(RSEQ_SIG_CODE) "\n\t" \
+			      "2:\n\t"					\
+			      "ldr %[sig], 1b\n\t"			\
+			      : [sig] "=r" (sig));			\
+		sig;							\
+	})
+
+#define RSEQ_SIG	RSEQ_SIG_DATA
+
+#endif
 
 #define rseq_smp_mb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
 #define rseq_smp_rmb()	__asm__ __volatile__ ("dmb" ::: "memory", "cc")
@@ -78,7 +125,8 @@ do {									\
 		__rseq_str(table_label) ":\n\t"				\
 		".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
 		".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
-		".word " __rseq_str(RSEQ_SIG) "\n\t"			\
+		".arm\n\t"						\
+		".inst " __rseq_str(RSEQ_SIG_CODE) "\n\t"		\
 		__rseq_str(label) ":\n\t"				\
 		teardown						\
 		"b %l[" __rseq_str(abort_label) "]\n\t"
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH for 5.2 00/12] Restartable Sequences selftests updates
From: Mathieu Desnoyers @ 2019-05-03 19:22 UTC (permalink / raw)
  To: shuah, Andy Whitcroft, Joe Perches
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <68a135d7-7b30-71c7-c570-c7608d6f75d5@kernel.org>

----- On May 3, 2019, at 2:53 PM, shuah shuah@kernel.org wrote:

> On 5/3/19 12:36 PM, Mathieu Desnoyers wrote:
>> ----- On Apr 29, 2019, at 11:27 AM, Mathieu Desnoyers
>> mathieu.desnoyers@efficios.com wrote:
>> 
>>> Those rseq selftests updates are hereby submitted to Shuah Khan,
>>> maintainer of kernel selftests, for the next merge window (5.2).
>>>
>>> They change the per-architecture pre-abort signatures to ensure those
>>> are valid trap instructions.
>>>
>>> The way exit points are presented to debuggers is enhanced, ensuring
>>> all exit points are present, so debuggers don't have to disassemble
>>> rseq critical section to properly skip over them.
>>>
>>> Discussions with the glibc community are reaching a consensus of exposing
>>> a __rseq_handled symbol from glibc to coexist with rseq early adopters.
>>> Update the rseq selftest code to expose and use this symbol.
>>>
>>> Support for compiling asm goto with clang is added with the
>>> "-no-integrated-as" compiler switch, similarly to the toplevel kernel
>>> Makefile.
>> 
>> Hi Shuah,
>> 
>> Is there anything else you need before you can pick up those patches ?
>> 
> 
> I was going to say "no more work needed" and noticed that the series has
> checkpatch errors and warns as I was running the series through
> pre-commit tests.
> 
> Patches 1,2,3,8 have errors/warns based
> on quick look at the log.
> 
> 
> ERROR: need consistent spacing around '%' (ctx:WxV)
> #227: FILE: tools/testing/selftests/rseq/rseq-x86.h:104:
> +		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])
> 
> 
> Will you be able to fix them and resend?

(CCing the checkpatch maintainers)

checkpatch appears to be wrong for these errors. I suspect it thinks those are
'%' modulo operators (for which the style requires space before/after),
but those are actually part of the asm input and goto target operands.

Most warnings are about some lines over 80 cols. However, the areas where
this happens is due to following the style of already upstream code which
has the final "\" at the end of line sometimes beyond 80 col to accommodate
macros that take a bit of horizontal real estate.

For patch 8, the warning about "availble" being a typo is right. The
style error about space after "asm (" is right as well. Should I send only
this updated patch to you or should I send the whole patchset again ?

Thanks,

Mathieu


> 
> thanks,
> -- Shuah

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCH for 5.2 00/12] Restartable Sequences selftests updates
From: shuah @ 2019-05-03 18:53 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <678952111.699.1556908562445.JavaMail.zimbra@efficios.com>

On 5/3/19 12:36 PM, Mathieu Desnoyers wrote:
> ----- On Apr 29, 2019, at 11:27 AM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:
> 
>> Those rseq selftests updates are hereby submitted to Shuah Khan,
>> maintainer of kernel selftests, for the next merge window (5.2).
>>
>> They change the per-architecture pre-abort signatures to ensure those
>> are valid trap instructions.
>>
>> The way exit points are presented to debuggers is enhanced, ensuring
>> all exit points are present, so debuggers don't have to disassemble
>> rseq critical section to properly skip over them.
>>
>> Discussions with the glibc community are reaching a consensus of exposing
>> a __rseq_handled symbol from glibc to coexist with rseq early adopters.
>> Update the rseq selftest code to expose and use this symbol.
>>
>> Support for compiling asm goto with clang is added with the
>> "-no-integrated-as" compiler switch, similarly to the toplevel kernel
>> Makefile.
> 
> Hi Shuah,
> 
> Is there anything else you need before you can pick up those patches ?
> 

I was going to say "no more work needed" and noticed that the series has
checkpatch errors and warns as I was running the series through
pre-commit tests.

Patches 1,2,3,8 have errors/warns based
on quick look at the log.


ERROR: need consistent spacing around '%' (ctx:WxV)
#227: FILE: tools/testing/selftests/rseq/rseq-x86.h:104:
+		RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_CPU_ID_OFFSET(%[rseq_abi]), %l[error1])


Will you be able to fix them and resend?

thanks,
-- Shuah

^ permalink raw reply

* [PATCH 2/5] glibc: sched_getcpu(): use rseq cpu_id TLS on Linux (v4)
From: Mathieu Desnoyers @ 2019-05-03 18:42 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Florian Weimer, Joseph Myers, Szabolcs Nagy, libc-alpha,
	Mathieu Desnoyers, Thomas Gleixner, Ben Maurer, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Will Deacon, Dave Watson,
	Paul Turner, linux-kernel, linux-api
In-Reply-To: <20190503184219.19266-1-mathieu.desnoyers@efficios.com>

When available, use the cpu_id field from __rseq_abi on Linux to
implement sched_getcpu(). Fall-back on the vgetcpu vDSO if unavailable.

Benchmarks:

x86-64: Intel E5-2630 v3@2.40GHz, 16-core, hyperthreading

glibc sched_getcpu():                     13.7 ns (baseline)
glibc sched_getcpu() using rseq:           2.5 ns (speedup:  5.5x)
inline load cpuid from __rseq_abi TLS:     0.8 ns (speedup: 17.1x)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Carlos O'Donell <carlos@redhat.com>
CC: Florian Weimer <fweimer@redhat.com>
CC: Joseph Myers <joseph@codesourcery.com>
CC: Szabolcs Nagy <szabolcs.nagy@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ben Maurer <bmaurer@fb.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Paul Turner <pjt@google.com>
CC: libc-alpha@sourceware.org
CC: linux-kernel@vger.kernel.org
CC: linux-api@vger.kernel.org
---
Changes since v1:
- rseq is only used if both __NR_rseq and RSEQ_SIG are defined.

Changes since v2:
- remove duplicated __rseq_abi extern declaration.

Changes since v3:
- update ChangeLog.
---
 ChangeLog                              |  5 +++++
 sysdeps/unix/sysv/linux/sched_getcpu.c | 24 ++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 459af8f1a5..3e8ec0de1c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2019-04-23  Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+	* sysdeps/unix/sysv/linux/sched_getcpu.c: use rseq cpu_id TLS on
+	Linux.
+
 2019-04-23  Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 
 	* NEWS: Add Restartable Sequences feature description.
diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c
index fb0d317f83..167976fab7 100644
--- a/sysdeps/unix/sysv/linux/sched_getcpu.c
+++ b/sysdeps/unix/sysv/linux/sched_getcpu.c
@@ -24,8 +24,8 @@
 #endif
 #include <sysdep-vdso.h>
 
-int
-sched_getcpu (void)
+static int
+vsyscall_sched_getcpu (void)
 {
 #ifdef __NR_getcpu
   unsigned int cpu;
@@ -37,3 +37,23 @@ sched_getcpu (void)
   return -1;
 #endif
 }
+
+#ifdef __NR_rseq
+#include <sys/rseq.h>
+#endif
+
+#if defined __NR_rseq && defined RSEQ_SIG
+int
+sched_getcpu (void)
+{
+  int cpu_id = __rseq_abi.cpu_id;
+
+  return cpu_id >= 0 ? cpu_id : vsyscall_sched_getcpu ();
+}
+#else
+int
+sched_getcpu (void)
+{
+  return vsyscall_sched_getcpu ();
+}
+#endif
-- 
2.17.1

^ permalink raw reply related

* [PATCH 1/5] glibc: Perform rseq(2) registration at C startup and thread creation (v10)
From: Mathieu Desnoyers @ 2019-05-03 18:42 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Florian Weimer, Joseph Myers, Szabolcs Nagy, libc-alpha,
	Mathieu Desnoyers, Thomas Gleixner, Ben Maurer, Peter Zijlstra,
	Paul E. McKenney, Boqun Feng, Will Deacon, Dave Watson,
	Paul Turner, Rich Felker, linux-kernel, linux-api
In-Reply-To: <20190503184219.19266-1-mathieu.desnoyers@efficios.com>

Register rseq(2) TLS for each thread (including main), and unregister
for each thread (excluding main). "rseq" stands for Restartable
Sequences.

See the rseq(2) man page proposed here:
  https://lkml.org/lkml/2018/9/19/647

This patch is based on glibc-2.29. The rseq(2) system call was merged
into Linux 4.18.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Carlos O'Donell <carlos@redhat.com>
CC: Florian Weimer <fweimer@redhat.com>
CC: Joseph Myers <joseph@codesourcery.com>
CC: Szabolcs Nagy <szabolcs.nagy@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ben Maurer <bmaurer@fb.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Paul Turner <pjt@google.com>
CC: Rich Felker <dalias@libc.org>
CC: libc-alpha@sourceware.org
CC: linux-kernel@vger.kernel.org
CC: linux-api@vger.kernel.org
---
Changes since v1:
- Move __rseq_refcount to an extra field at the end of __rseq_abi to
  eliminate one symbol.

  All libraries/programs which try to register rseq (glibc,
  early-adopter applications, early-adopter libraries) should use the
  rseq refcount. It becomes part of the ABI within a user-space
  process, but it's not part of the ABI shared with the kernel per se.

- Restructure how this code is organized so glibc keeps building on
  non-Linux targets.

- Use non-weak symbol for __rseq_abi.

- Move rseq registration/unregistration implementation into its own
  nptl/rseq.c compile unit.

- Move __rseq_abi symbol under GLIBC_2.29.

Changes since v2:
- Move __rseq_refcount to its own symbol, which is less ugly than
  trying to play tricks with the rseq uapi.
- Move __rseq_abi from nptl to csu (C start up), so it can be used
  across glibc, including memory allocator and sched_getcpu(). The
  __rseq_refcount symbol is kept in nptl, because there is no reason
  to use it elsewhere in glibc.

Changes since v3:
- Set __rseq_refcount TLS to 1 on register/set to 0 on unregister
  because glibc is the first/last user.
- Unconditionally register/unregister rseq at thread start/exit, because
  glibc is the first/last user.
- Add missing abilist items.
- Rebase on glibc master commit a502c5294.
- Add NEWS entry.

Changes since v4:
- Do not use "weak" symbols for __rseq_abi and __rseq_refcount. Based on
  "System V Application Binary Interface", weak only affects the link
  editor, not the dynamic linker.
- Install a new sys/rseq.h system header on Linux, which contains the
  RSEQ_SIG definition, __rseq_abi declaration and __rseq_refcount
  declaration. Move those definition/declarations from rseq-internal.h
  to the installed sys/rseq.h header.
- Considering that rseq is only available on Linux, move csu/rseq.c to
  sysdeps/unix/sysv/linux/rseq-sym.c.
- Move __rseq_refcount from nptl/rseq.c to
  sysdeps/unix/sysv/linux/rseq-sym.c, so it is only defined on Linux.
- Move both ABI definitions for __rseq_abi and __rseq_refcount to
  sysdeps/unix/sysv/linux/Versions, so they only appear on Linux.
- Document __rseq_abi and __rseq_refcount volatile.
- Document the RSEQ_SIG signature define.
- Move registration functions from rseq.c to rseq-internal.h static
  inline functions. Introduce empty stubs in misc/rseq-internal.h,
  which can be overridden by architecture code in
  sysdeps/unix/sysv/linux/rseq-internal.h.
- Rename __rseq_register_current_thread and __rseq_unregister_current_thread
  to rseq_register_current_thread and rseq_unregister_current_thread,
  now that those are only visible as internal static inline functions.
- Invoke rseq_register_current_thread() from libc-start.c LIBC_START_MAIN
  rather than nptl init, so applications not linked against
  libpthread.so have rseq registered for their main() thread. Note that
  it is invoked separately for SHARED and !SHARED builds.

Changes since v5:
- Replace __rseq_refcount by __rseq_lib_abi, which contains two
  uint32_t: register_state and refcount. The "register_state" field
  allows inhibiting rseq registration from signal handlers nested on top
  of glibc registration and occurring after rseq unregistration by glibc.
- Introduce enum rseq_register_state, which contains the states allowed
  for the struct rseq_lib_abi register_state field.

Changes since v6:
- Introduce bits/rseq.h to define RSEQ_SIG for each architecture.
  The generic bits/rseq.h does not define RSEQ_SIG, meaning that each
  architecture implementing rseq needs to implement bits/rseq.h.
- Rename enum item RSEQ_REGISTER_NESTED to RSEQ_REGISTER_ONGOING.
- Port to glibc-2.29.

Changes since v7:
- Remove __rseq_lib_abi symbol, including refcount and register_state
  fields.
- Remove reference counting and nested signals handling from
  registration/unregistration functions.
- Introduce new __rseq_handled exported symbol, which is set to 1
  by glibc on C startup when it handles restartable sequences.
  This allows glibc to coexist with early adopter libraries and
  applications wishing to register restartable sequences when it
  is not handled by glibc.
- Introduce rseq_init (), which sets __rseq_handled to 1 from
  C startup.
- Update NEWS entry.
- Update comments at the beginning of new files.
- Registration depends on both __NR_rseq and RSEQ_SIG.
- Remove ARM, powerpc, MIPS RSEQ_SIG until we agree with maintainers
  on the signature choice.
- Update x86, s390 RSEQ_SIG based on discussion with arch maintainers.
- Remove rseq-internal.h from headers list of misc/Makefile, so it
  is not installed by make install.

Changes since v8:
- Introduce RSEQ_SIG_CODE and RSEQ_SIG_DATA on aarch64 to handle
  compiling with -mbig-endian.

Changes since v9:
- Update Changelog.
- Remove unneeded new file comment header newlines.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Carlos O'Donell <carlos@redhat.com>
CC: Florian Weimer <fweimer@redhat.com>
CC: Joseph Myers <joseph@codesourcery.com>
CC: Szabolcs Nagy <szabolcs.nagy@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ben Maurer <bmaurer@fb.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Dave Watson <davejwatson@fb.com>
CC: Paul Turner <pjt@google.com>
CC: Rich Felker <dalias@libc.org>
CC: libc-alpha@sourceware.org
CC: linux-kernel@vger.kernel.org
CC: linux-api@vger.kernel.org
---
 ChangeLog                                     | 47 ++++++++++
 NEWS                                          | 15 ++++
 csu/libc-start.c                              | 14 ++-
 misc/rseq-internal.h                          | 38 ++++++++
 nptl/pthread_create.c                         |  9 ++
 sysdeps/unix/sysv/linux/Makefile              |  4 +-
 sysdeps/unix/sysv/linux/Versions              |  4 +
 sysdeps/unix/sysv/linux/aarch64/bits/rseq.h   | 43 +++++++++
 sysdeps/unix/sysv/linux/aarch64/libc.abilist  |  2 +
 sysdeps/unix/sysv/linux/alpha/libc.abilist    |  2 +
 sysdeps/unix/sysv/linux/arm/libc.abilist      |  2 +
 sysdeps/unix/sysv/linux/bits/rseq.h           | 29 ++++++
 sysdeps/unix/sysv/linux/csky/libc.abilist     |  2 +
 sysdeps/unix/sysv/linux/hppa/libc.abilist     |  2 +
 sysdeps/unix/sysv/linux/i386/libc.abilist     |  2 +
 sysdeps/unix/sysv/linux/ia64/libc.abilist     |  2 +
 .../sysv/linux/m68k/coldfire/libc.abilist     |  2 +
 .../unix/sysv/linux/m68k/m680x0/libc.abilist  |  2 +
 .../unix/sysv/linux/microblaze/libc.abilist   |  2 +
 .../sysv/linux/mips/mips32/fpu/libc.abilist   |  2 +
 .../sysv/linux/mips/mips32/nofpu/libc.abilist |  2 +
 .../sysv/linux/mips/mips64/n32/libc.abilist   |  2 +
 .../sysv/linux/mips/mips64/n64/libc.abilist   |  2 +
 sysdeps/unix/sysv/linux/nios2/libc.abilist    |  2 +
 .../linux/powerpc/powerpc32/fpu/libc.abilist  |  2 +
 .../powerpc/powerpc32/nofpu/libc.abilist      |  2 +
 .../linux/powerpc/powerpc64/be/libc.abilist   |  2 +
 .../linux/powerpc/powerpc64/le/libc.abilist   |  2 +
 .../unix/sysv/linux/riscv/rv64/libc.abilist   |  2 +
 sysdeps/unix/sysv/linux/rseq-internal.h       | 88 +++++++++++++++++++
 sysdeps/unix/sysv/linux/rseq-sym.c            | 63 +++++++++++++
 sysdeps/unix/sysv/linux/s390/bits/rseq.h      | 30 +++++++
 .../unix/sysv/linux/s390/s390-32/libc.abilist |  2 +
 .../unix/sysv/linux/s390/s390-64/libc.abilist |  2 +
 sysdeps/unix/sysv/linux/sh/libc.abilist       |  2 +
 .../sysv/linux/sparc/sparc32/libc.abilist     |  2 +
 .../sysv/linux/sparc/sparc64/libc.abilist     |  2 +
 sysdeps/unix/sysv/linux/sys/rseq.h            | 50 +++++++++++
 sysdeps/unix/sysv/linux/x86/bits/rseq.h       | 30 +++++++
 .../unix/sysv/linux/x86_64/64/libc.abilist    |  2 +
 .../unix/sysv/linux/x86_64/x32/libc.abilist   |  2 +
 41 files changed, 513 insertions(+), 5 deletions(-)
 create mode 100644 misc/rseq-internal.h
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/bits/rseq.h
 create mode 100644 sysdeps/unix/sysv/linux/bits/rseq.h
 create mode 100644 sysdeps/unix/sysv/linux/rseq-internal.h
 create mode 100644 sysdeps/unix/sysv/linux/rseq-sym.c
 create mode 100644 sysdeps/unix/sysv/linux/s390/bits/rseq.h
 create mode 100644 sysdeps/unix/sysv/linux/sys/rseq.h
 create mode 100644 sysdeps/unix/sysv/linux/x86/bits/rseq.h

diff --git a/ChangeLog b/ChangeLog
index 59dab18463..459af8f1a5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,50 @@
+2019-04-23  Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+	* NEWS: Add Restartable Sequences feature description.
+	* csu/libc-start.c: Perform rseq(2) registration at C startup and
+	thread creation.
+	* nptl/pthread_create.c: Likewise.
+	* sysdeps/unix/sysv/linux/Makefile: Add rseq-sym, sys/rseq.h,
+	bits/rseq.h.
+	* sysdeps/unix/sysv/linux/Versions: Export __rseq_abi and
+	__rseq_handled from libc.
+	* sysdeps/unix/sysv/linux/aarch64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/alpha/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/arm/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/csky/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/hppa/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/i386/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/ia64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/microblaze/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/nios2/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist:
+	Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/sh/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/x86_64/64/libc.abilist: Likewise.
+	* sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist: Likewise.
+	* misc/rseq-internal.h: New file.
+	* sysdeps/unix/sysv/linux/rseq-internal.h: Likewise.
+	* sysdeps/unix/sysv/linux/rseq-sym.c: Likewise.
+	* sysdeps/unix/sysv/linux/sys/rseq.h: Likewise.
+	* sysdeps/unix/sysv/linux/bits/rseq.h: Likewise.
+	* sysdeps/unix/sysv/linux/aarch64/bits/rseq.h: Likewise.
+	* sysdeps/unix/sysv/linux/s390/bits/rseq.h: Likewise.
+	* sysdeps/unix/sysv/linux/x86/bits/rseq.h: Likewise.
+
 2019-01-31  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
 	* version.h (RELEASE): Set to "stable".
diff --git a/NEWS b/NEWS
index 912a9bdc0f..7276a09b08 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,21 @@ See the end for copying conditions.
 Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
 using `glibc' in the "product" field.
 \f
+Version 2.30
+
+Major new features:
+
+* Support for automatically registering threads with the Linux rseq(2)
+  system call has been added.  This system call is implemented starting
+  from Linux 4.18.  The Restartable Sequences ABI accelerates user-space
+  operations on per-cpu data.  It allows user-space to perform updates
+  on per-cpu data without requiring heavy-weight atomic operations.
+  Automatically registering threads allows all libraries, including libc,
+  to make immediate use of the rseq(2) support by using the documented ABI.
+  See 'man 2 rseq' for the details of the ABI shared between libc and the
+  kernel.
+
+\f
 Version 2.29
 
 Major new features:
diff --git a/csu/libc-start.c b/csu/libc-start.c
index 5d9c3675fa..e101196b0d 100644
--- a/csu/libc-start.c
+++ b/csu/libc-start.c
@@ -22,6 +22,7 @@
 #include <ldsodefs.h>
 #include <exit-thread.h>
 #include <libc-internal.h>
+#include <rseq-internal.h>
 
 #include <elf/dl-tunables.h>
 
@@ -140,7 +141,12 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
 
   __libc_multiple_libcs = &_dl_starting_up && !_dl_starting_up;
 
-#ifndef SHARED
+  rseq_init ();
+
+#ifdef SHARED
+  /* Register the rseq ABI with the kernel.  */
+  (void) rseq_register_current_thread ();
+#else
   _dl_relocate_static_pie ();
 
   char **ev = &argv[argc + 1];
@@ -218,6 +224,9 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
     }
 # endif
 
+  /* Register the rseq ABI with the kernel.  */
+  (void) rseq_register_current_thread ();
+
   /* Initialize libpthread if linked in.  */
   if (__pthread_initialize_minimal != NULL)
     __pthread_initialize_minimal ();
@@ -230,8 +239,7 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
 # else
   __pointer_chk_guard_local = pointer_chk_guard;
 # endif
-
-#endif /* !SHARED  */
+#endif
 
   /* Register the destructor of the dynamic linker if there is any.  */
   if (__glibc_likely (rtld_fini != NULL))
diff --git a/misc/rseq-internal.h b/misc/rseq-internal.h
new file mode 100644
index 0000000000..ccad30bca5
--- /dev/null
+++ b/misc/rseq-internal.h
@@ -0,0 +1,38 @@
+/* Restartable Sequences internal API. Stub version.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef RSEQ_INTERNAL_H
+#define RSEQ_INTERNAL_H
+
+static inline int
+rseq_register_current_thread (void)
+{
+  return -1;
+}
+
+static inline int
+rseq_unregister_current_thread (void)
+{
+  return -1;
+}
+
+static inline void
+rseq_init (void)
+{ /* Stub: no rseq support on this target, so there is nothing to set up.  */
+}
+
+#endif /* rseq-internal.h */
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 2bd2b10727..90b3419390 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -33,6 +33,7 @@
 #include <default-sched.h>
 #include <futex-internal.h>
 #include <tls-setup.h>
+#include <rseq-internal.h>
 #include "libioP.h"
 
 #include <shlib-compat.h>
@@ -378,6 +379,7 @@ __free_tcb (struct pthread *pd)
 START_THREAD_DEFN
 {
   struct pthread *pd = START_THREAD_SELF;
+  bool has_rseq = false;
 
 #if HP_TIMING_AVAIL
   /* Remember the time when the thread was started.  */
@@ -396,6 +398,9 @@ START_THREAD_DEFN
   if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2))
     futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);
 
+  /* Register rseq TLS with the kernel.  */
+  has_rseq = !rseq_register_current_thread ();
+
 #ifdef __NR_set_robust_list
 # ifndef __ASSUME_SET_ROBUST_LIST
   if (__set_robust_list_avail >= 0)
@@ -573,6 +578,10 @@ START_THREAD_DEFN
     }
 #endif
 
+  /* Unregister rseq TLS from the kernel.  */
+  if (has_rseq && rseq_unregister_current_thread ())
+    abort();
+
   advise_stack_range (pd->stackblock, pd->stackblock_size, (uintptr_t) pd,
 		      pd->guardsize);
 
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 5f8c2c7c7d..5b541469ec 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -1,5 +1,5 @@
 ifeq ($(subdir),csu)
-sysdep_routines += errno-loc
+sysdep_routines += errno-loc rseq-sym
 endif
 
 ifeq ($(subdir),assert)
@@ -48,7 +48,7 @@ sysdep_headers += sys/mount.h sys/acct.h sys/sysctl.h \
 		  bits/termios-c_iflag.h bits/termios-c_oflag.h \
 		  bits/termios-baud.h bits/termios-c_cflag.h \
 		  bits/termios-c_lflag.h bits/termios-tcflow.h \
-		  bits/termios-misc.h
+		  bits/termios-misc.h sys/rseq.h bits/rseq.h
 
 tests += tst-clone tst-clone2 tst-clone3 tst-fanotify tst-personality \
 	 tst-quota tst-sync_file_range tst-sysconf-iov_max tst-ttyname \
diff --git a/sysdeps/unix/sysv/linux/Versions b/sysdeps/unix/sysv/linux/Versions
index f1e12d9c69..bee3d727e5 100644
--- a/sysdeps/unix/sysv/linux/Versions
+++ b/sysdeps/unix/sysv/linux/Versions
@@ -174,6 +174,10 @@ libc {
   GLIBC_2.29 {
     getcpu;
   }
+  GLIBC_2.30 {
+    __rseq_abi;
+    __rseq_handled;
+  }
   GLIBC_PRIVATE {
     # functions used in other libraries
     __syscall_rt_sigqueueinfo;
diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/rseq.h b/sysdeps/unix/sysv/linux/aarch64/bits/rseq.h
new file mode 100644
index 0000000000..35fcc41f1e
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/bits/rseq.h
@@ -0,0 +1,43 @@
+/* Restartable Sequences Linux aarch64 architecture header.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_RSEQ_H
+# error "Never use <bits/rseq.h> directly; include <sys/rseq.h> instead."
+#endif
+
+/* RSEQ_SIG is a signature required before each abort handler code.
+
+   It is a 32-bit value that maps to actual architecture code compiled
+   into applications and libraries. It needs to be defined for each
+   architecture. When choosing this value, it needs to be taken into
+   account that generating invalid instructions may have ill effects on
+   tools like objdump, and may also have impact on the CPU speculative
+   execution efficiency in some cases.
+
+   aarch64 -mbig-endian generates mixed endianness code vs data:
+   little-endian code and big-endian data. Ensure the RSEQ_SIG signature
+   matches code endianness.  */
+
+#define RSEQ_SIG_CODE	0xd428bc00	/* BRK #0x45E0.  */
+
+#ifdef __AARCH64EB__
+#define RSEQ_SIG_DATA	0x00bc28d4	/* BRK #0x45E0.  */
+#else
+#define RSEQ_SIG_DATA	RSEQ_SIG_CODE
+#endif
+
+#define RSEQ_SIG	RSEQ_SIG_DATA
diff --git a/sysdeps/unix/sysv/linux/aarch64/libc.abilist b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
index 9c330f325e..331f39e41a 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
@@ -2141,3 +2141,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/alpha/libc.abilist b/sysdeps/unix/sysv/linux/alpha/libc.abilist
index f630fa4c6f..05dfdd3393 100644
--- a/sysdeps/unix/sysv/linux/alpha/libc.abilist
+++ b/sysdeps/unix/sysv/linux/alpha/libc.abilist
@@ -2204,6 +2204,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/arm/libc.abilist b/sysdeps/unix/sysv/linux/arm/libc.abilist
index b96f45590f..24e9b89a50 100644
--- a/sysdeps/unix/sysv/linux/arm/libc.abilist
+++ b/sysdeps/unix/sysv/linux/arm/libc.abilist
@@ -126,6 +126,8 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _Exit F
 GLIBC_2.4 _IO_2_1_stderr_ D 0xa0
 GLIBC_2.4 _IO_2_1_stdin_ D 0xa0
diff --git a/sysdeps/unix/sysv/linux/bits/rseq.h b/sysdeps/unix/sysv/linux/bits/rseq.h
new file mode 100644
index 0000000000..a3c023f5c7
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/bits/rseq.h
@@ -0,0 +1,29 @@
+/* Restartable Sequences architecture header. Stub version.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_RSEQ_H
+# error "Never use <bits/rseq.h> directly; include <sys/rseq.h> instead."
+#endif
+
+/* RSEQ_SIG is a signature required before each abort handler code.
+
+   It is a 32-bit value that maps to actual architecture code compiled
+   into applications and libraries. It needs to be defined for each
+   architecture. When choosing this value, it needs to be taken into
+   account that generating invalid instructions may have ill effects on
+   tools like objdump, and may also have impact on the CPU speculative
+   execution efficiency in some cases.  */
diff --git a/sysdeps/unix/sysv/linux/csky/libc.abilist b/sysdeps/unix/sysv/linux/csky/libc.abilist
index 019044c3cd..e2b0538088 100644
--- a/sysdeps/unix/sysv/linux/csky/libc.abilist
+++ b/sysdeps/unix/sysv/linux/csky/libc.abilist
@@ -2085,3 +2085,5 @@ GLIBC_2.29 xdrstdio_create F
 GLIBC_2.29 xencrypt F
 GLIBC_2.29 xprt_register F
 GLIBC_2.29 xprt_unregister F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/hppa/libc.abilist b/sysdeps/unix/sysv/linux/hppa/libc.abilist
index 088a8ee369..263a91b97e 100644
--- a/sysdeps/unix/sysv/linux/hppa/libc.abilist
+++ b/sysdeps/unix/sysv/linux/hppa/libc.abilist
@@ -2037,6 +2037,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/i386/libc.abilist b/sysdeps/unix/sysv/linux/i386/libc.abilist
index f7ff2c57b9..18ce09d48a 100644
--- a/sysdeps/unix/sysv/linux/i386/libc.abilist
+++ b/sysdeps/unix/sysv/linux/i386/libc.abilist
@@ -2203,6 +2203,8 @@ GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 vm86 F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/ia64/libc.abilist b/sysdeps/unix/sysv/linux/ia64/libc.abilist
index becd8b1033..b61e2ee010 100644
--- a/sysdeps/unix/sysv/linux/ia64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/ia64/libc.abilist
@@ -2069,6 +2069,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
index 74e42a5209..e55792bb22 100644
--- a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
@@ -127,6 +127,8 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _Exit F
 GLIBC_2.4 _IO_2_1_stderr_ D 0x98
 GLIBC_2.4 _IO_2_1_stdin_ D 0x98
diff --git a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
index 4af5a74e8a..9845499048 100644
--- a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
@@ -2146,6 +2146,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/microblaze/libc.abilist b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
index ccef673fd2..1aba8cb86c 100644
--- a/sysdeps/unix/sysv/linux/microblaze/libc.abilist
+++ b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
@@ -2133,3 +2133,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
index 1054bb599e..df54e2adab 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
@@ -2120,6 +2120,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
index 4f5b5ffebf..ce95ae7e86 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
@@ -2118,6 +2118,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
index 943aee58d4..c9fb5d2096 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
@@ -2126,6 +2126,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
index 17a5d17ef9..6335df9acf 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
@@ -2120,6 +2120,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/nios2/libc.abilist b/sysdeps/unix/sysv/linux/nios2/libc.abilist
index 4d62a540fd..5465b96768 100644
--- a/sysdeps/unix/sysv/linux/nios2/libc.abilist
+++ b/sysdeps/unix/sysv/linux/nios2/libc.abilist
@@ -2174,3 +2174,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
index ecc2d6fa13..eb3808dbd4 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
@@ -2164,6 +2164,8 @@ GLIBC_2.3.4 siglongjmp F
 GLIBC_2.3.4 swapcontext F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
index f5830f9c33..6a49a7b718 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/nofpu/libc.abilist
@@ -2197,6 +2197,8 @@ GLIBC_2.3.4 siglongjmp F
 GLIBC_2.3.4 swapcontext F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
index 633d8f4792..83177dc75f 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
@@ -2027,6 +2027,8 @@ GLIBC_2.3.4 siglongjmp F
 GLIBC_2.3.4 swapcontext F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
index 2c712636ef..e714de994c 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
@@ -2231,3 +2231,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
index 195bc8b2cf..d190623993 100644
--- a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
@@ -2103,3 +2103,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
new file mode 100644
index 0000000000..edb31b1c3c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
@@ -0,0 +1,88 @@
+/* Restartable Sequences internal API. Linux implementation.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef RSEQ_INTERNAL_H
+#define RSEQ_INTERNAL_H
+
+#include <sysdep.h>
+#include <errno.h>
+
+#ifdef __NR_rseq
+#include <sys/rseq.h>
+#endif
+
+#if defined __NR_rseq && defined RSEQ_SIG
+
+static inline int
+rseq_register_current_thread (void)
+{
+  int rc, ret = 0;
+  INTERNAL_SYSCALL_DECL (err);
+
+  if (__rseq_abi.cpu_id == RSEQ_CPU_ID_REGISTRATION_FAILED)
+    return -1;
+  rc = INTERNAL_SYSCALL_CALL (rseq, err, &__rseq_abi, sizeof (struct rseq),
+                              0, RSEQ_SIG);
+  if (!rc)
+    goto end;
+  if (INTERNAL_SYSCALL_ERRNO (rc, err) != EBUSY)
+    __rseq_abi.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED;
+  ret = -1;
+end:
+  return ret;
+}
+
+static inline int
+rseq_unregister_current_thread (void)
+{
+  int rc, ret = 0;
+  INTERNAL_SYSCALL_DECL (err);
+
+  rc = INTERNAL_SYSCALL_CALL (rseq, err, &__rseq_abi, sizeof (struct rseq),
+                              RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
+  if (!rc)
+    goto end;
+  ret = -1;
+end:
+  return ret;
+}
+
+static inline void
+rseq_init (void)
+{
+  __rseq_handled = 1;
+}
+#else
+static inline int
+rseq_register_current_thread (void)
+{
+  return -1;
+}
+
+static inline int
+rseq_unregister_current_thread (void)
+{
+  return -1;
+}
+
+static inline void
+rseq_init (void)
+{
+}
+#endif
+
+#endif /* rseq-internal.h */
diff --git a/sysdeps/unix/sysv/linux/rseq-sym.c b/sysdeps/unix/sysv/linux/rseq-sym.c
new file mode 100644
index 0000000000..8e3abab3d0
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/rseq-sym.c
@@ -0,0 +1,63 @@
+/* Restartable Sequences exported symbols. Linux Implementation.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sys/syscall.h>
+#include <stdint.h>
+
+#ifdef __NR_rseq
+#include <sys/rseq.h>
+#else
+
+enum rseq_cpu_id_state {
+  RSEQ_CPU_ID_UNINITIALIZED = -1,
+  RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+/* linux/rseq.h defines struct rseq as aligned on 32 bytes. The kernel ABI
+   size is 20 bytes.  */
+struct rseq {
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
+  uint64_t rseq_cs;
+  uint32_t flags;
+} __attribute__ ((aligned(4 * sizeof(uint64_t))));
+
+#endif
+
+/* volatile because fields can be read/updated by the kernel.  */
+__thread volatile struct rseq __rseq_abi = {
+  .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+};
+
+/* Advertise Restartable Sequences registration ownership across
+   application and shared libraries.
+
+   Libraries and applications must check whether this variable is zero or
+   non-zero if they wish to perform rseq registration on their own. If it
+   is zero, it means restartable sequence registration is not handled, and
+   the library or application is free to perform rseq registration. In
+   that case, the library or application is taking ownership of rseq
+   registration, and may set __rseq_handled to 1. It may then set it back
+   to 0 after it completes unregistering rseq.
+
+   If __rseq_handled is found to be non-zero, it means that another
+   library (or the application) is currently handling rseq registration.
+
+   Typical use of __rseq_handled is within library constructors and
+   destructors, or at program startup.  */
+
+int __rseq_handled;
diff --git a/sysdeps/unix/sysv/linux/s390/bits/rseq.h b/sysdeps/unix/sysv/linux/s390/bits/rseq.h
new file mode 100644
index 0000000000..0ed16c23a4
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/s390/bits/rseq.h
@@ -0,0 +1,30 @@
+/* Restartable Sequences Linux s390 architecture header.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_RSEQ_H
+# error "Never use <bits/rseq.h> directly; include <sys/rseq.h> instead."
+#endif
+
+/* RSEQ_SIG is a signature required before each abort handler code.
+
+   RSEQ_SIG uses the trap4 instruction. As Linux does not make use of the
+   access-register mode nor the linkage stack this instruction will always
+   cause a special-operation exception (the trap-enabled bit in the DUCT
+   is and will stay 0). The instruction pattern is
+	b2 ff 0f ff	trap4   4095(%r0)  */
+
+#define RSEQ_SIG	0xB2FF0FFF
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
index 334def033c..dacae17ec4 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
@@ -2159,6 +2159,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
index 536f4c4ced..c277b3bd90 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
@@ -2063,6 +2063,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/sh/libc.abilist b/sysdeps/unix/sysv/linux/sh/libc.abilist
index 30ae3b6ebb..5f70e5c53b 100644
--- a/sysdeps/unix/sysv/linux/sh/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sh/libc.abilist
@@ -2041,6 +2041,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
index 68b107d080..537da009d3 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
@@ -2153,6 +2153,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 _IO_fprintf F
 GLIBC_2.4 _IO_printf F
 GLIBC_2.4 _IO_sprintf F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
index e5b6a4da50..1fee8e34fc 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
@@ -2092,6 +2092,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/sys/rseq.h b/sysdeps/unix/sysv/linux/sys/rseq.h
new file mode 100644
index 0000000000..5698f4a96d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/sys/rseq.h
@@ -0,0 +1,50 @@
+/* Restartable Sequences exported symbols. Linux header.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_RSEQ_H
+#define _SYS_RSEQ_H	1
+
+/* We use the structure declarations from the kernel headers.  */
+#include <linux/rseq.h>
+/* Architecture-specific rseq signature.  */
+#include <bits/rseq.h>
+#include <stdint.h>
+
+/* volatile because fields can be read/updated by the kernel.  */
+extern __thread volatile struct rseq __rseq_abi
+__attribute__ ((tls_model ("initial-exec")));
+
+/* Advertise Restartable Sequences registration ownership across
+   application and shared libraries.
+
+   Libraries and applications must check whether this variable is zero or
+   non-zero if they wish to perform rseq registration on their own. If it
+   is zero, it means restartable sequence registration is not handled, and
+   the library or application is free to perform rseq registration. In
+   that case, the library or application is taking ownership of rseq
+   registration, and may set __rseq_handled to 1. It may then set it back
+   to 0 after it completes unregistering rseq.
+
+   If __rseq_handled is found to be non-zero, it means that another
+   library (or the application) is currently handling rseq registration.
+
+   Typical use of __rseq_handled is within library constructors and
+   destructors, or at program startup.  */
+
+extern int __rseq_handled;
+
+#endif /* sys/rseq.h */
diff --git a/sysdeps/unix/sysv/linux/x86/bits/rseq.h b/sysdeps/unix/sysv/linux/x86/bits/rseq.h
new file mode 100644
index 0000000000..a2918c4617
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/x86/bits/rseq.h
@@ -0,0 +1,30 @@
+/* Restartable Sequences Linux x86 architecture header.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_RSEQ_H
+# error "Never use <bits/rseq.h> directly; include <sys/rseq.h> instead."
+#endif
+
+/* RSEQ_SIG is a signature required before each abort handler code.
+
+   RSEQ_SIG is used with the following reserved undefined instructions, which
+   trap in user-space:
+
+   x86-32:    0f b9 3d 53 30 05 53      ud1    0x53053053,%edi
+   x86-64:    0f b9 3d 53 30 05 53      ud1    0x53053053(%rip),%edi  */
+
+#define RSEQ_SIG	0x53053053
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
index 86dfb0c94d..a834f65383 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
@@ -2050,6 +2050,8 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
 GLIBC_2.4 __fgets_unlocked_chk F
diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
index dd688263aa..fb8417bde7 100644
--- a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
@@ -2149,3 +2149,5 @@ GLIBC_2.28 thrd_yield F
 GLIBC_2.29 getcpu F
 GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
+GLIBC_2.30 __rseq_abi T 0x20
+GLIBC_2.30 __rseq_handled D 0x4
-- 
2.17.1

^ permalink raw reply related

* Re: [PATCH for 5.2 00/12] Restartable Sequences selftests updates
From: Mathieu Desnoyers @ 2019-05-03 18:36 UTC (permalink / raw)
  To: shuah
  Cc: linux-kernel, linux-api, Thomas Gleixner, Peter Zijlstra,
	Paul E . McKenney, Boqun Feng, Andy Lutomirski, Dave Watson,
	Paul Turner, Andrew Morton, Russell King, Ingo Molnar,
	H. Peter Anvin, Andi Kleen, Chris Lameter, Ben Maurer, rostedt,
	Josh Triplett, Linus Torvalds, Catalin Marinas, Will
In-Reply-To: <20190429152803.7719-1-mathieu.desnoyers@efficios.com>

----- On Apr 29, 2019, at 11:27 AM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:

> Those rseq selftests updates are hereby submitted to Shuah Khan,
> maintainer of kernel selftests, for the next merge window (5.2).
> 
> They change the per-architecture pre-abort signatures to ensure those
> are valid trap instructions.
> 
> The way exit points are presented to debuggers is enhanced, ensuring
> all exit points are present, so debuggers don't have to disassemble
> rseq critical section to properly skip over them.
> 
> Discussions with the glibc community are reaching a consensus on exposing
> a __rseq_handled symbol from glibc to coexist with rseq early adopters.
> Update the rseq selftest code to expose and use this symbol.
> 
> Support for compiling asm goto with clang is added with the
> "-no-integrated-as" compiler switch, similarly to the toplevel kernel
> Makefile.

Hi Shuah,

Is there anything else you need before you can pick up those patches ?

Thanks,

Mathieu

> 
> Thanks,
> 
> Mathieu
> 
> Martin Schwidefsky (1):
>  rseq/selftests: s390: use trap4 for RSEQ_SIG
> 
> Mathieu Desnoyers (11):
>  rseq/selftests: x86: Work-around bogus gcc-8 optimisation
>  rseq/selftests: Add __rseq_exit_point_array section for debuggers
>  rseq/selftests: Introduce __rseq_cs_ptr_array, rename __rseq_table to
>    __rseq_cs
>  rseq/selftests: Use __rseq_handled symbol to coexist with glibc
>  rseq/selftests: s390: use jg instruction for jumps outside of the asm
>  rseq/selftests: x86: use ud1 instruction as RSEQ_SIG opcode
>  rseq/selftests: arm: use udf instruction for RSEQ_SIG
>  rseq/selftests: aarch64 code signature: handle big-endian environment
>  rseq/selftests: powerpc code signature: generate valid instructions
>  rseq/selftests: mips: use break instruction for RSEQ_SIG
>  rseq/selftests: add -no-integrated-as for clang
> 
> tools/testing/selftests/rseq/Makefile     |   8 +-
> tools/testing/selftests/rseq/rseq-arm.h   | 132 +++++++++++++--
> tools/testing/selftests/rseq/rseq-arm64.h |  74 ++++++++-
> tools/testing/selftests/rseq/rseq-mips.h  | 115 +++++++++++--
> tools/testing/selftests/rseq/rseq-ppc.h   |  90 +++++++++-
> tools/testing/selftests/rseq/rseq-s390.h  |  78 ++++++++-
> tools/testing/selftests/rseq/rseq-x86.h   | 264 +++++++++++++++++++++---------
> tools/testing/selftests/rseq/rseq.c       |  55 ++++++-
> tools/testing/selftests/rseq/rseq.h       |   1 +
> 9 files changed, 688 insertions(+), 129 deletions(-)
> 
> --
> 2.11.0

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: [PATCHv3 05/27] timerfd/timens: Take into account ns clock offsets
From: Andrei Vagin @ 2019-05-03  7:00 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dmitry Safonov, linux-kernel, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Dmitry Safonov, Eric W. Biederman,
	H. Peter Anvin, Ingo Molnar, Jeff Dike, Oleg Nesterov,
	Pavel Emelyanov, Shuah Khan, Vincenzo Frascino, containers, criu,
	linux-api, x86
In-Reply-To: <alpine.DEB.2.21.1904252207170.1768@nanos.tec.linutronix.de>

Hi Thomas,

Thank you for the review.  I read your comments. All of them look
reasonable. I'm sorry that you had to comment a lot. Will fix in the
next version.

Thanks,
Andrei

On Thu, Apr 25, 2019 at 11:28:24PM +0200, Thomas Gleixner wrote:
> On Thu, 25 Apr 2019, Dmitry Safonov wrote:
> > From: Andrei Vagin <avagin@gmail.com>
> > 
> > Make timerfd respect timens offsets.
> > Provide a helper timens_ktime_to_host() that is useful to wire up
> > timens to different kernel subsystems.
> 
> Yet another changelog which lacks meat.
> 
> > @@ -179,6 +180,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
> >  	htmode = (flags & TFD_TIMER_ABSTIME) ?
> >  		HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
> >  
> > +	htmode |= HRTIMER_MODE_NS;
> 
> Without looking further this time. My gut reaction is that this is
> wrong. Name space adjustment is only valid for absolute timers not for
> relative timers.
> 
> Aside of that the name sucks. MODE_NS is really not intuitive. It could be
> NanoSeconds or whatever and quite some time(r) functions have a _ns element
> already. Please look for something more intuitive and clearly related to
> namespaces. We are not short of letters.
> 
> >  	texp = timespec64_to_ktime(ktmr->it_value);
> >  	ctx->expired = 0;
> >  	ctx->ticks = 0;
> > @@ -197,9 +200,10 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
> >  
> >  	if (texp != 0) {
> >  		if (isalarm(ctx)) {
> > -			if (flags & TFD_TIMER_ABSTIME)
> > +			if (flags & TFD_TIMER_ABSTIME) {
> > +				texp = timens_ktime_to_host(clockid, texp);
> 
> You are not serious about that inline function here? It's huge and
> pointless bloat because the only time affected here is boot time, but the
> compiler does not know that.
> 
> >  				alarm_start(&ctx->t.alarm, texp);
> 
> Make that:
> 
>    alarm_start_namespace(.....)
> 
> and that does:
> 
> void alarm_start_namespace(struct alarm *alarm, ktime_t expires)
> {
> 	if (alarm->type == ALARM_BOOTTIME)
> 		expires = timens_sub_boottime(expires);
> 	alarm_start(alarm, expires);
> }
> 
> Hmm?
> 
> > -			else
> > +			} else
> >  				alarm_start_relative(&ctx->t.alarm, texp);
> >  		} else {
> >  			hrtimer_start(&ctx->t.tmr, texp, htmode);
> > diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
> > index 2e8957eac4d4..4b9c89c797ee 100644
> > --- a/include/linux/hrtimer.h
> > +++ b/include/linux/hrtimer.h
> > @@ -38,6 +38,7 @@ enum hrtimer_mode {
> >  	HRTIMER_MODE_REL	= 0x01,
> >  	HRTIMER_MODE_PINNED	= 0x02,
> >  	HRTIMER_MODE_SOFT	= 0x04,
> > +	HRTIMER_MODE_NS		= 0x08,
> >  
> >  	HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED,
> >  	HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED,
> > diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
> > index 5f0da6858b10..988414f7f791 100644
> > --- a/include/linux/time_namespace.h
> > +++ b/include/linux/time_namespace.h
> > @@ -56,6 +56,41 @@ static inline void timens_add_boottime(struct timespec64 *ts)
> >                  *ts = timespec64_add(*ts, ns_offsets->monotonic_boottime_offset);
> >  }
> >  
> > +static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
> > +{
> > +	struct timens_offsets *ns_offsets = current->nsproxy->time_ns->offsets;
> > +	struct timespec64 *offset;
> > +	ktime_t koff;
> > +
> > +	if (!ns_offsets)
> > +		return tim;
> > +
> > +	switch (clockid) {
> > +		case CLOCK_MONOTONIC:
> > +		case CLOCK_MONOTONIC_RAW:
> > +		case CLOCK_MONOTONIC_COARSE:
> 
> What's the point of COARSE and RAW? Neither of them can be used to arm
> timers.
> 
> > +			offset = &ns_offsets->monotonic_time_offset;
> > +			break;
> > +		case CLOCK_BOOTTIME:
> > +		case CLOCK_BOOTTIME_ALARM:
> > +			offset = &ns_offsets->monotonic_boottime_offset;
> > +			break;
> > +		default:
> > +			return tim;
> > +	}
> > +
> > +	koff = timespec64_to_ktime(*offset);
> 
> What about storing both the timespec and the ktime_t representation?
> 
> > +	if (tim < koff)
> > +		tim = 0;
> > +	else if (KTIME_MAX - tim < -koff)
> > +		tim = KTIME_MAX;
> 
> Blink!?! This is completely nonobvious and you're going to stare at it in
> disbelief half a year from now. Comments exist for a reason.
> 
> > +	else
> > +		tim = ktime_sub(tim, koff);
> > +
> > +	return tim;
> 
> This whole thing is way too large for inlining.
> 
> Please create a function which does the magic subtraction, something like
> ktime_sub_namespace_offset() and invoke it from the proper places, i.e. the
> alarmtimer one.
> 
> > @@ -1069,6 +1070,8 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
> >  
> >  	if (mode & HRTIMER_MODE_REL)
> >  		tim = ktime_add_safe(tim, base->get_time());
> > +	else if (mode & HRTIMER_MODE_NS)
> > +		tim = timens_ktime_to_host(base->clockid, tim);
> 
> You can do the same as for alarmtime above:
> 
> hrtimer_start_namespace(struct hrtimer *timer, ktime_t tim,
> 			const enum hrtimer_mode mode)
> {
> 	if (mode & HRTIMER_MODE_ABS) {
> 		switch(timer->base->clockid) {
> 		case CLOCK_MONOTONIC:
> 			tim = timens_sub_monotonic(tim);
> 			break;
> 		case CLOCK_BOOTTIME:
> 			tim = timens_sub_boottime(tim);
> 			break;
> 		}
> 	}
> 	hrtimer_start(timer, tim, mode);
> }
> 
> Thanks,
> 
> 	tglx

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: Andy Lutomirski @ 2019-05-03  0:34 UTC (permalink / raw)
  To: James Morris
  Cc: Matthew Garrett, LSM List, Linux Kernel Mailing List,
	David Howells, Linux API, Andy Lutomirski
In-Reply-To: <alpine.LRH.2.21.1905030901180.7491@namei.org>


> On May 2, 2019, at 4:19 PM, James Morris <jmorris@namei.org> wrote:
> 
>> On Thu, 2 May 2019, Matthew Garrett wrote:
>> 
>>> On Thu, May 2, 2019 at 2:07 PM James Morris <jmorris@namei.org> wrote:
>>> One possible direction is to (as previously mentioned) assign IDs to each
>>> callsite and be able to check this ID against a simple policy array
>>> (allow/deny).  The default policy choices could be reduced to 'all' or
>>> 'none' during kconfig, and allow a custom policy to be loaded later if
>>> desired.
>> 
>> Ok. My primary concern around this is that it's very difficult to use
>> correctly in anything other than the "all" or "none" modes. If a new
>> kernel feature is added with integrated lockdown support, if an admin
>> is simply setting the flags of things they wish to block then this
>> will be left enabled - and may violate the admin's expectations around
>> integrity. On the other hand, if an admin is simply setting the flags
>> of things they wish to permit, then adding lockdown support to an
>> existing kernel feature may result in that feature suddenly being
>> disabled, which may also violate the admin's expectations around the
>> flags providing a stable set of behaviour.
> 
> Understood. Most uses will likely be either a distro or an embedded 
> system, who I'm assuming would provide a useful policy by default, and 
> perhaps a high-level abstraction for modification.
> 
>> Given that, what would you prefer such a policy expression to look like?
> 
> Perhaps a write-once policy, injected from userspace during early boot?
> 
> The policy could be simply a list of:
> 
> lockdown_feature true|false
> 

I’m not convinced this is worthwhile.  As I see it, there really are only two privileges here: root can read kernel memory, and root can corrupt kernel state.  A policy that root can’t corrupt kernel memory except using, say, eBPF is useless — it gives warm fuzzy feelings but nothing else.

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: James Morris @ 2019-05-02 23:19 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: LSM List, Linux Kernel Mailing List, David Howells, Linux API,
	Andy Lutomirski
In-Reply-To: <CACdnJuusGU2DMXaPAjH3+QOcSj-9q6njbxxG-9s2PweDKognvw@mail.gmail.com>

On Thu, 2 May 2019, Matthew Garrett wrote:

> On Thu, May 2, 2019 at 2:07 PM James Morris <jmorris@namei.org> wrote:
> > One possible direction is to (as previously mentioned) assign IDs to each
> > callsite and be able to check this ID against a simple policy array
> > (allow/deny).  The default policy choices could be reduced to 'all' or
> > 'none' during kconfig, and allow a custom policy to be loaded later if
> > desired.
> 
> Ok. My primary concern around this is that it's very difficult to use
> correctly in anything other than the "all" or "none" modes. If a new
> kernel feature is added with integrated lockdown support, if an admin
> is simply setting the flags of things they wish to block then this
> will be left enabled - and may violate the admin's expectations around
> integrity. On the other hand, if an admin is simply setting the flags
> of things they wish to permit, then adding lockdown support to an
> existing kernel feature may result in that feature suddenly being
> disabled, which may also violate the admin's expectations around the
> flags providing a stable set of behaviour.

Understood. Most uses will likely be either a distro or an embedded 
system, who I'm assuming would provide a useful policy by default, and 
perhaps a high-level abstraction for modification.

> Given that, what would you prefer such a policy expression to look like?

Perhaps a write-once policy, injected from userspace during early boot?

The policy could be simply a list of:

lockdown_feature true|false


> 
> > Within the policy check hook, we could add a new LSM hook, which would
> > allow an LSM to restrictively override the lockdown policy with its own
> 
> Ok, that makes sense. If we take this approach, does there need to be
> a separate policy mechanism at all? Users who want fine-grained
> control would be able to set the behaviour to "None" and then use
> their choice of LSM to express more fine-grained control.

Right, and there could be a stackable LSM which just does fine-grained 
policy (per above).


> 
> > This doesn't really address the completeness / maintenance issue (i.e. "do
> > we have everything covered and how do we ensure this on an ongoing
> > basis?", and "what will this new lockdown feature break?"), although it
> > should make it easier to add new lockdown callsites as they don't have to
> > be enabled by the user.
> 
> I can start on this.

Cool!

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: Matthew Garrett @ 2019-05-02 21:15 UTC (permalink / raw)
  To: James Morris
  Cc: LSM List, Linux Kernel Mailing List, David Howells, Linux API,
	Andy Lutomirski
In-Reply-To: <alpine.LRH.2.21.1905030653480.32502@namei.org>

On Thu, May 2, 2019 at 2:07 PM James Morris <jmorris@namei.org> wrote:
> One possible direction is to (as previously mentioned) assign IDs to each
> callsite and be able to check this ID against a simple policy array
> (allow/deny).  The default policy choices could be reduced to 'all' or
> 'none' during kconfig, and allow a custom policy to be loaded later if
> desired.

Ok. My primary concern around this is that it's very difficult to use
correctly in anything other than the "all" or "none" modes. If a new
kernel feature is added with integrated lockdown support, if an admin
is simply setting the flags of things they wish to block then this
will be left enabled - and may violate the admin's expectations around
integrity. On the other hand, if an admin is simply setting the flags
of things they wish to permit, then adding lockdown support to an
existing kernel feature may result in that feature suddenly being
disabled, which may also violate the admin's expectations around the
flags providing a stable set of behaviour.

Given that, what would you prefer such a policy expression to look like?

> Within the policy check hook, we could add a new LSM hook, which would
> allow an LSM to restrictively override the lockdown policy with its own

Ok, that makes sense. If we take this approach, does there need to be
a separate policy mechanism at all? Users who want fine-grained
control would be able to set the behaviour to "None" and then use
their choice of LSM to express more fine-grained control.

> This doesn't really address the completeness / maintenance issue (i.e. "do
> we have everything covered and how do we ensure this on an ongoing
> basis?", and "what will this new lockdown feature break?"), although it
> should make it easier to add new lockdown callsites as they don't have to
> be enabled by the user.

I can start on this.

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: James Morris @ 2019-05-02 21:07 UTC (permalink / raw)
  To: Matthew Garrett
  Cc: LSM List, Linux Kernel Mailing List, David Howells, Linux API,
	Andy Lutomirski
In-Reply-To: <CACdnJus-+VTy0uOWg982SgZr55Lp7Xot653dJb_tO5T=J6D8nw@mail.gmail.com>

On Mon, 29 Apr 2019, Matthew Garrett wrote:

> Hi James,
> 
> What's the best way forward with this? I'm still not entirely clear on
> how it can be implemented purely as an LSM, but if you have ideas on
> what sort of implementation you'd prefer I'm happy to work on that.

It can't be implemented purely as an LSM.

The concerns I have are:

  o Mixing of mechanism and policy (they are hardcoded together)
  o Too-coarse policy (all or nothing, which will lead many to choose   
    nothing)
  o Lack of integration with LSM
  o Completeness
  o Maintenance (including adding new lockdowns without breaking existing 
    userspace)
  
One possible direction is to (as previously mentioned) assign IDs to each 
callsite and be able to check this ID against a simple policy array 
(allow/deny).  The default policy choices could be reduced to 'all' or 
'none' during kconfig, and allow a custom policy to be loaded later if 
desired.

Within the policy check hook, we could add a new LSM hook, which would 
allow an LSM to restrictively override the lockdown policy with its own 
(so e.g. SELinux could utilize the context of the current process to 
determine if a lockdown feature should be enforced).

This doesn't really address the completeness / maintenance issue (i.e. "do 
we have everything covered and how do we ensure this on an ongoing 
basis?", and "what will this new lockdown feature break?"), although it 
should make it easier to add new lockdown callsites as they don't have to 
be enabled by the user.

Thoughts?

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Yu-cheng Yu @ 2019-05-02 16:25 UTC (permalink / raw)
  To: Dave Martin
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190502161424.GQ3567@e103592.cambridge.arm.com>

On Thu, 2019-05-02 at 17:14 +0100, Dave Martin wrote:
> On Thu, May 02, 2019 at 08:47:06AM -0700, Yu-cheng Yu wrote:
> > On Thu, 2019-05-02 at 12:10 +0100, Dave Martin wrote:
> > > On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > > > An ELF file's .note.gnu.property indicates features the executable file
> > > > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > > > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > > > GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> > 
> > [...]
> > > A couple of questions before I look in more detail:
> > > 
> > > 1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
> > > the NT_GNU_PROPERTY_TYPE_0 note?  If so, we can avoid trying to parse
> > > irrelevant PT_NOTE segments.
> > 
> > Some older linkers can create multiples of NT_GNU_PROPERTY_TYPE_0.  The code
> > scans all PT_NOTE segments to ensure there is only one
> > NT_GNU_PROPERTY_TYPE_0. 
> > If there are multiples, then all are considered invalid.
> 
> I'm concerned that in the arm64 case we would waste some effort by
> scanning multiple notes.
> 
> Could we do something like iterating over the phdrs, and if we find
> exactly one PT_GNU_PROPERTY then use that, else fall back to scanning
> all PT_NOTEs?

That makes sense to me, but the concern is that we don't know whether the
PT_GNU_PROPERTY is the only one.  This probably needs to be discussed with more
people.

> > > 2) Are there standard types for things like the program property header?
> > > If not, can we add something in elf.h?  We should try to coordinate with
> > > libc on that.  Something like
> > > 
> > > typedef __u32 Elf_Word;
> > > 
> > > typedef struct {
> > > 	Elf_Word pr_type;
> > > 	Elf_Word pr_datasz;
> > > } Elf_Gnu_Prophdr;
> > > 
> > > (i.e., just the header part from [1], with a more specific name -- which
> > > I just made up).
> > 
> > Yes, I will fix that.
> > 
> > [...]
> > > 3) It looks like we have to go and re-parse all the notes for every
> > > property requested by the arch code.
> > 
> > As explained above, it is necessary to scan all PT_NOTE segments.  But there
> > should be only one NT_GNU_PROPERTY_TYPE_0 in an ELF file.  Once that is
> > found,
> > perhaps we can store it somewhere, or call into the arch code as you
> > mentioned
> > below.  I will look into that.
> 
> Just to get something working on arm64, I'm working on some hacks that
> move things around a bit -- I'll post when I have something.
> 
> Did you have any view on my other point, below?

That should work.  I will also make some changes for that.

> 
> Cheers
> ---Dave
> 
> > > For now there is only one property requested anyway, so this is probably
> > > not too bad.  But could we flip things around so that we have some
> > > CONFIG_ARCH_WANTS_ELF_GNU_PROPERTY (say), and have the ELF core code
> > > call into the arch backend for each property found?
> > > 
> > > The arch could provide some hook
> > > 
> > > 	int arch_elf_has_gnu_property(const Elf_Gnu_Prophdr *prop,
> > > 					const void *data);
> > > 
> > > to consume the properties as they are found.
> > > 
> > > This would effectively replace the arch_setup_property() hook you
> > > currently have.
> > > 
> > > Cheers
> > > ---Dave
> > > 
> > > [1] https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Dave Martin @ 2019-05-02 16:14 UTC (permalink / raw)
  To: Yu-cheng Yu
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <5b2c6cee345e00182e97842ae90c02cdcd830135.camel@intel.com>

On Thu, May 02, 2019 at 08:47:06AM -0700, Yu-cheng Yu wrote:
> On Thu, 2019-05-02 at 12:10 +0100, Dave Martin wrote:
> > On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > > An ELF file's .note.gnu.property indicates features the executable file
> > > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > > GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> 
> [...]
> > A couple of questions before I look in more detail:
> > 
> > 1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
> > the NT_GNU_PROPERTY_TYPE_0 note?  If so, we can avoid trying to parse
> > irrelevant PT_NOTE segments.
> 
> Some older linkers can create multiples of NT_GNU_PROPERTY_TYPE_0.  The code
> scans all PT_NOTE segments to ensure there is only one NT_GNU_PROPERTY_TYPE_0. 
> If there are multiples, then all are considered invalid.

I'm concerned that in the arm64 case we would waste some effort by
scanning multiple notes.

Could we do something like iterating over the phdrs, and if we find
exactly one PT_GNU_PROPERTY then use that, else fall back to scanning
all PT_NOTEs?

> > 2) Are there standard types for things like the program property header?
> > If not, can we add something in elf.h?  We should try to coordinate with
> > libc on that.  Something like
> > 
> > typedef __u32 Elf_Word;
> > 
> > typedef struct {
> > 	Elf_Word pr_type;
> > 	Elf_Word pr_datasz;
> > } Elf_Gnu_Prophdr;
> > 
> > (i.e., just the header part from [1], with a more specific name -- which
> > I just made up).
> 
> Yes, I will fix that.
> 
> [...]
> > 3) It looks like we have to go and re-parse all the notes for every
> > property requested by the arch code.
> 
> As explained above, it is necessary to scan all PT_NOTE segments.  But there
> should be only one NT_GNU_PROPERTY_TYPE_0 in an ELF file.  Once that is found,
> perhaps we can store it somewhere, or call into the arch code as you mentioned
> below.  I will look into that.

Just to get something working on arm64, I'm working on some hacks that
move things around a bit -- I'll post when I have something.

Did you have any view on my other point, below?

Cheers
---Dave

> > For now there is only one property requested anyway, so this is probably
> > not too bad.  But could we flip things around so that we have some
> > CONFIG_ARCH_WANTS_ELF_GNU_PROPERTY (say), and have the ELF core code
> > call into the arch backend for each property found?
> > 
> > The arch could provide some hook
> > 
> > 	int arch_elf_has_gnu_property(const Elf_Gnu_Prophdr *prop,
> > 					const void *data);
> > 
> > to consume the properties as they are found.
> > 
> > This would effectively replace the arch_setup_property() hook you
> > currently have.
> > 
> > Cheers
> > ---Dave
> > 
> > [1] https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI
> 

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Yu-cheng Yu @ 2019-05-02 15:48 UTC (permalink / raw)
  To: Dave Martin
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190502142951.GP3567@e103592.cambridge.arm.com>

On Thu, 2019-05-02 at 15:29 +0100, Dave Martin wrote:
> On Thu, May 02, 2019 at 12:10:04PM +0100, Dave Martin wrote:
> > On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> 
> [...]
> 
> > > > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > > > index 7d09d125f148..40aa4a4fd64d 100644
> > > --- a/fs/binfmt_elf.c
> > > +++ b/fs/binfmt_elf.c
> > > @@ -1076,6 +1076,19 @@ static int load_elf_binary(struct linux_binprm
> > > *bprm)
> > >  		goto out_free_dentry;
> > >  	}
> > >  
> > > +	if (interpreter) {
> > > +		retval = arch_setup_property(&loc->interp_elf_ex,
> > > +					     interp_elf_phdata,
> > > +					     interpreter, true);
> > > +	} else {
> > > +		retval = arch_setup_property(&loc->elf_ex,
> > > +					     elf_phdata,
> > > +					     bprm->file, false);
> > > +	}
> 
> This will be too late for arm64, since we need to twiddle the mmap prot
> flags for the executable's pages based on the detected properties.
> 
> Can we instead move this much earlier, letting the arch code stash
> something in arch_state that can be consumed later on?
> 
> This also has the advantage that we can report errors to the execve()
> caller before passing the point of no return (i.e., flush_old_exec()).

I will look into that.

> 
> [...]
> 
> > > diff --git a/fs/gnu_property.c b/fs/gnu_property.c
> 
> [...]
> 
> > > +int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
> > > +		     u32 pr_type, u32 *property)
> > > +{
> > > +	struct elf64_hdr *ehdr64 = ehdr_p;
> > > +	int err = 0;
> > > +
> > > +	*property = 0;
> > > +
> > > +	if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> > > +		struct elf64_phdr *phdr64 = phdr_p;
> > > +
> > > +		err = scan_segments_64(f, phdr64, ehdr64->e_phnum,
> > > +				       pr_type, property);
> > > +		if (err < 0)
> > > +			goto out;
> > > +	} else {
> > > +#ifdef CONFIG_COMPAT
> > > +		struct elf32_hdr *ehdr32 = ehdr_p;
> > > +
> > > +		if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> > > +			struct elf32_phdr *phdr32 = phdr_p;
> > > +
> > > +			err = scan_segments_32(f, phdr32, ehdr32->e_phnum,
> > > +					       pr_type, property);
> > > +			if (err < 0)
> > > +				goto out;
> > > +		}
> > > +#else
> > > +	WARN_ONCE(1, "Exec of 32-bit app, but CONFIG_COMPAT is not enabled.\n");
> > > +	return -ENOTSUPP;
> > > +#endif
> > > +	}
> 
> We have already made a ton of assumptions about the ELF class by this
> point, and we don't seem to check it explicitly elsewhere, so it is a
> bit weird to police it specifically here.
> 
> Can we simply pass the assumed ELF class as a parameter instead?

Yes.

Yu-cheng

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Yu-cheng Yu @ 2019-05-02 15:47 UTC (permalink / raw)
  To: Dave Martin
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190502111003.GO3567@e103592.cambridge.arm.com>

On Thu, 2019-05-02 at 12:10 +0100, Dave Martin wrote:
> On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > An ELF file's .note.gnu.property indicates features the executable file
> > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > GNU_PROPERTY_X86_FEATURE_1_SHSTK.

[...]
> A couple of questions before I look in more detail:
> 
> 1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
> the NT_GNU_PROPERTY_TYPE_0 note?  If so, we can avoid trying to parse
> irrelevant PT_NOTE segments.

Some older linkers can create multiples of NT_GNU_PROPERTY_TYPE_0.  The code
scans all PT_NOTE segments to ensure there is only one NT_GNU_PROPERTY_TYPE_0. 
If there are multiples, then all are considered invalid.

> 
> 
> 2) Are there standard types for things like the program property header?
> If not, can we add something in elf.h?  We should try to coordinate with
> libc on that.  Something like
> 
> typedef __u32 Elf_Word;
> 
> typedef struct {
> 	Elf_Word pr_type;
> 	Elf_Word pr_datasz;
> } Elf_Gnu_Prophdr;
> 
> (i.e., just the header part from [1], with a more specific name -- which
> I just made up).

Yes, I will fix that.

[...]
> 3) It looks like we have to go and re-parse all the notes for every
> property requested by the arch code.

As explained above, it is necessary to scan all PT_NOTE segments.  But there
should be only one NT_GNU_PROPERTY_TYPE_0 in an ELF file.  Once that is found,
perhaps we can store it somewhere, or call into the arch code as you mentioned
below.  I will look into that.

> 
> For now there is only one property requested anyway, so this is probably
> not too bad.  But could we flip things around so that we have some
> CONFIG_ARCH_WANTS_ELF_GNU_PROPERTY (say), and have the ELF core code
> call into the arch backend for each property found?
> 
> The arch could provide some hook
> 
> 	int arch_elf_has_gnu_property(const Elf_Gnu_Prophdr *prop,
> 					const void *data);
> 
> to consume the properties as they are found.
> 
> This would effectively replace the arch_setup_property() hook you
> currently have.
> 
> Cheers
> ---Dave
> 
> [1] https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Dave Martin @ 2019-05-02 14:29 UTC (permalink / raw)
  To: Yu-cheng Yu
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190502111003.GO3567@e103592.cambridge.arm.com>

On Thu, May 02, 2019 at 12:10:04PM +0100, Dave Martin wrote:
> On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> > An ELF file's .note.gnu.property indicates features the executable file
> > can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> > indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> > GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> > 
> > This patch was part of the Control-flow Enforcement series; the original
> > patch is here: https://lkml.org/lkml/2018/11/20/205.  Dave Martin responded
> > that ARM recently introduced new features to NT_GNU_PROPERTY_TYPE_0 with
> > properties closely modelled on GNU_PROPERTY_X86_FEATURE_1_AND, and it is
> > logical to split out the generic part.  Here it is.
> > 
> > With this patch, if an arch needs to setup features from ELF properties,
> > it needs CONFIG_ARCH_USE_GNU_PROPERTY to be set, and a specific
> > arch_setup_property().
> > 
> > For example, for X86_64:
> > 
> > int arch_setup_property(void *ehdr, void *phdr, struct file *f, bool inter)
> > {
> > 	int r;
> > 	uint32_t property;
> > 
> > 	r = get_gnu_property(ehdr, phdr, f, GNU_PROPERTY_X86_FEATURE_1_AND,
> > 			     &property);
> > 	...
> > }

[...]

> > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > index 7d09d125f148..40aa4a4fd64d 100644
> > --- a/fs/binfmt_elf.c
> > +++ b/fs/binfmt_elf.c
> > @@ -1076,6 +1076,19 @@ static int load_elf_binary(struct linux_binprm *bprm)
> >  		goto out_free_dentry;
> >  	}
> >  
> > +	if (interpreter) {
> > +		retval = arch_setup_property(&loc->interp_elf_ex,
> > +					     interp_elf_phdata,
> > +					     interpreter, true);
> > +	} else {
> > +		retval = arch_setup_property(&loc->elf_ex,
> > +					     elf_phdata,
> > +					     bprm->file, false);
> > +	}

This will be too late for arm64, since we need to twiddle the mmap prot
flags for the executable's pages based on the detected properties.

Can we instead move this much earlier, letting the arch code stash
something in arch_state that can be consumed later on?

This also has the advantage that we can report errors to the execve()
caller before passing the point of no return (i.e., flush_old_exec()).

[...]

> > diff --git a/fs/gnu_property.c b/fs/gnu_property.c

[...]

> > +int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
> > +		     u32 pr_type, u32 *property)
> > +{
> > +	struct elf64_hdr *ehdr64 = ehdr_p;
> > +	int err = 0;
> > +
> > +	*property = 0;
> > +
> > +	if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> > +		struct elf64_phdr *phdr64 = phdr_p;
> > +
> > +		err = scan_segments_64(f, phdr64, ehdr64->e_phnum,
> > +				       pr_type, property);
> > +		if (err < 0)
> > +			goto out;
> > +	} else {
> > +#ifdef CONFIG_COMPAT
> > +		struct elf32_hdr *ehdr32 = ehdr_p;
> > +
> > +		if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> > +			struct elf32_phdr *phdr32 = phdr_p;
> > +
> > +			err = scan_segments_32(f, phdr32, ehdr32->e_phnum,
> > +					       pr_type, property);
> > +			if (err < 0)
> > +				goto out;
> > +		}
> > +#else
> > +	WARN_ONCE(1, "Exec of 32-bit app, but CONFIG_COMPAT is not enabled.\n");
> > +	return -ENOTSUPP;
> > +#endif
> > +	}

We have already made a ton of assumptions about the ELF class by this
point, and we don't seem to check it explicitly elsewhere, so it is a
bit weird to police it specifically here.

Can we simply pass the assumed ELF class as a parameter instead?

[...]

Cheers
---Dave

^ permalink raw reply

* Re: [PATCH] binfmt_elf: Extract .note.gnu.property from an ELF file
From: Dave Martin @ 2019-05-02 11:10 UTC (permalink / raw)
  To: Yu-cheng Yu
  Cc: x86, H. Peter Anvin, Thomas Gleixner, Ingo Molnar, linux-kernel,
	linux-doc, linux-mm, linux-arch, linux-api, Arnd Bergmann,
	Andy Lutomirski, Balbir Singh, Cyrill Gorcunov, Dave Hansen,
	Eugene Syromiatnikov, Florian Weimer, H.J. Lu, Jann Horn,
	Jonathan Corbet, Kees Cook, Mike Kravetz, Nadav Amit,
	Oleg Nesterov, Pa
In-Reply-To: <20190501211217.5039-1-yu-cheng.yu@intel.com>

On Wed, May 01, 2019 at 02:12:17PM -0700, Yu-cheng Yu wrote:
> An ELF file's .note.gnu.property indicates features the executable file
> can support.  For example, the property GNU_PROPERTY_X86_FEATURE_1_AND
> indicates the file supports GNU_PROPERTY_X86_FEATURE_1_IBT and/or
> GNU_PROPERTY_X86_FEATURE_1_SHSTK.
> 
> This patch was part of the Control-flow Enforcement series; the original
> patch is here: https://lkml.org/lkml/2018/11/20/205.  Dave Martin responded
> that ARM recently introduced new features to NT_GNU_PROPERTY_TYPE_0 with
> properties closely modelled on GNU_PROPERTY_X86_FEATURE_1_AND, and it is
> logical to split out the generic part.  Here it is.
> 
> With this patch, if an arch needs to setup features from ELF properties,
> it needs CONFIG_ARCH_USE_GNU_PROPERTY to be set, and a specific
> arch_setup_property().
> 
> For example, for X86_64:
> 
> int arch_setup_property(void *ehdr, void *phdr, struct file *f, bool inter)
> {
> 	int r;
> 	uint32_t property;
> 
> 	r = get_gnu_property(ehdr, phdr, f, GNU_PROPERTY_X86_FEATURE_1_AND,
> 			     &property);
> 	...
> }

Thanks, this is timely for me.  I should be able to build the needed
arm64 support pretty quickly around this now.

[Cc'ing libc-alpha for the elf.h question -- see (2)]


A couple of questions before I look in more detail:

1) Can we rely on PT_GNU_PROPERTY being present in the phdrs to describe
the NT_GNU_PROPERTY_TYPE_0 note?  If so, we can avoid trying to parse
irrelevant PT_NOTE segments.


2) Are there standard types for things like the program property header?
If not, can we add something in elf.h?  We should try to coordinate with
libc on that.  Something like

typedef __u32 Elf_Word;

typedef struct {
	Elf_Word pr_type;
	Elf_Word pr_datasz;
} Elf_Gnu_Prophdr;

(i.e., just the header part from [1], with a more specific name -- which
I just made up).


Given the fragmented nature and draft status of the specs -- and
differing opinions about the sizes and alignments of certain things --
it could be useful to have this explicitly in the kernel.  Some
documentation as to _precisely_ what we accept may also be a good idea.


3) It looks like we have to go and re-parse all the notes for every
property requested by the arch code.

For now there is only one property requested anyway, so this is probably
not too bad.  But could we flip things around so that we have some
CONFIG_ARCH_WANTS_ELF_GNU_PROPERTY (say), and have the ELF core code
call into the arch backend for each property found?

The arch could provide some hook

	int arch_elf_has_gnu_property(const Elf_Gnu_Prophdr *prop,
					const void *data);

to consume the properties as they are found.

This would effectively replace the arch_setup_property() hook you
currently have.

Cheers
---Dave

[1] https://github.com/hjl-tools/linux-abi/wiki/Linux-Extensions-to-gABI

> 
> Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
> ---
>  fs/Kconfig.binfmt        |   4 +
>  fs/Makefile              |   1 +
>  fs/binfmt_elf.c          |  13 ++
>  fs/gnu_property.c        | 363 +++++++++++++++++++++++++++++++++++++++
>  include/linux/elf.h      |  12 ++
>  include/uapi/linux/elf.h |   8 +
>  6 files changed, 401 insertions(+)
>  create mode 100644 fs/gnu_property.c
> 
> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
> index b795f8da81f3..175a1f58e785 100644
> --- a/fs/Kconfig.binfmt
> +++ b/fs/Kconfig.binfmt
> @@ -35,6 +35,10 @@ config COMPAT_BINFMT_ELF
>  config ARCH_BINFMT_ELF_STATE
>  	bool
>  
> +config ARCH_USE_GNU_PROPERTY
> +	bool
> +	depends on 64BIT
> +
>  config BINFMT_ELF_FDPIC
>  	bool "Kernel support for FDPIC ELF binaries"
>  	default y if !BINFMT_ELF
> diff --git a/fs/Makefile b/fs/Makefile
> index 427fec226fae..8a35abbebf8b 100644
> --- a/fs/Makefile
> +++ b/fs/Makefile
> @@ -44,6 +44,7 @@ obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
>  obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
>  obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
>  obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
> +obj-$(CONFIG_ARCH_USE_GNU_PROPERTY) += gnu_property.o
>  
>  obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
>  obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 7d09d125f148..40aa4a4fd64d 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -1076,6 +1076,19 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  		goto out_free_dentry;
>  	}
>  
> +	if (interpreter) {
> +		retval = arch_setup_property(&loc->interp_elf_ex,
> +					     interp_elf_phdata,
> +					     interpreter, true);
> +	} else {
> +		retval = arch_setup_property(&loc->elf_ex,
> +					     elf_phdata,
> +					     bprm->file, false);
> +	}
> +
> +	if (retval < 0)
> +		goto out_free_dentry;
> +
>  	if (elf_interpreter) {
>  		unsigned long interp_map_addr = 0;
>  
> diff --git a/fs/gnu_property.c b/fs/gnu_property.c
> new file mode 100644
> index 000000000000..656ea3951840
> --- /dev/null
> +++ b/fs/gnu_property.c
> @@ -0,0 +1,363 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Extract an ELF file's .note.gnu.property.
> + *
> + * The path from the ELF header to the note section is the following:
> + * elfhdr->elf_phdr->elf_note->property[].
> + */
> +
> +#include <uapi/linux/elf-em.h>
> +#include <linux/processor.h>
> +#include <linux/binfmts.h>
> +#include <linux/elf.h>
> +#include <linux/slab.h>
> +#include <linux/fs.h>
> +#include <linux/uaccess.h>
> +#include <linux/string.h>
> +#include <linux/compat.h>
> +
> +/*
> + * The .note.gnu.property layout:
> + *
> + *	struct elf_note {
> + *		u32 n_namesz; --> sizeof(n_name[]); always (4)
> + *		u32 n_descsz; --> sizeof(property[])
> + *		u32 n_type;   --> always NT_GNU_PROPERTY_TYPE_0
> + *	};
> + *	char n_name[4]; --> always 'GNU\0'
> + *
> + *	struct {
> + *		struct gnu_property {
> + *			u32 pr_type;
> + *			u32 pr_datasz;
> + *		};
> + *		u8 pr_data[pr_datasz];
> + *	}[];
> + */
> +
> +#define BUF_SIZE (PAGE_SIZE / 4)
> +
> +struct gnu_property {
> +	u32 pr_type;
> +	u32 pr_datasz;
> +};
> +
> +typedef bool (test_item_fn)(void *buf, u32 *arg, u32 type);
> +typedef void *(next_item_fn)(void *buf, u32 *arg, u32 type);
> +
> +static inline bool test_note_type(void *buf, u32 *align, u32 note_type)
> +{
> +	struct elf_note *n = buf;
> +
> +	return ((n->n_type == note_type) && (n->n_namesz == 4) &&
> +		(memcmp(n + 1, "GNU", 4) == 0));
> +}
> +
> +static inline void *next_note(void *buf, u32 *align, u32 note_type)
> +{
> +	struct elf_note *n = buf;
> +	u64 size;
> +
> +	if (check_add_overflow((u64)sizeof(*n), (u64)n->n_namesz, &size))
> +		return NULL;
> +
> +	size = round_up(size, *align);
> +
> +	if (check_add_overflow(size, (u64)n->n_descsz, &size))
> +		return NULL;
> +
> +	size = round_up(size, *align);
> +
> +	if (buf + size < buf)
> +		return NULL;
> +	else
> +		return (buf + size);
> +}
> +
> +static inline bool test_property(void *buf, u32 *max_type, u32 pr_type)
> +{
> +	struct gnu_property *pr = buf;
> +
> +	/*
> +	 * Property types must be in ascending order.
> +	 * Keep track of the max when testing each.
> +	 */
> +	if (pr->pr_type > *max_type)
> +		*max_type = pr->pr_type;
> +
> +	return (pr->pr_type == pr_type);
> +}
> +
> +static inline void *next_property(void *buf, u32 *max_type, u32 pr_type)
> +{
> +	struct gnu_property *pr = buf;
> +
> +	if ((buf + sizeof(*pr) +  pr->pr_datasz < buf) ||
> +	    (pr->pr_type > pr_type) ||
> +	    (pr->pr_type > *max_type))
> +		return NULL;
> +	else
> +		return (buf + sizeof(*pr) + pr->pr_datasz);
> +}
> +
> +/*
> + * Scan 'buf' for a pattern; return true if found.
> + * *pos is the distance from the beginning of buf to where
> + * the searched item or the next item is located.
> + */
> +static int scan(u8 *buf, u32 buf_size, int item_size, test_item_fn test_item,
> +		next_item_fn next_item, u32 *arg, u32 type, u32 *pos)
> +{
> +	int found = 0;
> +	u8 *p, *max;
> +
> +	max = buf + buf_size;
> +	if (max < buf)
> +		return 0;
> +
> +	p = buf;
> +
> +	while ((p + item_size < max) && (p + item_size > buf)) {
> +		if (test_item(p, arg, type)) {
> +			found = 1;
> +			break;
> +		}
> +
> +		p = next_item(p, arg, type);
> +	}
> +
> +	*pos = (p + item_size <= buf) ? 0 : (u32)(p - buf);
> +	return found;
> +}
> +
> +/*
> + * Search an NT_GNU_PROPERTY_TYPE_0 for the property of 'pr_type'.
> + */
> +static int find_property(struct file *file, unsigned long desc_size,
> +			 loff_t file_offset, u8 *buf,
> +			 u32 pr_type, u32 *property)
> +{
> +	u32 buf_pos;
> +	unsigned long read_size;
> +	unsigned long done;
> +	int found = 0;
> +	int ret = 0;
> +	u32 last_pr = 0;
> +
> +	*property = 0;
> +	buf_pos = 0;
> +
> +	for (done = 0; done < desc_size; done += buf_pos) {
> +		read_size = desc_size - done;
> +		if (read_size > BUF_SIZE)
> +			read_size = BUF_SIZE;
> +
> +		ret = kernel_read(file, buf, read_size, &file_offset);
> +
> +		if (ret != read_size)
> +			return (ret < 0) ? ret : -EIO;
> +
> +		ret = 0;
> +		found = scan(buf, read_size, sizeof(struct gnu_property),
> +			     test_property, next_property,
> +			     &last_pr, pr_type, &buf_pos);
> +
> +		if ((!buf_pos) || found)
> +			break;
> +
> +		file_offset += buf_pos - read_size;
> +	}
> +
> +	if (found) {
> +		struct gnu_property *pr =
> +			(struct gnu_property *)(buf + buf_pos);
> +
> +		if (pr->pr_datasz == 4) {
> +			u32 *max =  (u32 *)(buf + read_size);
> +			u32 *data = (u32 *)((u8 *)pr + sizeof(*pr));
> +
> +			if (data + 1 <= max) {
> +				*property = *data;
> +			} else {
> +				file_offset += buf_pos - read_size;
> +				file_offset += sizeof(*pr);
> +				ret = kernel_read(file, property, 4,
> +						  &file_offset);
> +			}
> +		}
> +	}
> +
> +	return ret;
> +}
> +
> +/*
> + * Search a PT_NOTE segment for NT_GNU_PROPERTY_TYPE_0.
> + */
> +static int find_note_type_0(struct file *file, loff_t file_offset,
> +			    unsigned long note_size, u32 align,
> +			    u32 pr_type, u32 *property)
> +{
> +	u8 *buf;
> +	u32 buf_pos;
> +	unsigned long read_size;
> +	unsigned long done;
> +	int found = 0;
> +	int ret = 0;
> +
> +	buf = kmalloc(BUF_SIZE, GFP_KERNEL);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	*property = 0;
> +	buf_pos = 0;
> +
> +	for (done = 0; done < note_size; done += buf_pos) {
> +		read_size = note_size - done;
> +		if (read_size > BUF_SIZE)
> +			read_size = BUF_SIZE;
> +
> +		ret = kernel_read(file, buf, read_size, &file_offset);
> +
> +		if (ret != read_size) {
> +			ret = (ret < 0) ? ret : -EIO;
> +			kfree(buf);
> +			return ret;
> +		}
> +
> +		/*
> +		 * item_size = sizeof(struct elf_note) + elf_note.n_namesz.
> +		 * n_namesz is 4 for the note type we look for.
> +		 */
> +		ret = scan(buf, read_size, sizeof(struct elf_note) + 4,
> +			      test_note_type, next_note,
> +			      &align, NT_GNU_PROPERTY_TYPE_0, &buf_pos);
> +
> +		file_offset += buf_pos - read_size;
> +
> +		if (ret && !found) {
> +			struct elf_note *n =
> +				(struct elf_note *)(buf + buf_pos);
> +			u64 start = round_up(sizeof(*n) + n->n_namesz, align);
> +			u64 total = 0;
> +
> +			if (check_add_overflow(start, (u64)n->n_descsz, &total)) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +			total = round_up(total, align);
> +
> +			ret = find_property(file, n->n_descsz,
> +					    file_offset + start,
> +					    buf, pr_type, property);
> +			found++;
> +			file_offset += total;
> +			buf_pos += total;
> +		} else if (!buf_pos || ret) {
> +			ret = 0;
> +			*property = 0;
> +			break;
> +		}
> +	}
> +
> +	kfree(buf);
> +	return ret;
> +}
> +
> +/*
> + * Look at an ELF file's PT_NOTE segments, then NT_GNU_PROPERTY_TYPE_0, then
> + * the property of pr_type.
> + *
> + * Input:
> + *	file: the file to search;
> + *	phdr: the file's ELF program header table;
> + *	phnum: number of entries in phdr;
> + *	pr_type: the property type.
> + *
> + * Output:
> + *	The property found.
> + *
> + * Return:
> + *	Zero or error.
> + */
> +static int scan_segments_64(struct file *file, struct elf64_phdr *phdr,
> +			    int phnum, u32 pr_type, u32 *property)
> +{
> +	int i;
> +	int err = 0;
> +
> +	for (i = 0; i < phnum; i++, phdr++) {
> +		if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 8))
> +			continue;
> +
> +		/*
> +		 * Search the PT_NOTE segment for NT_GNU_PROPERTY_TYPE_0.
> +		 */
> +		err = find_note_type_0(file, phdr->p_offset, phdr->p_filesz,
> +				       phdr->p_align, pr_type, property);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static int scan_segments_32(struct file *file, struct elf32_phdr *phdr,
> +			    int phnum, u32 pr_type, u32 *property)
> +{
> +	int i;
> +	int err = 0;
> +
> +	for (i = 0; i < phnum; i++, phdr++) {
> +		if ((phdr->p_type != PT_NOTE) || (phdr->p_align != 4))
> +			continue;
> +
> +		/*
> +		 * Search the PT_NOTE segment for NT_GNU_PROPERTY_TYPE_0.
> +		 */
> +		err = find_note_type_0(file, phdr->p_offset, phdr->p_filesz,
> +				       phdr->p_align, pr_type, property);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +#endif
> +
> +int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
> +		     u32 pr_type, u32 *property)
> +{
> +	struct elf64_hdr *ehdr64 = ehdr_p;
> +	int err = 0;
> +
> +	*property = 0;
> +
> +	if (ehdr64->e_ident[EI_CLASS] == ELFCLASS64) {
> +		struct elf64_phdr *phdr64 = phdr_p;
> +
> +		err = scan_segments_64(f, phdr64, ehdr64->e_phnum,
> +				       pr_type, property);
> +		if (err < 0)
> +			goto out;
> +	} else {
> +#ifdef CONFIG_COMPAT
> +		struct elf32_hdr *ehdr32 = ehdr_p;
> +
> +		if (ehdr32->e_ident[EI_CLASS] == ELFCLASS32) {
> +			struct elf32_phdr *phdr32 = phdr_p;
> +
> +			err = scan_segments_32(f, phdr32, ehdr32->e_phnum,
> +					       pr_type, property);
> +			if (err < 0)
> +				goto out;
> +		}
> +#else
> +	WARN_ONCE(1, "Exec of 32-bit app, but CONFIG_COMPAT is not enabled.\n");
> +	return -ENOTSUPP;
> +#endif
> +	}
> +
> +out:
> +	return err;
> +}
> diff --git a/include/linux/elf.h b/include/linux/elf.h
> index e3649b3e970e..c15febebe7f2 100644
> --- a/include/linux/elf.h
> +++ b/include/linux/elf.h
> @@ -56,4 +56,16 @@ static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) {
>  extern int elf_coredump_extra_notes_size(void);
>  extern int elf_coredump_extra_notes_write(struct coredump_params *cprm);
>  #endif
> +
> +#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
> +extern int arch_setup_property(void *ehdr, void *phdr, struct file *f,
> +			       bool interp);
> +extern int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
> +			    u32 pr_type, u32 *feature);
> +#else
> +static inline int arch_setup_property(void *ehdr, void *phdr, struct file *f,
> +				      bool interp) { return 0; }
> +static inline int get_gnu_property(void *ehdr_p, void *phdr_p, struct file *f,
> +				   u32 pr_type, u32 *feature) { return 0; }
> +#endif
>  #endif /* _LINUX_ELF_H */
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index 34c02e4290fe..7b7603a44cbc 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -372,6 +372,7 @@ typedef struct elf64_shdr {
>  #define NT_PRFPREG	2
>  #define NT_PRPSINFO	3
>  #define NT_TASKSTRUCT	4
> +#define NT_GNU_PROPERTY_TYPE_0 5
>  #define NT_AUXV		6
>  /*
>   * Note to userspace developers: size of NT_SIGINFO note may increase
> @@ -443,4 +444,11 @@ typedef struct elf64_note {
>    Elf64_Word n_type;	/* Content type */
>  } Elf64_Nhdr;
>  
> +/* .note.gnu.property types */
> +#define GNU_PROPERTY_X86_FEATURE_1_AND		(0xc0000002)
> +
> +/* Bits of GNU_PROPERTY_X86_FEATURE_1_AND */
> +#define GNU_PROPERTY_X86_FEATURE_1_IBT		(0x00000001)
> +#define GNU_PROPERTY_X86_FEATURE_1_SHSTK	(0x00000002)
> +
>  #endif /* _UAPI_LINUX_ELF_H */
> -- 
> 2.17.1
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox