* [PATCH v2 2/4] clone3: switch to copy_struct_from_user()
From: Aleksa Sarai @ 2019-09-25 23:03 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-1-cyphar@cyphar.com>
The change is very straightforward, and helps unify the syscall
interface for struct-from-userspace syscalls. Additionally, explicitly
define CLONE_ARGS_SIZE_VER0 to match the other users of the
struct-extension pattern.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
include/uapi/linux/sched.h | 2 ++
kernel/fork.c | 34 +++++++---------------------------
2 files changed, 9 insertions(+), 27 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..0945805982b4 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,6 +47,8 @@ struct clone_args {
__aligned_u64 tls;
};
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+
/*
* Scheduling policies
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 541fd805fb88..a86e3841ee4e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2530,39 +2530,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#ifdef __ARCH_WANT_SYS_CLONE3
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
- size_t size)
+ size_t usize)
{
+ int err;
struct clone_args args;
- if (unlikely(size > PAGE_SIZE))
+ if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
-
- if (unlikely(size < sizeof(struct clone_args)))
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
return -EINVAL;
- if (unlikely(!access_ok(uargs, size)))
- return -EFAULT;
-
- if (size > sizeof(struct clone_args)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uargs + sizeof(struct clone_args);
- end = (void __user *)uargs + size;
-
- for (; addr < end; addr++) {
- if (get_user(val, addr))
- return -EFAULT;
- if (val)
- return -E2BIG;
- }
-
- size = sizeof(struct clone_args);
- }
-
- if (copy_from_user(&args, uargs, size))
- return -EFAULT;
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (err)
+ return err;
/*
* Verify that higher 32bits of exit_signal are unset and that
--
2.23.0
^ permalink raw reply related
* [PATCH v2 3/4] sched_setattr: switch to copy_struct_from_user()
From: Aleksa Sarai @ 2019-09-25 23:03 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-1-cyphar@cyphar.com>
The change is very straightforward, and helps unify the syscall
interface for struct-from-userspace syscalls. Ideally we could also
unify sched_getattr(2)-style syscalls as well, but unfortunately the
correct semantics for such syscalls are much less clear (see [1] for
more detail). In future we could come up with a more sane idea for how
the syscall interface should look.
[1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
robustify sched_read_attr() ABI logic and code")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
kernel/sched/core.c | 43 +++++++------------------------------------
1 file changed, 7 insertions(+), 36 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index df9f1fe5689b..cdb2f5e29b88 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4900,9 +4900,6 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
u32 size;
int ret;
- if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
- return -EFAULT;
-
/* Zero the full structure, so that a short copy will be nice: */
memset(attr, 0, sizeof(*attr));
@@ -4910,45 +4907,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return ret;
- /* Bail out on silly large: */
- if (size > PAGE_SIZE)
- goto err_size;
-
/* ABI compatibility quirk: */
if (!size)
size = SCHED_ATTR_SIZE_VER0;
-
- if (size < SCHED_ATTR_SIZE_VER0)
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
size < SCHED_ATTR_SIZE_VER1)
return -EINVAL;
@@ -5148,7 +5119,7 @@ sched_attr_copy_to_user(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
+ * @usize: sizeof(attr) for fwd/bwd comp.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
--
2.23.0
^ permalink raw reply related
* [PATCH v2 4/4] perf_event_open: switch to copy_struct_from_user()
From: Aleksa Sarai @ 2019-09-25 23:03 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-1-cyphar@cyphar.com>
The change is very straightforward, and helps unify the syscall
interface for struct-from-userspace syscalls.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
kernel/events/core.c | 47 +++++++++-----------------------------------
1 file changed, 9 insertions(+), 38 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0463c1151bae..038ed126bc1b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10498,55 +10498,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
u32 size;
int ret;
- if (!access_ok(uattr, PERF_ATTR_SIZE_VER0))
- return -EFAULT;
-
- /*
- * zero the full structure, so that a short copy will be nice.
- */
+ /* Zero the full structure, so that a short copy will be nice. */
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
- if (size > PAGE_SIZE) /* silly large */
- goto err_size;
-
- if (!size) /* abi compat */
+ /* ABI compatibility quirk: */
+ if (!size)
size = PERF_ATTR_SIZE_VER0;
-
- if (size < PERF_ATTR_SIZE_VER0)
+ if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
goto err_size;
- /*
- * If we're handed a bigger struct than we know of,
- * ensure all the unknown bits are 0 - i.e. new
- * user-space does not rely on any kernel feature
- * extensions we dont know about yet.
- */
- if (size > sizeof(*attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(*attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- ret = get_user(val, addr);
- if (ret)
- return ret;
- if (val)
- goto err_size;
- }
- size = sizeof(*attr);
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
}
- ret = copy_from_user(attr, uattr, size);
- if (ret)
- return -EFAULT;
-
attr->size = size;
if (attr->__reserved_1)
--
2.23.0
^ permalink raw reply related
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: Aleksa Sarai @ 2019-09-25 23:07 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner
Cc: Kees Cook, Rasmus Villemoes, Al Viro, Linus Torvalds, libc-alpha,
linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-2-cyphar@cyphar.com>
[-- Attachment #1: Type: text/plain, Size: 10795 bytes --]
(Damn, I forgot to add Kees to Cc.)
On 2019-09-26, Aleksa Sarai <cyphar@cyphar.com> wrote:
> A common pattern for syscall extensions is increasing the size of a
> struct passed from userspace, such that the zero-value of the new fields
> result in the old kernel behaviour (allowing for a mix of userspace and
> kernel vintages to operate on one another in most cases).
>
> While this interface exists for communication in both directions, only
> one interface is straightforward to have reasonable semantics for
> (userspace passing a struct to the kernel). For kernel returns to
> userspace, what the correct semantics are (whether there should be an
> error if userspace is unaware of a new extension) is very
> syscall-dependent and thus probably cannot be unified between syscalls
> (a good example of this problem is [1]).
>
> Previously there was no common lib/ function that implemented
> the necessary extension-checking semantics (and different syscalls
> implemented them slightly differently or incompletely[2]). Future
> patches replace common uses of this pattern to make use of
> copy_struct_from_user().
>
> Some in-kernel selftests that insure that the handling of alignment and
> various byte patterns are all handled identically to memchr_inv() usage.
>
> [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
> robustify sched_read_attr() ABI logic and code")
>
> [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
> similar checks to copy_struct_from_user() while rt_sigprocmask(2)
> always rejects differently-sized struct arguments.
>
> Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
> include/linux/bitops.h | 7 +++
> include/linux/uaccess.h | 4 ++
> lib/strnlen_user.c | 8 +--
> lib/test_user_copy.c | 59 ++++++++++++++++++---
> lib/usercopy.c | 115 ++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 180 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index cf074bce3eb3..a23f4c054768 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -4,6 +4,13 @@
> #include <asm/types.h>
> #include <linux/bits.h>
>
> +/* Set bits in the first 'n' bytes when loaded from memory */
> +#ifdef __LITTLE_ENDIAN
> +# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> +#else
> +# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> +#endif
> +
> #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
> #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
>
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 34a038563d97..824569e309e4 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -230,6 +230,10 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
>
> #endif /* ARCH_HAS_NOCACHE_UACCESS */
>
> +extern int is_zeroed_user(const void __user *from, size_t count);
> +extern int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize);
> +
> /*
> * probe_kernel_read(): safely attempt to read from a location
> * @dst: pointer to the buffer that shall take the data
> diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
> index 7f2db3fe311f..39d588aaa8cd 100644
> --- a/lib/strnlen_user.c
> +++ b/lib/strnlen_user.c
> @@ -2,16 +2,10 @@
> #include <linux/kernel.h>
> #include <linux/export.h>
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> #include <asm/word-at-a-time.h>
>
> -/* Set bits in the first 'n' bytes when loaded from memory */
> -#ifdef __LITTLE_ENDIAN
> -# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> -#else
> -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> -#endif
> -
> /*
> * Do a strnlen, return length of string *with* final '\0'.
> * 'count' is the user-supplied count, while 'max' is the
> diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
> index 67bcd5dfd847..f7cde3845ccc 100644
> --- a/lib/test_user_copy.c
> +++ b/lib/test_user_copy.c
> @@ -31,14 +31,58 @@
> # define TEST_U64
> #endif
>
> -#define test(condition, msg) \
> -({ \
> - int cond = (condition); \
> - if (cond) \
> - pr_warn("%s\n", msg); \
> - cond; \
> +#define test(condition, msg, ...) \
> +({ \
> + int cond = (condition); \
> + if (cond) \
> + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
> + cond; \
> })
>
> +static int test_is_zeroed_user(char *kmem, char __user *umem, size_t size)
> +{
> + int ret = 0;
> + size_t start, end, i;
> + size_t zero_start = size / 4;
> + size_t zero_end = size - zero_start;
> +
> + /*
> + * We conduct a series of is_zeroed_user() tests on a block of memory
> + * with the following byte-pattern (trying every possible [start,end]
> + * pair):
> + *
> + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ]
> + *
> + * And we verify that is_zeroed_user() acts identically to memchr_inv().
> + */
> +
> + for (i = 0; i < zero_start; i += 2)
> + kmem[i] = 0x00;
> + for (i = 1; i < zero_start; i += 2)
> + kmem[i] = 0xff;
> +
> + for (i = zero_end; i < size; i += 2)
> + kmem[i] = 0xff;
> + for (i = zero_end + 1; i < size; i += 2)
> + kmem[i] = 0x00;
> +
> + ret |= test(copy_to_user(umem, kmem, size),
> + "legitimate copy_to_user failed");
> +
> + for (start = 0; start <= size; start++) {
> + for (end = start; end <= size; end++) {
> + int retval = is_zeroed_user(umem + start, end - start);
> + int expected = memchr_inv(kmem + start, 0, end - start) == NULL;
> +
> + ret |= test(retval != expected,
> + "is_zeroed_user(=%d) != memchr_inv(=%d) mismatch (start=%lu, end=%lu)",
> + retval, expected, start, end);
> + }
> + }
> +
> + return ret;
> +}
> +
> static int __init test_user_copy_init(void)
> {
> int ret = 0;
> @@ -106,6 +150,9 @@ static int __init test_user_copy_init(void)
> #endif
> #undef test_legit
>
> + /* Test usage of is_zeroed_user(). */
> + ret |= test_is_zeroed_user(kmem, usermem, PAGE_SIZE);
> +
> /*
> * Invalid usage: none of these copies should succeed.
> */
> diff --git a/lib/usercopy.c b/lib/usercopy.c
> index c2bfbcaeb3dc..f795cf0946ad 100644
> --- a/lib/usercopy.c
> +++ b/lib/usercopy.c
> @@ -1,5 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> /* out-of-line parts */
>
> @@ -31,3 +32,117 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
> }
> EXPORT_SYMBOL(_copy_to_user);
> #endif
> +
> +/**
> + * is_zeroed_user: check if a userspace buffer is full of zeros
> + * @from: Source address, in userspace.
> + * @size: Size of buffer.
> + *
> + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
> + * userspace addresses. If there are non-zero bytes present then false is
> + * returned, otherwise true is returned.
> + *
> + * Returns:
> + * * -EFAULT: access to userspace failed.
> + */
> +int is_zeroed_user(const void __user *from, size_t size)
> +{
> + unsigned long val;
> + uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
> +
> + if (unlikely(!size))
> + return true;
> +
> + from -= align;
> + size += align;
> +
> + if (!user_access_begin(from, size))
> + return -EFAULT;
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + if (align)
> + val &= ~aligned_byte_mask(align);
> +
> + while (size > sizeof(unsigned long)) {
> + if (unlikely(val))
> + goto done;
> +
> + from += sizeof(unsigned long);
> + size -= sizeof(unsigned long);
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + }
> +
> + if (size < sizeof(unsigned long))
> + val &= aligned_byte_mask(size);
> +
> +done:
> + user_access_end();
> + return (val == 0);
> +err_fault:
> + user_access_end();
> + return -EFAULT;
> +}
> +EXPORT_SYMBOL(is_zeroed_user);
> +
> +/**
> + * copy_struct_from_user: copy a struct from userspace
> + * @dst: Destination address, in kernel space. This buffer must be @ksize
> + * bytes long.
> + * @ksize: Size of @dst struct.
> + * @src: Source address, in userspace.
> + * @usize: (Alleged) size of @src struct.
> + *
> + * Copies a struct from userspace to kernel space, in a way that guarantees
> + * backwards-compatibility for struct syscall arguments (as long as future
> + * struct extensions are made such that all new fields are *appended* to the
> + * old struct, and zeroed-out new fields have the same meaning as the old
> + * struct).
> + *
> + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
> + * The recommended usage is something like the following:
> + *
> + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
> + * {
> + * int err;
> + * struct foo karg = {};
> + *
> + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, size);
> + * if (err)
> + * return err;
> + *
> + * // ...
> + * }
> + *
> + * There are three cases to consider:
> + * * If @usize == @ksize, then it's copied verbatim.
> + * * If @usize < @ksize, then the userspace has passed an old struct to a
> + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
> + * are to be zero-filled.
> + * * If @usize > @ksize, then the userspace has passed a new struct to an
> + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
> + * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
> + *
> + * Returns (in all cases, some data may have been copied):
> + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src.
> + * * -EFAULT: access to userspace failed.
> + */
> +int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize)
> +{
> + size_t size = min(ksize, usize);
> + size_t rest = max(ksize, usize) - size;
> +
> + /* Deal with trailing bytes. */
> + if (usize < ksize) {
> + memset(dst + size, 0, rest);
> + } else if (usize > ksize) {
> + int ret = is_zeroed_user(src + size, rest);
> + if (ret <= 0)
> + return ret ?: -E2BIG;
> + }
> + /* Copy the interoperable parts of the struct. */
> + if (copy_from_user(dst, src, size))
> + return -EFAULT;
> + return 0;
> +}
> --
> 2.23.0
>
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: Christian Brauner @ 2019-09-25 23:21 UTC (permalink / raw)
To: Aleksa Sarai
Cc: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner, Rasmus Villemoes, Al Viro,
Linus Torvalds, libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-2-cyphar@cyphar.com>
On Thu, Sep 26, 2019 at 01:03:29AM +0200, Aleksa Sarai wrote:
> A common pattern for syscall extensions is increasing the size of a
> struct passed from userspace, such that the zero-value of the new fields
> result in the old kernel behaviour (allowing for a mix of userspace and
> kernel vintages to operate on one another in most cases).
>
> While this interface exists for communication in both directions, only
> one interface is straightforward to have reasonable semantics for
> (userspace passing a struct to the kernel). For kernel returns to
> userspace, what the correct semantics are (whether there should be an
> error if userspace is unaware of a new extension) is very
> syscall-dependent and thus probably cannot be unified between syscalls
> (a good example of this problem is [1]).
>
> Previously there was no common lib/ function that implemented
> the necessary extension-checking semantics (and different syscalls
> implemented them slightly differently or incompletely[2]). Future
> patches replace common uses of this pattern to make use of
> copy_struct_from_user().
>
> Some in-kernel selftests that insure that the handling of alignment and
> various byte patterns are all handled identically to memchr_inv() usage.
>
> [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
> robustify sched_read_attr() ABI logic and code")
>
> [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
> similar checks to copy_struct_from_user() while rt_sigprocmask(2)
> always rejects differently-sized struct arguments.
>
> Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
> include/linux/bitops.h | 7 +++
> include/linux/uaccess.h | 4 ++
> lib/strnlen_user.c | 8 +--
> lib/test_user_copy.c | 59 ++++++++++++++++++---
> lib/usercopy.c | 115 ++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 180 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index cf074bce3eb3..a23f4c054768 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -4,6 +4,13 @@
> #include <asm/types.h>
> #include <linux/bits.h>
>
> +/* Set bits in the first 'n' bytes when loaded from memory */
> +#ifdef __LITTLE_ENDIAN
> +# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> +#else
> +# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> +#endif
> +
> #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
> #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
>
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 34a038563d97..824569e309e4 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -230,6 +230,10 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
>
> #endif /* ARCH_HAS_NOCACHE_UACCESS */
>
> +extern int is_zeroed_user(const void __user *from, size_t count);
> +extern int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize);
> +
> /*
> * probe_kernel_read(): safely attempt to read from a location
> * @dst: pointer to the buffer that shall take the data
> diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
> index 7f2db3fe311f..39d588aaa8cd 100644
> --- a/lib/strnlen_user.c
> +++ b/lib/strnlen_user.c
> @@ -2,16 +2,10 @@
> #include <linux/kernel.h>
> #include <linux/export.h>
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> #include <asm/word-at-a-time.h>
>
> -/* Set bits in the first 'n' bytes when loaded from memory */
> -#ifdef __LITTLE_ENDIAN
> -# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> -#else
> -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> -#endif
> -
> /*
> * Do a strnlen, return length of string *with* final '\0'.
> * 'count' is the user-supplied count, while 'max' is the
> diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
> index 67bcd5dfd847..f7cde3845ccc 100644
> --- a/lib/test_user_copy.c
> +++ b/lib/test_user_copy.c
> @@ -31,14 +31,58 @@
> # define TEST_U64
> #endif
>
> -#define test(condition, msg) \
> -({ \
> - int cond = (condition); \
> - if (cond) \
> - pr_warn("%s\n", msg); \
> - cond; \
> +#define test(condition, msg, ...) \
> +({ \
> + int cond = (condition); \
> + if (cond) \
> + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
> + cond; \
> })
>
> +static int test_is_zeroed_user(char *kmem, char __user *umem, size_t size)
> +{
> + int ret = 0;
> + size_t start, end, i;
> + size_t zero_start = size / 4;
> + size_t zero_end = size - zero_start;
> +
> + /*
> + * We conduct a series of is_zeroed_user() tests on a block of memory
> + * with the following byte-pattern (trying every possible [start,end]
> + * pair):
> + *
> + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ]
> + *
> + * And we verify that is_zeroed_user() acts identically to memchr_inv().
> + */
> +
> + for (i = 0; i < zero_start; i += 2)
> + kmem[i] = 0x00;
> + for (i = 1; i < zero_start; i += 2)
> + kmem[i] = 0xff;
> +
> + for (i = zero_end; i < size; i += 2)
> + kmem[i] = 0xff;
> + for (i = zero_end + 1; i < size; i += 2)
> + kmem[i] = 0x00;
> +
> + ret |= test(copy_to_user(umem, kmem, size),
> + "legitimate copy_to_user failed");
> +
> + for (start = 0; start <= size; start++) {
> + for (end = start; end <= size; end++) {
> + int retval = is_zeroed_user(umem + start, end - start);
> + int expected = memchr_inv(kmem + start, 0, end - start) == NULL;
> +
> + ret |= test(retval != expected,
> + "is_zeroed_user(=%d) != memchr_inv(=%d) mismatch (start=%lu, end=%lu)",
> + retval, expected, start, end);
> + }
> + }
> +
> + return ret;
> +}
> +
> static int __init test_user_copy_init(void)
> {
> int ret = 0;
> @@ -106,6 +150,9 @@ static int __init test_user_copy_init(void)
> #endif
> #undef test_legit
>
> + /* Test usage of is_zeroed_user(). */
> + ret |= test_is_zeroed_user(kmem, usermem, PAGE_SIZE);
> +
> /*
> * Invalid usage: none of these copies should succeed.
> */
> diff --git a/lib/usercopy.c b/lib/usercopy.c
> index c2bfbcaeb3dc..f795cf0946ad 100644
> --- a/lib/usercopy.c
> +++ b/lib/usercopy.c
> @@ -1,5 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> /* out-of-line parts */
>
> @@ -31,3 +32,117 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
> }
> EXPORT_SYMBOL(_copy_to_user);
> #endif
> +
> +/**
> + * is_zeroed_user: check if a userspace buffer is full of zeros
> + * @from: Source address, in userspace.
> + * @size: Size of buffer.
> + *
> + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
> + * userspace addresses. If there are non-zero bytes present then false is
> + * returned, otherwise true is returned.
> + *
> + * Returns:
> + * * -EFAULT: access to userspace failed.
> + */
> +int is_zeroed_user(const void __user *from, size_t size)
> +{
> + unsigned long val;
> + uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
> +
> + if (unlikely(!size))
> + return true;
You're returning "true" and another implicit boolean with (val == 0)
down below but -EFAULT in other places. But that function is
int is_zeroed_user()
Would probably be good if you either switch to
bool is_zeroed_user()
as the name suggests or rename the function and have it return an int
everywhere.
> +
> + from -= align;
> + size += align;
> +
> + if (!user_access_begin(from, size))
> + return -EFAULT;
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + if (align)
> + val &= ~aligned_byte_mask(align);
> +
> + while (size > sizeof(unsigned long)) {
> + if (unlikely(val))
> + goto done;
> +
> + from += sizeof(unsigned long);
> + size -= sizeof(unsigned long);
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + }
> +
> + if (size < sizeof(unsigned long))
> + val &= aligned_byte_mask(size);
> +
> +done:
> + user_access_end();
> + return (val == 0);
> +err_fault:
> + user_access_end();
> + return -EFAULT;
> +}
> +EXPORT_SYMBOL(is_zeroed_user);
> +
> +/**
> + * copy_struct_from_user: copy a struct from userspace
> + * @dst: Destination address, in kernel space. This buffer must be @ksize
> + * bytes long.
> + * @ksize: Size of @dst struct.
> + * @src: Source address, in userspace.
> + * @usize: (Alleged) size of @src struct.
> + *
> + * Copies a struct from userspace to kernel space, in a way that guarantees
> + * backwards-compatibility for struct syscall arguments (as long as future
> + * struct extensions are made such that all new fields are *appended* to the
> + * old struct, and zeroed-out new fields have the same meaning as the old
> + * struct).
> + *
> + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
> + * The recommended usage is something like the following:
> + *
> + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
> + * {
> + * int err;
> + * struct foo karg = {};
> + *
> + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, size);
> + * if (err)
> + * return err;
> + *
> + * // ...
> + * }
> + *
> + * There are three cases to consider:
> + * * If @usize == @ksize, then it's copied verbatim.
> + * * If @usize < @ksize, then the userspace has passed an old struct to a
> + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
> + * are to be zero-filled.
> + * * If @usize > @ksize, then the userspace has passed a new struct to an
> + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
> + * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
> + *
> + * Returns (in all cases, some data may have been copied):
> + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src.
> + * * -EFAULT: access to userspace failed.
> + */
> +int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize)
> +{
> + size_t size = min(ksize, usize);
> + size_t rest = max(ksize, usize) - size;
> +
> + /* Deal with trailing bytes. */
> + if (usize < ksize) {
> + memset(dst + size, 0, rest);
> + } else if (usize > ksize) {
> + int ret = is_zeroed_user(src + size, rest);
> + if (ret <= 0)
> + return ret ?: -E2BIG;
> + }
> + /* Copy the interoperable parts of the struct. */
> + if (copy_from_user(dst, src, size))
> + return -EFAULT;
> + return 0;
> +}
> --
> 2.23.0
>
^ permalink raw reply
* Re: [RFC PATCH 2/3] fs: add RWF_ENCODED for writing compressed data
From: Omar Sandoval @ 2019-09-26 0:36 UTC (permalink / raw)
To: Dave Chinner
Cc: Jann Horn, Aleksa Sarai, Jens Axboe, linux-fsdevel, linux-btrfs,
Linux API, Kernel Team, Andy Lutomirski
In-Reply-To: <20190925071129.GB804@dread.disaster.area>
On Wed, Sep 25, 2019 at 05:11:29PM +1000, Dave Chinner wrote:
> On Tue, Sep 24, 2019 at 10:01:41PM +0200, Jann Horn wrote:
> > On Tue, Sep 24, 2019 at 9:35 PM Omar Sandoval <osandov@osandov.com> wrote:
> > > On Tue, Sep 24, 2019 at 10:15:13AM -0700, Omar Sandoval wrote:
> > > > On Thu, Sep 19, 2019 at 05:44:12PM +0200, Jann Horn wrote:
> > > > > On Thu, Sep 19, 2019 at 8:54 AM Omar Sandoval <osandov@osandov.com> wrote:
> > > > > > Btrfs can transparently compress data written by the user. However, we'd
> > > > > > like to add an interface to write pre-compressed data directly to the
> > > > > > filesystem. This adds support for so-called "encoded writes" via
> > > > > > pwritev2().
> > > > > >
> > > > > > A new RWF_ENCODED flags indicates that a write is "encoded". If this
> > > > > > flag is set, iov[0].iov_base points to a struct encoded_iov which
> > > > > > contains metadata about the write: namely, the compression algorithm and
> > > > > > the unencoded (i.e., decompressed) length of the extent. iov[0].iov_len
> > > > > > must be set to sizeof(struct encoded_iov), which can be used to extend
> > > > > > the interface in the future. The remaining iovecs contain the encoded
> > > > > > extent.
> > > > > >
> > > > > > A similar interface for reading encoded data can be added to preadv2()
> > > > > > in the future.
> > > > > >
> > > > > > Filesystems must indicate that they support encoded writes by setting
> > > > > > FMODE_ENCODED_IO in ->file_open().
> > > > > [...]
> > > > > > +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,
> > > > > > + struct iov_iter *from)
> > > > > > +{
> > > > > > + if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> > > > > > + return -EINVAL;
> > > > > > + if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> > > > > > + return -EFAULT;
> > > > > > + if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> > > > > > + encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE) {
> > > > > > + iocb->ki_flags &= ~IOCB_ENCODED;
> > > > > > + return 0;
> > > > > > + }
> > > > > > + if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> > > > > > + encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> > > > > > + return -EINVAL;
> > > > > > + if (!capable(CAP_SYS_ADMIN))
> > > > > > + return -EPERM;
> > > > >
> > > > > How does this capable() check interact with io_uring? Without having
> > > > > looked at this in detail, I suspect that when an encoded write is
> > > > > requested through io_uring, the capable() check might be executed on
> > > > > something like a workqueue worker thread, which is probably running
> > > > > with a full capability set.
> > > >
> > > > I discussed this more with Jens. You're right, per-IO permission checks
> > > > aren't going to work. In fully-polled mode, we never get an opportunity
> > > > to check capabilities in right context. So, this will probably require a
> > > > new open flag.
> > >
> > > Actually, file_ns_capable() accomplishes the same thing without a new
> > > open flag. Changing the capable() check to file_ns_capable() in
> > > init_user_ns should be enough.
> >
> > +Aleksa for openat2() and open() space
> >
> > Mmh... but if the file descriptor has been passed through a privilege
> > boundary, it isn't really clear whether the original opener of the
> > file intended for this to be possible. For example, if (as a
> > hypothetical example) the init process opens a service's logfile with
> > root privileges, then passes the file descriptor to that logfile to
> > the service on execve(), that doesn't mean that the service should be
> > able to perform compressed writes into that file, I think.
>
> Where's the privilege boundary that is being crossed?
>
> We're talking about user data read/write access here, not some
> special security capability. Access to the data has already been
> permission checked, so why should the format that the data is
> supplied to the kernel in suddenly require new privilege checks?
>
> i.e. writing encoded data to a file requires exactly the same
> access permissions as writing cleartext data to the file. The only
> extra information here is whether the _filesystem_ supports encoded
> data, and that doesn't change regardless of what the open file gets
> passed to. Hence the capability is either there or it isn't, it
> doesn't transform not matter what privilege boundary the file is
> passed across. Similarly, we have permission to access the data
> or we don't through the struct file, it doesn't transform either.
>
> Hence I don't see why CAP_SYS_ADMIN or any special permissions are
> needed for an application with access permissions to file data to
> use these RWF_ENCODED IO interfaces. I am inclined to think the
> permission check here is wrong and should be dropped, and then all
> these issues go away.
>
> Yes, the app that is going to use this needs root perms because it
> accesses all data in the fs (it's a backup app!), but that doesn't
> mean you can only use RWF_ENCODED if you have root perms.
For RWF_ENCODED writes, the risk here is that we'd be adding a way for
an unprivileged process to feed arbitrary data to zlib/lzo/zstd in the
kernel. From what I could find, this is a new attack surface for
unprivileged processes, and based on the search results for
"$compression_algorithm CVE", there are real bugs here.
For RWF_ENCODED reads, there's another potential issue that occurred to
me. There are a few operations for which we may need to chop up a
compressed extent: hole punch, truncate, reflink, and dedupe. Rather
than recompressing the data, Btrfs keeps the whole extent on disk and
updates the file metadata to refer to a piece of the extent. If we want
to support RWF_ENCODED reads for such extents (and I think we do), then
we need to return the entire original extent along with that metadata.
For an unprivileged reader, there's a security issue that we may be
returning data that the reader wasn't supposed to see. (A privileged
reader can go and read the block device anyways.)
So, in my opinion, both reads and writes should require privilege just
to be on the safe side.
^ permalink raw reply
* Re: [PATCH v2 7/7] random: Remove kernel.random.read_wakeup_threshold
From: Eric W. Biederman @ 2019-09-26 1:09 UTC (permalink / raw)
To: Jann Horn
Cc: Andy Lutomirski, Theodore Tso, LKML, Linux API, Kees Cook,
Jason A. Donenfeld, Ahmed S. Darwish, Lennart Poettering,
Alexander E. Patrakov, Michael Kerrisk, Willy Tarreau,
Matthew Garrett, Ext4 Developers List, linux-man
In-Reply-To: <CAG48ez2tnJzLNCgAqCC+AOKuLGBSvBRi2_HZ97bEJ0zP1kWLHg@mail.gmail.com>
Jann Horn <jannh@google.com> writes:
> On Fri, Sep 20, 2019 at 4:37 PM Andy Lutomirski <luto@kernel.org> wrote:
>> It has no effect any more, so remove it. We can revert this if
>> there is some user code that expects to be able to set this sysctl.
>>
>> Signed-off-by: Andy Lutomirski <luto@kernel.org>
>> ---
>> drivers/char/random.c | 18 +-----------------
>> 1 file changed, 1 insertion(+), 17 deletions(-)
>>
>> diff --git a/drivers/char/random.c b/drivers/char/random.c
> [...]
>> - {
>> - .procname = "read_wakeup_threshold",
>
> There's a line in bin_random_table in kernel/sysctl_binary.c that
> refers to this sysctl, that should probably also be deleted?
I think it should be safe to leave in kernel/sysctl_binary.c
This reminds me. I think we may finally be at a point where we can
remove practically all of kernel/sysctl_binary.c
I need to double check but last I looked no distro enables
COINFIG_SYSCTL_SYSCALL anymore. Ubunutu was the last distro I know of
that enabled it, and I think it has been a year or more since Ubuntu
disabled CONFIG_SYSCTL_SYSCALL.
Eric
^ permalink raw reply
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: kbuild test robot @ 2019-09-26 5:49 UTC (permalink / raw)
Cc: kbuild-all, Ingo Molnar, Peter Zijlstra, Alexander Shishkin,
Jiri Olsa, Namhyung Kim, Christian Brauner, Aleksa Sarai,
Rasmus Villemoes, Al Viro, Linus Torvalds, libc-alpha, linux-api,
linux-kernel
In-Reply-To: <20190925230332.18690-2-cyphar@cyphar.com>
[-- Attachment #1: Type: text/plain, Size: 5404 bytes --]
Hi Aleksa,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on linus/master]
[cannot apply to v5.3 next-20190924]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]
url: https://github.com/0day-ci/linux/commits/Aleksa-Sarai/lib-introduce-copy_struct_from_user-helper/20190926-071752
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 7.4.0
reproduce:
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
In file included from include/linux/printk.h:7:0,
from include/linux/kernel.h:15,
from include/asm-generic/bug.h:18,
from arch/sh/include/asm/bug.h:112,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/mm.h:9,
from include/linux/mman.h:5,
from lib/test_user_copy.c:13:
lib/test_user_copy.c: In function 'test_is_zeroed_user':
include/linux/kern_levels.h:5:18: warning: format '%lu' expects argument of type 'long unsigned int', but argument 5 has type 'size_t {aka unsigned int}' [-Wformat=]
#define KERN_SOH "\001" /* ASCII Start Of Header */
^
include/linux/kern_levels.h:12:22: note: in expansion of macro 'KERN_SOH'
#define KERN_WARNING KERN_SOH "4" /* warning conditions */
^~~~~~~~
include/linux/printk.h:306:9: note: in expansion of macro 'KERN_WARNING'
printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
^~~~~~~~~~~~
include/linux/printk.h:307:17: note: in expansion of macro 'pr_warning'
#define pr_warn pr_warning
^~~~~~~~~~
>> lib/test_user_copy.c:38:3: note: in expansion of macro 'pr_warn'
pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
^~~~~~~
>> lib/test_user_copy.c:77:11: note: in expansion of macro 'test'
ret |= test(retval != expected,
^~~~
include/linux/kern_levels.h:5:18: warning: format '%lu' expects argument of type 'long unsigned int', but argument 6 has type 'size_t {aka unsigned int}' [-Wformat=]
#define KERN_SOH "\001" /* ASCII Start Of Header */
^
include/linux/kern_levels.h:12:22: note: in expansion of macro 'KERN_SOH'
#define KERN_WARNING KERN_SOH "4" /* warning conditions */
^~~~~~~~
include/linux/printk.h:306:9: note: in expansion of macro 'KERN_WARNING'
printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
^~~~~~~~~~~~
include/linux/printk.h:307:17: note: in expansion of macro 'pr_warning'
#define pr_warn pr_warning
^~~~~~~~~~
>> lib/test_user_copy.c:38:3: note: in expansion of macro 'pr_warn'
pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
^~~~~~~
>> lib/test_user_copy.c:77:11: note: in expansion of macro 'test'
ret |= test(retval != expected,
^~~~
vim +/pr_warn +38 lib/test_user_copy.c
33
34 #define test(condition, msg, ...) \
35 ({ \
36 int cond = (condition); \
37 if (cond) \
> 38 pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
39 cond; \
40 })
41
42 static int test_is_zeroed_user(char *kmem, char __user *umem, size_t size)
43 {
44 int ret = 0;
45 size_t start, end, i;
46 size_t zero_start = size / 4;
47 size_t zero_end = size - zero_start;
48
49 /*
50 * We conduct a series of is_zeroed_user() tests on a block of memory
51 * with the following byte-pattern (trying every possible [start,end]
52 * pair):
53 *
54 * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ]
55 *
56 * And we verify that is_zeroed_user() acts identically to memchr_inv().
57 */
58
59 for (i = 0; i < zero_start; i += 2)
60 kmem[i] = 0x00;
61 for (i = 1; i < zero_start; i += 2)
62 kmem[i] = 0xff;
63
64 for (i = zero_end; i < size; i += 2)
65 kmem[i] = 0xff;
66 for (i = zero_end + 1; i < size; i += 2)
67 kmem[i] = 0x00;
68
69 ret |= test(copy_to_user(umem, kmem, size),
70 "legitimate copy_to_user failed");
71
72 for (start = 0; start <= size; start++) {
73 for (end = start; end <= size; end++) {
74 int retval = is_zeroed_user(umem + start, end - start);
75 int expected = memchr_inv(kmem + start, 0, end - start) == NULL;
76
> 77 ret |= test(retval != expected,
78 "is_zeroed_user(=%d) != memchr_inv(=%d) mismatch (start=%lu, end=%lu)",
79 retval, expected, start, end);
80 }
81 }
82
83 return ret;
84 }
85
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 52170 bytes --]
^ permalink raw reply
* Re: [RFC PATCH 2/3] add RWF_ENCODED for writing compressed data
From: Colin Walters @ 2019-09-26 12:17 UTC (permalink / raw)
To: Chris Mason
Cc: Dave Chinner, Jann Horn, Omar Sandoval, Aleksa Sarai, Jens Axboe,
linux-fsdevel, linux-btrfs@vger.kernel.org, Linux API,
Kernel Team, Andy Lutomirski
In-Reply-To: <FF3F534F-B40D-4D7D-956B-F1B63C4751CC@fb.com>
On Wed, Sep 25, 2019, at 10:56 AM, Chris Mason wrote:
> The data is verified while being decompressed, but that's a fairly large
> fuzzing surface (all of zstd, zlib, and lzo). A lot of people will
> correctly argue that we already have that fuzzing surface today, but I'd
> rather not make a really easy way to stuff arbitrary bytes through the
> kernel decompression code until all the projects involved sign off.
Right. So maybe have this start of as a BTRFS ioctl and require
privileges? I assume that's sufficient for what Omar wants.
(Are there actually any other popular Linux filesystems that do transparent compression anyways?)
^ permalink raw reply
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: Christian Brauner @ 2019-09-26 12:40 UTC (permalink / raw)
To: Aleksa Sarai
Cc: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925230332.18690-2-cyphar@cyphar.com>
On Thu, Sep 26, 2019 at 01:03:29AM +0200, Aleksa Sarai wrote:
> A common pattern for syscall extensions is increasing the size of a
> struct passed from userspace, such that the zero-value of the new fields
> result in the old kernel behaviour (allowing for a mix of userspace and
> kernel vintages to operate on one another in most cases).
>
> While this interface exists for communication in both directions, only
> one interface is straightforward to have reasonable semantics for
> (userspace passing a struct to the kernel). For kernel returns to
> userspace, what the correct semantics are (whether there should be an
> error if userspace is unaware of a new extension) is very
> syscall-dependent and thus probably cannot be unified between syscalls
> (a good example of this problem is [1]).
>
> Previously there was no common lib/ function that implemented
> the necessary extension-checking semantics (and different syscalls
> implemented them slightly differently or incompletely[2]). Future
> patches replace common uses of this pattern to make use of
> copy_struct_from_user().
>
> Some in-kernel selftests that insure that the handling of alignment and
> various byte patterns are all handled identically to memchr_inv() usage.
>
> [1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
> robustify sched_read_attr() ABI logic and code")
>
> [2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
> similar checks to copy_struct_from_user() while rt_sigprocmask(2)
> always rejects differently-sized struct arguments.
>
> Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
> Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
> ---
> include/linux/bitops.h | 7 +++
> include/linux/uaccess.h | 4 ++
> lib/strnlen_user.c | 8 +--
> lib/test_user_copy.c | 59 ++++++++++++++++++---
> lib/usercopy.c | 115 ++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 180 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bitops.h b/include/linux/bitops.h
> index cf074bce3eb3..a23f4c054768 100644
> --- a/include/linux/bitops.h
> +++ b/include/linux/bitops.h
> @@ -4,6 +4,13 @@
> #include <asm/types.h>
> #include <linux/bits.h>
>
> +/* Set bits in the first 'n' bytes when loaded from memory */
> +#ifdef __LITTLE_ENDIAN
> +# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> +#else
> +# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> +#endif
Nti: The style in bitops.h suggestes this should be:
+/* Set bits in the first 'n' bytes when loaded from memory */
+#ifdef __LITTLE_ENDIAN
+# define aligned_byte_mask(n) ((1UL << 8*(n))-1)
+#else
+# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
+#endif
Using UL also makes 0xffUL clearer.
> +
> #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
> #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
>
> diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
> index 34a038563d97..824569e309e4 100644
> --- a/include/linux/uaccess.h
> +++ b/include/linux/uaccess.h
> @@ -230,6 +230,10 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
>
> #endif /* ARCH_HAS_NOCACHE_UACCESS */
>
> +extern int is_zeroed_user(const void __user *from, size_t count);
> +extern int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize);
> +
> /*
> * probe_kernel_read(): safely attempt to read from a location
> * @dst: pointer to the buffer that shall take the data
> diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
> index 7f2db3fe311f..39d588aaa8cd 100644
> --- a/lib/strnlen_user.c
> +++ b/lib/strnlen_user.c
> @@ -2,16 +2,10 @@
> #include <linux/kernel.h>
> #include <linux/export.h>
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> #include <asm/word-at-a-time.h>
>
> -/* Set bits in the first 'n' bytes when loaded from memory */
> -#ifdef __LITTLE_ENDIAN
> -# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
> -#else
> -# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
> -#endif
> -
> /*
> * Do a strnlen, return length of string *with* final '\0'.
> * 'count' is the user-supplied count, while 'max' is the
> diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
> index 67bcd5dfd847..f7cde3845ccc 100644
> --- a/lib/test_user_copy.c
> +++ b/lib/test_user_copy.c
> @@ -31,14 +31,58 @@
> # define TEST_U64
> #endif
>
> -#define test(condition, msg) \
> -({ \
> - int cond = (condition); \
> - if (cond) \
> - pr_warn("%s\n", msg); \
> - cond; \
> +#define test(condition, msg, ...) \
> +({ \
> + int cond = (condition); \
> + if (cond) \
> + pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
> + cond; \
> })
>
> +static int test_is_zeroed_user(char *kmem, char __user *umem, size_t size)
> +{
> + int ret = 0;
> + size_t start, end, i;
> + size_t zero_start = size / 4;
> + size_t zero_end = size - zero_start;
> +
> + /*
> + * We conduct a series of is_zeroed_user() tests on a block of memory
> + * with the following byte-pattern (trying every possible [start,end]
> + * pair):
> + *
> + * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ]
> + *
> + * And we verify that is_zeroed_user() acts identically to memchr_inv().
> + */
> +
> + for (i = 0; i < zero_start; i += 2)
> + kmem[i] = 0x00;
> + for (i = 1; i < zero_start; i += 2)
> + kmem[i] = 0xff;
> +
> + for (i = zero_end; i < size; i += 2)
> + kmem[i] = 0xff;
> + for (i = zero_end + 1; i < size; i += 2)
> + kmem[i] = 0x00;
> +
> + ret |= test(copy_to_user(umem, kmem, size),
> + "legitimate copy_to_user failed");
> +
> + for (start = 0; start <= size; start++) {
> + for (end = start; end <= size; end++) {
> + int retval = is_zeroed_user(umem + start, end - start);
> + int expected = memchr_inv(kmem + start, 0, end - start) == NULL;
> +
> + ret |= test(retval != expected,
> + "is_zeroed_user(=%d) != memchr_inv(=%d) mismatch (start=%lu, end=%lu)",
> + retval, expected, start, end);
> + }
> + }
> +
> + return ret;
> +}
> +
> static int __init test_user_copy_init(void)
> {
> int ret = 0;
> @@ -106,6 +150,9 @@ static int __init test_user_copy_init(void)
> #endif
> #undef test_legit
>
> + /* Test usage of is_zeroed_user(). */
> + ret |= test_is_zeroed_user(kmem, usermem, PAGE_SIZE);
> +
> /*
> * Invalid usage: none of these copies should succeed.
> */
> diff --git a/lib/usercopy.c b/lib/usercopy.c
> index c2bfbcaeb3dc..f795cf0946ad 100644
> --- a/lib/usercopy.c
> +++ b/lib/usercopy.c
> @@ -1,5 +1,6 @@
> // SPDX-License-Identifier: GPL-2.0
> #include <linux/uaccess.h>
> +#include <linux/bitops.h>
>
> /* out-of-line parts */
>
> @@ -31,3 +32,117 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
> }
> EXPORT_SYMBOL(_copy_to_user);
> #endif
> +
> +/**
> + * is_zeroed_user: check if a userspace buffer is full of zeros
> + * @from: Source address, in userspace.
> + * @size: Size of buffer.
> + *
> + * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
> + * userspace addresses. If there are non-zero bytes present then false is
> + * returned, otherwise true is returned.
> + *
> + * Returns:
> + * * -EFAULT: access to userspace failed.
> + */
> +int is_zeroed_user(const void __user *from, size_t size)
See my bool vs int comment from yesterday and [1] for a suggestion.
> +{
> + unsigned long val;
> + uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
> +
> + if (unlikely(!size))
> + return true;
> +
> + from -= align;
> + size += align;
> +
> + if (!user_access_begin(from, size))
> + return -EFAULT;
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + if (align)
> + val &= ~aligned_byte_mask(align);
> +
> + while (size > sizeof(unsigned long)) {
> + if (unlikely(val))
> + goto done;
> +
> + from += sizeof(unsigned long);
> + size -= sizeof(unsigned long);
> +
> + unsafe_get_user(val, (unsigned long __user *) from, err_fault);
> + }
> +
> + if (size < sizeof(unsigned long))
> + val &= aligned_byte_mask(size);
> +
> +done:
> + user_access_end();
> + return (val == 0);
> +err_fault:
> + user_access_end();
> + return -EFAULT;
> +}
> +EXPORT_SYMBOL(is_zeroed_user);
> +
> +/**
> + * copy_struct_from_user: copy a struct from userspace
> + * @dst: Destination address, in kernel space. This buffer must be @ksize
> + * bytes long.
> + * @ksize: Size of @dst struct.
> + * @src: Source address, in userspace.
> + * @usize: (Alleged) size of @src struct.
> + *
> + * Copies a struct from userspace to kernel space, in a way that guarantees
> + * backwards-compatibility for struct syscall arguments (as long as future
> + * struct extensions are made such that all new fields are *appended* to the
> + * old struct, and zeroed-out new fields have the same meaning as the old
> + * struct).
> + *
> + * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
> + * The recommended usage is something like the following:
> + *
> + * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
> + * {
> + * int err;
> + * struct foo karg = {};
> + *
> + * err = copy_struct_from_user(&karg, sizeof(karg), uarg, size);
> + * if (err)
> + * return err;
> + *
> + * // ...
> + * }
> + *
> + * There are three cases to consider:
> + * * If @usize == @ksize, then it's copied verbatim.
> + * * If @usize < @ksize, then the userspace has passed an old struct to a
> + * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
> + * are to be zero-filled.
> + * * If @usize > @ksize, then the userspace has passed a new struct to an
> + * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
> + * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
> + *
> + * Returns (in all cases, some data may have been copied):
> + * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src.
> + * * -EFAULT: access to userspace failed.
> + */
> +int copy_struct_from_user(void *dst, size_t ksize,
> + const void __user *src, size_t usize)
> +{
> + size_t size = min(ksize, usize);
> + size_t rest = max(ksize, usize) - size;
> +
> + /* Deal with trailing bytes. */
> + if (usize < ksize) {
> + memset(dst + size, 0, rest);
> + } else if (usize > ksize) {
> + int ret = is_zeroed_user(src + size, rest);
> + if (ret <= 0)
> + return ret ?: -E2BIG;
> + }
> + /* Copy the interoperable parts of the struct. */
> + if (copy_from_user(dst, src, size))
> + return -EFAULT;
> + return 0;
> +}
> --
> 2.23.0
>
[1]: How about:
/**
* <sensible documentation>
*
* Returns 1, if the user buffer is zeroed, 0 if it is not, and a
* negative error code otherwise.
*
*/
int memuser_zero(const void __user *from, size_t size)
{
unsigned long val;
uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
if (unlikely(size == 0))
return 1;
from -= align;
size += align;
if (!user_access_begin(from, size))
return -EFAULT;
unsafe_get_user(val, (unsigned long __user *) from, err_fault);
if (align)
val &= ~aligned_byte_mask(align);
while (size > sizeof(unsigned long)) {
if (unlikely(val))
goto err_fault;
from += sizeof(unsigned long);
size -= sizeof(unsigned long);
unsafe_get_user(val, (unsigned long __user *) from, err_fault);
}
if (size < sizeof(unsigned long))
val &= aligned_byte_mask(size);
done:
user_access_end();
return (val == 0);
err_fault:
user_access_end();
return -EFAULT;
}
int copy_struct_from_user(void *dst, size_t ksize,
const void __user *src, size_t usize)
{
size_t size = min(ksize, usize);
size_t rest = max(ksize, usize) - size;
/* Deal with trailing bytes. */
if (usize < ksize) {
memset(dst + size, 0, rest);
} else if ((usize > ksize) {
int ret = memuser_zero(src + size, rest);
if (ret < 0) /* we failed to check the user memory somehow */
return ret;
if (ret == 0) /* some of the memory was non-zero */
return -E2BIG;
}
/* Copy the interoperable parts of the struct. */
if (copy_from_user(dst, src, size))
return -EFAULT;
return 0;
}
^ permalink raw reply
* Re: [PATCH ghak90 V7 04/21] audit: convert to contid list to check for orch/engine ownership
From: Neil Horman @ 2019-09-26 14:46 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
LKML, netdev, netfilter-devel, Paul Moore, sgrubb, omosnace,
dhowells, simo, eparis, serge, ebiederm, dwalsh, mpatel
In-Reply-To: <6fb4e270bfafef3d0477a06b0365fdcc5a5305b5.1568834524.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 09:22:21PM -0400, Richard Guy Briggs wrote:
> Store the audit container identifier in a refcounted kernel object that
> is added to the master list of audit container identifiers. This will
> allow multiple container orchestrators/engines to work on the same
> machine without danger of inadvertantly re-using an existing identifier.
> It will also allow an orchestrator to inject a process into an existing
> container by checking if the original container owner is the one
> injecting the task. A hash table list is used to optimize searches.
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> include/linux/audit.h | 26 ++++++++++++++--
> kernel/audit.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++---
> kernel/audit.h | 8 +++++
> 3 files changed, 112 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index f2e3b81f2942..e317807cdd3e 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -95,10 +95,18 @@ struct audit_ntp_data {
> struct audit_ntp_data {};
> #endif
>
> +struct audit_cont {
> + struct list_head list;
> + u64 id;
> + struct task_struct *owner;
> + refcount_t refcount;
> + struct rcu_head rcu;
> +};
> +
> struct audit_task_info {
> kuid_t loginuid;
> unsigned int sessionid;
> - u64 contid;
> + struct audit_cont *cont;
> #ifdef CONFIG_AUDITSYSCALL
> struct audit_context *ctx;
> #endif
> @@ -203,11 +211,15 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
>
> static inline u64 audit_get_contid(struct task_struct *tsk)
> {
> - if (!tsk->audit)
> + if (!tsk->audit || !tsk->audit->cont)
> return AUDIT_CID_UNSET;
> - return tsk->audit->contid;
> + return tsk->audit->cont->id;
> }
>
> +extern struct audit_cont *audit_cont(struct task_struct *tsk);
> +
> +extern void audit_cont_put(struct audit_cont *cont);
> +
I see that you manual increment this refcount at various call sites, why
no corresponding audit_contid_hold function?
Neil
> extern u32 audit_enabled;
>
> extern int audit_signal_info(int sig, struct task_struct *t);
> @@ -277,6 +289,14 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> return AUDIT_CID_UNSET;
> }
>
> +static inline struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> + return NULL;
> +}
> +
> +static inline void audit_cont_put(struct audit_cont *cont)
> +{ }
> +
> #define audit_enabled AUDIT_OFF
>
> static inline int audit_signal_info(int sig, struct task_struct *t)
> diff --git a/kernel/audit.c b/kernel/audit.c
> index a36ea57cbb61..ea0899130cc1 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -137,6 +137,8 @@ struct audit_net {
>
> /* Hash for inode-based rules */
> struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> +/* Hash for contid-based rules */
> +struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
>
> static struct kmem_cache *audit_buffer_cache;
>
> @@ -204,6 +206,8 @@ struct audit_reply {
>
> static struct kmem_cache *audit_task_cache;
>
> +static DEFINE_SPINLOCK(audit_contid_list_lock);
> +
> void __init audit_task_init(void)
> {
> audit_task_cache = kmem_cache_create("audit_task",
> @@ -231,7 +235,9 @@ int audit_alloc(struct task_struct *tsk)
> }
> info->loginuid = audit_get_loginuid(current);
> info->sessionid = audit_get_sessionid(current);
> - info->contid = audit_get_contid(current);
> + info->cont = audit_cont(current);
> + if (info->cont)
> + refcount_inc(&info->cont->refcount);
> tsk->audit = info;
>
> ret = audit_alloc_syscall(tsk);
> @@ -246,7 +252,7 @@ int audit_alloc(struct task_struct *tsk)
> struct audit_task_info init_struct_audit = {
> .loginuid = INVALID_UID,
> .sessionid = AUDIT_SID_UNSET,
> - .contid = AUDIT_CID_UNSET,
> + .cont = NULL,
> #ifdef CONFIG_AUDITSYSCALL
> .ctx = NULL,
> #endif
> @@ -266,6 +272,9 @@ void audit_free(struct task_struct *tsk)
> /* Freeing the audit_task_info struct must be performed after
> * audit_log_exit() due to need for loginuid and sessionid.
> */
> + spin_lock(&audit_contid_list_lock);
> + audit_cont_put(tsk->audit->cont);
> + spin_unlock(&audit_contid_list_lock);
> info = tsk->audit;
> tsk->audit = NULL;
> kmem_cache_free(audit_task_cache, info);
> @@ -1657,6 +1666,9 @@ static int __init audit_init(void)
> for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
> INIT_LIST_HEAD(&audit_inode_hash[i]);
>
> + for (i = 0; i < AUDIT_CONTID_BUCKETS; i++)
> + INIT_LIST_HEAD(&audit_contid_hash[i]);
> +
> mutex_init(&audit_cmd_mutex.lock);
> audit_cmd_mutex.owner = NULL;
>
> @@ -2356,6 +2368,32 @@ int audit_signal_info(int sig, struct task_struct *t)
> return audit_signal_info_syscall(t);
> }
>
> +struct audit_cont *audit_cont(struct task_struct *tsk)
> +{
> + if (!tsk->audit || !tsk->audit->cont)
> + return NULL;
> + return tsk->audit->cont;
> +}
> +
> +/* audit_contid_list_lock must be held by caller */
> +void audit_cont_put(struct audit_cont *cont)
> +{
> + if (!cont)
> + return;
> + if (refcount_dec_and_test(&cont->refcount)) {
> + put_task_struct(cont->owner);
> + list_del_rcu(&cont->list);
> + kfree_rcu(cont, rcu);
> + }
> +}
> +
> +static struct task_struct *audit_cont_owner(struct task_struct *tsk)
> +{
> + if (tsk->audit && tsk->audit->cont)
> + return tsk->audit->cont->owner;
> + return NULL;
> +}
> +
> /*
> * audit_set_contid - set current task's audit contid
> * @task: target task
> @@ -2382,9 +2420,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> }
> oldcontid = audit_get_contid(task);
> read_lock(&tasklist_lock);
> - /* Don't allow the audit containerid to be unset */
> + /* Don't allow the contid to be unset */
> if (!audit_contid_valid(contid))
> rc = -EINVAL;
> + /* Don't allow the contid to be set to the same value again */
> + else if (contid == oldcontid) {
> + rc = -EADDRINUSE;
> /* if we don't have caps, reject */
> else if (!capable(CAP_AUDIT_CONTROL))
> rc = -EPERM;
> @@ -2397,8 +2438,43 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> else if (audit_contid_set(task))
> rc = -ECHILD;
> read_unlock(&tasklist_lock);
> - if (!rc)
> - task->audit->contid = contid;
> + if (!rc) {
> + struct audit_cont *oldcont = audit_cont(task);
> + struct audit_cont *cont = NULL;
> + struct audit_cont *newcont = NULL;
> + int h = audit_hash_contid(contid);
> +
> + spin_lock(&audit_contid_list_lock);
> + list_for_each_entry_rcu(cont, &audit_contid_hash[h], list)
> + if (cont->id == contid) {
> + /* task injection to existing container */
> + if (current == cont->owner) {
> + refcount_inc(&cont->refcount);
> + newcont = cont;
> + } else {
> + rc = -ENOTUNIQ;
> + goto conterror;
> + }
> + }
> + if (!newcont) {
> + newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> + if (newcont) {
> + INIT_LIST_HEAD(&newcont->list);
> + newcont->id = contid;
> + get_task_struct(current);
> + newcont->owner = current;
> + refcount_set(&newcont->refcount, 1);
> + list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> + } else {
> + rc = -ENOMEM;
> + goto conterror;
> + }
> + }
> + task->audit->cont = newcont;
> + audit_cont_put(oldcont);
> +conterror:
> + spin_unlock(&audit_contid_list_lock);
> + }
> task_unlock(task);
>
> if (!audit_enabled)
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 16bd03b88e0d..e4a31aa92dfe 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -211,6 +211,14 @@ static inline int audit_hash_ino(u32 ino)
> return (ino & (AUDIT_INODE_BUCKETS-1));
> }
>
> +#define AUDIT_CONTID_BUCKETS 32
> +extern struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> +
> +static inline int audit_hash_contid(u64 contid)
> +{
> + return (contid & (AUDIT_CONTID_BUCKETS-1));
> +}
> +
> /* Indicates that audit should log the full pathname. */
> #define AUDIT_NAME_FULL -1
>
> --
> 1.8.3.1
>
>
^ permalink raw reply
* Re: [RFC PATCH 2/3] add RWF_ENCODED for writing compressed data
From: Omar Sandoval @ 2019-09-26 17:46 UTC (permalink / raw)
To: Colin Walters
Cc: Chris Mason, Dave Chinner, Jann Horn, Aleksa Sarai, Jens Axboe,
linux-fsdevel, linux-btrfs@vger.kernel.org, Linux API,
Kernel Team, Andy Lutomirski
In-Reply-To: <4e6e03c1-b2f4-4841-99af-cbb75f33c14d@www.fastmail.com>
On Thu, Sep 26, 2019 at 08:17:12AM -0400, Colin Walters wrote:
>
>
> On Wed, Sep 25, 2019, at 10:56 AM, Chris Mason wrote:
>
> > The data is verified while being decompressed, but that's a fairly large
> > fuzzing surface (all of zstd, zlib, and lzo). A lot of people will
> > correctly argue that we already have that fuzzing surface today, but I'd
> > rather not make a really easy way to stuff arbitrary bytes through the
> > kernel decompression code until all the projects involved sign off.
>
> Right. So maybe have this start of as a BTRFS ioctl and require
> privileges? I assume that's sufficient for what Omar wants.
That was the first version of this series, but Dave requested that I
make it generic [1].
> (Are there actually any other popular Linux filesystems that do transparent compression anyways?)
A scan over the kernel tree shows that a few other filesystems do
compression:
- jffs2
- pstore (if you can call that a filesystem)
- ubifs
- cramfs (read-only)
- erofs (read-only)
- squashfs (read-only)
None of the "mainstream" general-purpose filesystems have support, but
that was also the case for reflink/dedupe before XFS added support.
1: https://lore.kernel.org/linux-fsdevel/20190905021012.GL7777@dread.disaster.area/
^ permalink raw reply
* [PATCH v5 0/1] random: getrandom(2): warn on large CRNG waits, introduce new flags
From: Ahmed S. Darwish @ 2019-09-26 20:42 UTC (permalink / raw)
To: Linus Torvalds, Theodore Y. Ts'o
Cc: Florian Weimer, Willy Tarreau, Matthew Garrett, Andy Lutomirski,
Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
Michael Kerrisk, lkml, linux-ext4, linux-api, linux-man
In-Reply-To: <CAHk-=wiCqDiU7SE3FLn2W26MS_voUAuqj5XFa1V_tiGTrrW-zQ@mail.gmail.com>
Summary / Changelog-v5:
- Add the new flags GRND_INSECURE and GRND_SECURE_UNBOUNDED_INITIAL_WAIT
to getrandom(2), instead of introducing a new getrandom2(2) system
call, which nobody liked.
- Fix a bug discovered through testing where "int ret =
wait_event_interruptible_timeout(waitq, true, MAX_SCHEDULE_TIMEOUT)"
returns failure (-1) due to implicit LONG_MAX => int truncation
- WARN if a process is stuck on getrandom(,,flags=0) for more than 30
seconds ... defconfig and bootparam configurable
- Add documentation for "random.getrandom_wait_threshold" kernel param
- Extra comments @ include/uapi/linux/random.h and random.c::getrandom.
Explicit recommendations to *exclusively* use the new flags.
- GRND_INSECURE never issue any warning, even if CRNG is not inited.
Similarly for GRND_SECURE_UNBOUNDED_INITIAL_WAIT, no matter how
big the unbounded wait is.
In a reply to the V4 patch, Linus posted a related patch [*] with the
following additions:
- Drop the original random.c behavior of having each /dev/urandom
"CRNG not inited" warning also _reset_ the crng_init_cnt entropy.
This is not included in this patch, as IMHO this can be done as a
separate patch on top.
- Limit GRND_RANDOM max count/buflen to 32MB instead of 2GB. This
is very sane obviously, and can be done in a separate patch on
top.
This V5 patch just tries to be as conservative as possible.
- GRND_WAIT_ENTROPY and GRND_EXCPLICIT: AFAIK these were primarily
added so that getrandom(,,flags=0) can be changed to return
weaker non-blocking crypto from non-inited CRG in a possible
future.
I hope we don't have to resort to that extreme measure.. Hopefully
the WARN() on this patch will be enough in nudging distributions to
enable more hwrng sources (RDRAND, etc.) .. and also for the
user-space developres badly pointed at (hi GDM and Qt) to fix their
code.
[*] https://lkml.kernel.org/r/CAHk-=wiCqDiU7SE3FLn2W26MS_voUAuqj5XFa1V_tiGTrrW-zQ@mail.gmail.com
Ahmed S. Darwish (1):
random: getrandom(2): warn on large CRNG waits, introduce new flags
.../admin-guide/kernel-parameters.txt | 7 ++
drivers/char/Kconfig | 60 ++++++++++-
drivers/char/random.c | 102 +++++++++++++++---
include/uapi/linux/random.h | 27 ++++-
4 files changed, 177 insertions(+), 19 deletions(-)
--
2.23.0
^ permalink raw reply
* [PATCH v5 1/1] random: getrandom(2): warn on large CRNG waits, introduce new flags
From: Ahmed S. Darwish @ 2019-09-26 20:44 UTC (permalink / raw)
To: Linus Torvalds, Theodore Y. Ts'o
Cc: Florian Weimer, Willy Tarreau, Matthew Garrett, Andy Lutomirski,
Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
Michael Kerrisk, lkml, linux-ext4, linux-api, linux-man
In-Reply-To: <20190926204217.GA1366@pc>
Since Linux v3.17, getrandom(2) has been created as a new and more
secure interface for pseudorandom data requests. It attempted to
solve three problems, as compared to /dev/urandom:
1. the need to access filesystem paths, which can fail, e.g. under a
chroot
2. the need to open a file descriptor, which can fail under file
descriptor exhaustion attacks
3. the possibility of getting not-so-random data from /dev/urandom,
due to an incompletely initialized kernel entropy pool
To solve the third point, getrandom(2) was made to block until a
proper amount of entropy has been accumulated to initialize the CRNG
ChaCha20 cipher. This made the system call have no guaranteed
upper-bound for its initial waiting time.
Thus when it was introduced at c6e9d6f38894 ("random: introduce
getrandom(2) system call"), it came with a clear warning: "Any
userspace program which uses this new functionality must take care to
assure that if it is used during the boot process, that it will not
cause the init scripts or other portions of the system startup to hang
indefinitely."
Unfortunately, due to multiple factors, including not having this
warning written in a scary-enough language in the manpages, and due to
glibc since v2.25 implementing a BSD-like getentropy(3) in terms of
getrandom(2), modern user-space is calling getrandom(2) in the boot
path everywhere (e.g. Qt, GDM, etc.)
Embedded Linux systems were first hit by this, and reports of embedded
systems "getting stuck at boot" began to be common. Over time, the
issue began to even creep into consumer-level x86 laptops: mainstream
distributions, like Debian Buster, began to recommend installing
haveged as a duct-tape workaround... just to let the system boot.
Moreover, filesystem optimizations in EXT4 and XFS, e.g. b03755ad6f33
("ext4: make __ext4_get_inode_loc plug"), which merged directory
lookup code inode table IO, and very fast systemd boots, further
exaggerated the problem by limiting interrupt-based entropy sources.
This led to large delays until the kernel's cryptographic random
number generator (CRNG) got initialized.
On a Thinkpad E480 x86 laptop and an ArchLinux user-space, the ext4
commit earlier mentioned reliably blocked the system on GDM boot.
Mitigate the problem, as a first step, in two ways:
1. Issue a big WARN_ON when any process gets stuck on getrandom(2)
for more than CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC seconds.
2. Introduce new getrandom(2) flags, with clear semantics that can
hopefully guide user-space in doing the right thing.
Set CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC to a heuristic 30-second
default value. System integrators and distribution builders are deeply
encouraged not to increase it much: during system boot, you either
have entropy, or you don't. And if you didn't have entropy, it will
stay like this forever, because if you had, you wouldn't have blocked
in the first place. It's an atomic "either/or" situation, with no
middle ground. Please think twice.
For the new getrandom(2) flags, be much more explicit. As Linus
mentioned several times in the bug report thread, Linux should've
never provided /dev/random and the getrandom(GRND_RANDOM) APIs. These
interfaces are broken by design due to their almost-permanent
blockage, leading to the current misuse of /dev/urandom and
getrandom(flags=0) calls. Thus introduce the flags:
1. GRND_INSECURE
2. GRND_SECURE_UNBOUNDED_INITIAL_WAIT
where both extract randomness _exclusively_ from the urandom source.
Due to the explicit semantics of these new flags, GRND_INSECURE will
never issue a kernel warning message even if the CRNG is not yet
inited. Similarly, GRND_SECURE_UNBOUNDED_INITIAL_WAIT will never
cause any any kernel WARN, no matter how large the unbounded wait is.
Rreported-by: Ahmed S. Darwish <darwish.07@gmail.com>
Link: https://lkml.kernel.org/r/20190910042107.GA1517@darwi-home-pc
Link: https://lkml.kernel.org/r/20190912034421.GA2085@darwi-home-pc
Link: https://lkml.kernel.org/r/20190914222432.GC19710@mit.edu
Link: https://lkml.kernel.org/r/20180514003034.GI14763@thunk.org
Link: https://lkml.kernel.org/r/CAHk-=wjyH910+JRBdZf_Y9G54c1M=LBF8NKXB6vJcm9XjLnRfg@mail.gmail.com
Link: https://lkml.kernel.org/r/20190917052438.GA26923@1wt.eu
Link: https://lkml.kernel.org/r/20190917160844.GC31567@gardel-login
Link: https://lkml.kernel.org/r/CAHk-=wjABG3+daJFr4w3a+OWuraVcZpi=SMUg=pnZ+7+O0E2FA@mail.gmail.com
Link: https://lkml.kernel.org/r/CAHk-=wjQeiYu8Q_wcMgM-nAcW7KsBfG1+90DaTD5WF2cCeGCgA@mail.gmail.com
Link: https://factorable.net ("Widespread Weak Keys in Network Devices")
Link: https://man.openbsd.org/man4/random.4
Signed-off-by: Ahmed S. Darwish <darwish.07@gmail.com>
---
.../admin-guide/kernel-parameters.txt | 7 ++
drivers/char/Kconfig | 60 ++++++++++-
drivers/char/random.c | 102 +++++++++++++++---
include/uapi/linux/random.h | 27 ++++-
4 files changed, 177 insertions(+), 19 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6ef205fd7c97..d82eafc6a62a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3728,6 +3728,13 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.
+ random.getrandom_wait_threshold=
+ Maximum amount, in seconds, for a process to block
+ in a getrandom(,,flags=0) systemcall without a loud
+ warning in the kernel logs. Default is controlled by
+ CONFIG_RANDOM_GETRANDOM_WAIT_THRESHOLD_SEC. Check
+ the config option help text for more information.
+
ras=option[,option,...] [KNL] RAS-specific options
cec_disable [X86]
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index df0fc997dc3e..adc9bc63d27c 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -535,8 +535,6 @@ config ADI
and SSM (Silicon Secured Memory). Intended consumers of this
driver include crash and makedumpfile.
-endmenu
-
config RANDOM_TRUST_CPU
bool "Trust the CPU manufacturer to initialize Linux's CRNG"
depends on X86 || S390 || PPC
@@ -559,4 +557,60 @@ config RANDOM_TRUST_BOOTLOADER
device randomness. Say Y here to assume the entropy provided by the
booloader is trustworthy so it will be added to the kernel's entropy
pool. Otherwise, say N here so it will be regarded as device input that
- only mixes the entropy pool.
\ No newline at end of file
+ only mixes the entropy pool.
+
+config RANDOM_GETRANDOM_WAIT_THRESHOLD_SEC
+ int
+ default 30
+ help
+ The getrandom(2) system call, when asking for entropy from the
+ urandom source, blocks until the kernel's Cryptographic Random
+ Number Generator (CRNG) gets initialized. This configuration
+ option sets the maximum wait time, in seconds, for a process
+ to get blocked on such a system call before the kernel issues
+ a loud warning. Rationale follows:
+
+ When the getrandom(2) system call was created, it came with
+ the clear warning: "Any userspace program which uses this new
+ functionality must take care to assure that if it is used
+ during the boot process, that it will not cause the init
+ scripts or other portions of the system startup to hang
+ indefinitely.
+
+ Unfortunately, due to multiple factors, including not having
+ this warning written in a scary-enough language in the
+ manpages, and due to glibc since v2.25 implementing a BSD-like
+ getentropy(3) in terms of getrandom(2), modern user-space is
+ calling getrandom(2) in the boot path everywhere.
+
+ Embedded Linux systems were first hit by this, and reports of
+ embedded system "getting stuck at boot" began to be
+ common. Over time, the issue began to even creep into consumer
+ level x86 laptops: mainstream distributions, like Debian
+ Buster, began to recommend installing haveged as a workaround,
+ just to let the system boot.
+
+ Filesystem optimizations in EXT4 and XFS exaggerated the
+ problem, due to aggressive batching of IO requests, and thus
+ minimizing sources of entropy at boot. This led to large
+ delays until the kernel's CRNG got initialized.
+
+ System integrators and distribution builders are not
+ encouraged to considerably increase this value: during system
+ boot, you either have entropy, or you don't. And if you didn't
+ have entropy, it will stay like this forever, because if you
+ had, you wouldn't have blocked in the first place. It's an
+ atomic "either/or" situation, with no middle ground. Please
+ think twice.
+
+ Ideally, systems would be configured with hardware random
+ number generators, and/or configured to trust the CPU-provided
+ RNG's (CONFIG_RANDOM_TRUST_CPU) or boot-loader provided ones
+ (CONFIG_RANDOM_TRUST_BOOTLOADER). In addition, userspace
+ should generate cryptographic keys only as late as possible,
+ when they are needed, instead of during early boot. For
+ non-cryptographic use cases, such as dictionary seeds or MIT
+ Magic Cookies, the getrandom2(GRND2_INSECURE) system call,
+ or even random(3), may be more appropriate.
+
+endmenu
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 566922df4b7b..37c00cff1c08 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -322,6 +322,7 @@
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
@@ -854,12 +855,21 @@ static void invalidate_batched_entropy(void);
static void numa_crng_init(void);
static bool trust_cpu __ro_after_init = IS_ENABLED(CONFIG_RANDOM_TRUST_CPU);
+static int getrandom_wait_threshold __ro_after_init =
+ CONFIG_RANDOM_GETRANDOM_WAIT_THRESHOLD_SEC;
+
static int __init parse_trust_cpu(char *arg)
{
return kstrtobool(arg, &trust_cpu);
}
early_param("random.trust_cpu", parse_trust_cpu);
+static int __init parse_getrandom_wait_threshold(char *arg)
+{
+ return kstrtoint(arg, 0, &getrandom_wait_threshold);
+}
+early_param("random.getrandom_wait_threshold", parse_getrandom_wait_threshold);
+
static void crng_initialize(struct crng_state *crng)
{
int i;
@@ -1960,7 +1970,7 @@ random_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
}
static ssize_t
-urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+_urandom_read(char __user *buf, size_t nbytes, bool warn_on_noninited_crng)
{
unsigned long flags;
static int maxwarn = 10;
@@ -1968,7 +1978,7 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
if (!crng_ready() && maxwarn > 0) {
maxwarn--;
- if (__ratelimit(&urandom_warning))
+ if (warn_on_noninited_crng && __ratelimit(&urandom_warning))
printk(KERN_NOTICE "random: %s: uninitialized "
"urandom read (%zd bytes read)\n",
current->comm, nbytes);
@@ -1982,6 +1992,13 @@ urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
return ret;
}
+static ssize_t
+urandom_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+ /* warn on non-inited CRNG */
+ return _urandom_read(buf, nbytes, true);
+}
+
static __poll_t
random_poll(struct file *file, poll_table * wait)
{
@@ -2118,13 +2135,55 @@ const struct file_operations urandom_fops = {
.llseek = noop_llseek,
};
+static int geturandom_wait(char __user *buf, size_t count,
+ bool warn_on_large_wait)
+{
+ long ret, timeout = MAX_SCHEDULE_TIMEOUT;
+
+ if (warn_on_large_wait && (getrandom_wait_threshold > 0))
+ timeout = HZ * getrandom_wait_threshold;
+
+ do {
+ ret = wait_event_interruptible_timeout(crng_init_wait,
+ crng_ready(),
+ timeout);
+ if (ret < 0)
+ return ret;
+
+ if (ret == 0) {
+ WARN(1, "random: %s[%d]: getrandom(%zu bytes) "
+ "is blocked for more than %d seconds. Check "
+ "getrandom_wait(7)\n", current->comm,
+ task_pid_nr(current), count,
+ getrandom_wait_threshold);
+
+ /* warn once per caller */
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ }
+
+ } while (ret == 0);
+
+ return _urandom_read(buf, count, true);
+}
+
SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
unsigned int, flags)
{
- int ret;
+ unsigned int i, invalid_combs[] = {
+ GRND_INSECURE|GRND_SECURE_UNBOUNDED_INITIAL_WAIT,
+ GRND_INSECURE|GRND_RANDOM,
+ };
- if (flags & ~(GRND_NONBLOCK|GRND_RANDOM))
+ if (flags & ~(GRND_NONBLOCK | \
+ GRND_RANDOM | \
+ GRND_INSECURE | \
+ GRND_SECURE_UNBOUNDED_INITIAL_WAIT)) {
return -EINVAL;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(invalid_combs); i++)
+ if ((flags & invalid_combs[i]) == invalid_combs[i])
+ return -EINVAL;
if (count > INT_MAX)
count = INT_MAX;
@@ -2132,14 +2191,33 @@ SYSCALL_DEFINE3(getrandom, char __user *, buf, size_t, count,
if (flags & GRND_RANDOM)
return _random_read(flags & GRND_NONBLOCK, buf, count);
- if (!crng_ready()) {
- if (flags & GRND_NONBLOCK)
+ /*
+ * urandom: explicit request *not* to wait for CRNG init, and
+ * thus no "uninitialized urandom read" warnings.
+ */
+ if (flags & GRND_INSECURE)
+ return _urandom_read(buf, count, false);
+
+ /* urandom: nonblocking access */
+ if ((flags & GRND_NONBLOCK) && !crng_ready())
return -EAGAIN;
- ret = wait_for_random_bytes();
- if (unlikely(ret))
- return ret;
- }
- return urandom_read(NULL, buf, count, NULL);
+
+ /*
+ * urandom: explicit request *to* wait for CRNG init, and thus
+ * no "getrandom is blocked for more than X seconds" warnings
+ * on large waits.
+ */
+ if (flags & GRND_SECURE_UNBOUNDED_INITIAL_WAIT)
+ return geturandom_wait(buf, count, false);
+
+ /*
+ * urandom: *implicit* request to wait for CRNG init (flags=0)
+ *
+ * User-space has been badly abusing this by calling getrandom
+ * with flags=0 in the boot path, and thus blocking system
+ * boots forever in absence of entropy. Warn on large waits.
+ */
+ return geturandom_wait(buf, count, true);
}
/********************************************************************
@@ -2458,4 +2536,4 @@ void add_bootloader_randomness(const void *buf, unsigned int size)
else
add_device_randomness(buf, size);
}
-EXPORT_SYMBOL_GPL(add_bootloader_randomness);
\ No newline at end of file
+EXPORT_SYMBOL_GPL(add_bootloader_randomness);
diff --git a/include/uapi/linux/random.h b/include/uapi/linux/random.h
index 26ee91300e3e..5a3df92270a7 100644
--- a/include/uapi/linux/random.h
+++ b/include/uapi/linux/random.h
@@ -8,6 +8,7 @@
#ifndef _UAPI_LINUX_RANDOM_H
#define _UAPI_LINUX_RANDOM_H
+#include <linux/bits.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/irqnr.h>
@@ -23,7 +24,7 @@
/* Get the contents of the entropy pool. (Superuser only.) */
#define RNDGETPOOL _IOR( 'R', 0x02, int [2] )
-/*
+/*
* Write bytes into the entropy pool and add to the entropy count.
* (Superuser only.)
*/
@@ -47,10 +48,28 @@ struct rand_pool_info {
/*
* Flags for getrandom(2)
*
+ * 0 discouraged - don't use (see below)
* GRND_NONBLOCK Don't block and return EAGAIN instead
- * GRND_RANDOM Use the /dev/random pool instead of /dev/urandom
+ * GRND_RANDOM discouraged - don't use (uses /dev/random pool)
+ * GRND_INSECURE Use urandom pool, never block even if CRNG isn't inited
+ * GRND_SECURE_UNBOUNDED_INITIAL_WAIT
+ * Use urandom pool, block until CRNG is inited
+ *
+ * User-space has been badly abusing getrandom(flags=0) by calling
+ * it in the boot path, and thus blocking system boots forever in
+ * the absence of entropy (a blocked system cannot generate more
+ * entropy, by definition).
+ *
+ * Thus if a process blocks on a getrandom(flags=0), waithing for
+ * more than CONFIG_RANDOM_GETRANDOM_WAIT_THRESHOLD_SEC seconds,
+ * the kernel will issue a loud warning.
+ *
+ * In general, don't use flags=0. Always use either GRND_INSECURE
+ * or GRND_SECURE_UNBOUNDED_INITIAL_WAIT instead.
*/
-#define GRND_NONBLOCK 0x0001
-#define GRND_RANDOM 0x0002
+#define GRND_NONBLOCK BIT(0)
+#define GRND_RANDOM BIT(1)
+#define GRND_INSECURE BIT(2)
+#define GRND_SECURE_UNBOUNDED_INITIAL_WAIT BIT(3)
#endif /* _UAPI_LINUX_RANDOM_H */
--
2.23.0
^ permalink raw reply related
* Re: [PATCH RFC v4 1/1] random: WARN on large getrandom() waits and introduce getrandom2()
From: Ahmed S. Darwish @ 2019-09-26 21:11 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Florian Weimer, Linus Torvalds, Lennart Poettering,
Theodore Y. Ts'o, Eric W. Biederman, Alexander E. Patrakov,
Michael Kerrisk, Willy Tarreau, Matthew Garrett, lkml,
Ext4 Developers List, Linux API, linux-man
In-Reply-To: <CALCETrWM9opVj+BBrHnnTakTLunW_fB9RM+VSNpNSkR9drDjMw@mail.gmail.com>
On Mon, Sep 23, 2019 at 11:33:21AM -0700, Andy Lutomirski wrote:
> On Fri, Sep 20, 2019 at 11:07 PM Florian Weimer <fweimer@redhat.com> wrote:
> >
> > * Linus Torvalds:
> >
> > > Violently agreed. And that's kind of what the GRND_EXPLICIT is really
> > > aiming for.
> > >
> > > However, it's worth noting that nobody should ever use GRND_EXPLICIT
> > > directly. That's just the name for the bit. The actual users would use
> > > GRND_INSECURE or GRND_SECURE.
> >
> > Should we switch glibc's getentropy to GRND_EXPLICIT? Or something
> > else?
> >
> > I don't think we want to print a kernel warning for this function.
> >
>
> Contemplating this question, I think the answer is that we should just
> not introduce GRND_EXPLICIT or anything like it. glibc is going to
> have to do *something*, and getentropy() is unlikely to just go away.
> The explicitly documented semantics are that it blocks if the RNG
> isn't seeded.
>
> Similarly, FreeBSD has getrandom():
>
> https://www.freebsd.org/cgi/man.cgi?query=getrandom&sektion=2&manpath=freebsd-release-ports
>
> and if we make getrandom(..., 0) warn, then we have a situation where
> the *correct* (if regrettable) way to use the function on FreeBSD
> causes a warning on Linux.
>
> Let's just add GRND_INSECURE, make the blocking mode work better, and,
> if we're feeling a bit more adventurous, add GRND_SECURE_BLOCKING as a
> better replacement for 0, ...
This is what's now done in the just-submitted V5, except the "make the
blocking mode work better" part:
https://lkml.kernel.org/r/20190926204217.GA1366@pc
It's a very conservative patch so far IMHO (minus the loud warning).
Thanks,
--
Ahmed Darwish
^ permalink raw reply
* Re: [PATCH v5 1/1] random: getrandom(2): warn on large CRNG waits, introduce new flags
From: Andy Lutomirski @ 2019-09-26 21:39 UTC (permalink / raw)
To: Ahmed S. Darwish, Linus Torvalds, Theodore Y. Ts'o
Cc: Florian Weimer, Willy Tarreau, Matthew Garrett,
Lennart Poettering, Eric W. Biederman, Alexander E. Patrakov,
Michael Kerrisk, lkml, linux-ext4, linux-api, linux-man
In-Reply-To: <20190926204425.GA2198@pc>
On 9/26/19 1:44 PM, Ahmed S. Darwish wrote:
> Since Linux v3.17, getrandom(2) has been created as a new and more
> secure interface for pseudorandom data requests. It attempted to
> solve three problems, as compared to /dev/urandom:
>
> 1. the need to access filesystem paths, which can fail, e.g. under a
> chroot
>
> 2. the need to open a file descriptor, which can fail under file
> descriptor exhaustion attacks
>
> 3. the possibility of getting not-so-random data from /dev/urandom,
> due to an incompletely initialized kernel entropy pool
>
> To solve the third point, getrandom(2) was made to block until a
> proper amount of entropy has been accumulated to initialize the CRNG
> ChaCha20 cipher. This made the system call have no guaranteed
> upper-bound for its initial waiting time.
>
> Thus when it was introduced at c6e9d6f38894 ("random: introduce
> getrandom(2) system call"), it came with a clear warning: "Any
> userspace program which uses this new functionality must take care to
> assure that if it is used during the boot process, that it will not
> cause the init scripts or other portions of the system startup to hang
> indefinitely."
>
> Unfortunately, due to multiple factors, including not having this
> warning written in a scary-enough language in the manpages, and due to
> glibc since v2.25 implementing a BSD-like getentropy(3) in terms of
> getrandom(2), modern user-space is calling getrandom(2) in the boot
> path everywhere (e.g. Qt, GDM, etc.)
>
> Embedded Linux systems were first hit by this, and reports of embedded
> systems "getting stuck at boot" began to be common. Over time, the
> issue began to even creep into consumer-level x86 laptops: mainstream
> distributions, like Debian Buster, began to recommend installing
> haveged as a duct-tape workaround... just to let the system boot.
>
> Moreover, filesystem optimizations in EXT4 and XFS, e.g. b03755ad6f33
> ("ext4: make __ext4_get_inode_loc plug"), which merged directory
> lookup code inode table IO, and very fast systemd boots, further
> exaggerated the problem by limiting interrupt-based entropy sources.
> This led to large delays until the kernel's cryptographic random
> number generator (CRNG) got initialized.
>
> On a Thinkpad E480 x86 laptop and an ArchLinux user-space, the ext4
> commit earlier mentioned reliably blocked the system on GDM boot.
> Mitigate the problem, as a first step, in two ways:
>
> 1. Issue a big WARN_ON when any process gets stuck on getrandom(2)
> for more than CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC seconds.
>
> 2. Introduce new getrandom(2) flags, with clear semantics that can
> hopefully guide user-space in doing the right thing.
>
> Set CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC to a heuristic 30-second
> default value. System integrators and distribution builders are deeply
> encouraged not to increase it much: during system boot, you either
> have entropy, or you don't. And if you didn't have entropy, it will
> stay like this forever, because if you had, you wouldn't have blocked
> in the first place. It's an atomic "either/or" situation, with no
> middle ground. Please think twice.
So what do we expect glibc's getentropy() to do? If it just adds the
new flag to shut up the warning, we haven't really accomplished much.
^ permalink raw reply
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: Aleksa Sarai @ 2019-09-27 1:07 UTC (permalink / raw)
To: Christian Brauner
Cc: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner, Rasmus Villemoes, Al Viro,
Linus Torvalds, libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190925232139.45sbhj34fj7yvxer@wittgenstein>
[-- Attachment #1: Type: text/plain, Size: 1384 bytes --]
On 2019-09-26, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> On Thu, Sep 26, 2019 at 01:03:29AM +0200, Aleksa Sarai wrote:
> > +int is_zeroed_user(const void __user *from, size_t size)
> > +{
> > + unsigned long val;
> > + uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
> > +
> > + if (unlikely(!size))
> > + return true;
>
> You're returning "true" and another implicit boolean with (val == 0)
> down below but -EFAULT in other places. But that function is int
> is_zeroed_user() Would probably be good if you either switch to bool
> is_zeroed_user() as the name suggests or rename the function and have
> it return an int everywhere.
I just checked, and in C11 (and presumably in older specs) it is
guaranteed that "true" and "false" from <stdbool.h> have the values 1
and 0 (respectively) [§7.18]. So this is perfectly well-defined.
Personally, I think it's more readable to have:
if (unlikely(size == 0))
return true;
/* ... */
return (val == 0);
compared to:
if (unlikely(size == 0))
return 1;
/* ... */
return val ? 0 : 1;
But I will change the function name (to check_zeroed_user) to make it
clearer that it isn't returning a boolean and that you need to check for
negative returns.
--
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply
* Re: [PATCH v2 1/4] lib: introduce copy_struct_from_user() helper
From: Christian Brauner @ 2019-09-27 8:20 UTC (permalink / raw)
To: Aleksa Sarai
Cc: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190927010736.gy3vvvkjhwlybosj@yavin.dot.cyphar.com>
On Fri, Sep 27, 2019 at 11:07:36AM +1000, Aleksa Sarai wrote:
> On 2019-09-26, Christian Brauner <christian.brauner@ubuntu.com> wrote:
> > On Thu, Sep 26, 2019 at 01:03:29AM +0200, Aleksa Sarai wrote:
> > > +int is_zeroed_user(const void __user *from, size_t size)
> > > +{
> > > + unsigned long val;
> > > + uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
> > > +
> > > + if (unlikely(!size))
> > > + return true;
> >
> > You're returning "true" and another implicit boolean with (val == 0)
> > down below but -EFAULT in other places. But that function is int
> > is_zeroed_user() Would probably be good if you either switch to bool
> > is_zeroed_user() as the name suggests or rename the function and have
> > it return an int everywhere.
>
> I just checked, and in C11 (and presumably in older specs) it is
> guaranteed that "true" and "false" from <stdbool.h> have the values 1
> and 0 (respectively) [§7.18]. So this is perfectly well-defined.
>
If you declare a function as returning an int, return ints and don't mix
returning ints and "proper" C boolean types. This:
static int foo()
{
if (bla)
return true;
return -1;
}
is just messy.
>
> Personally, I think it's more readable to have:
>
> if (unlikely(size == 0))
> return true;
> /* ... */
> return (val == 0);
>
> compared to:
>
> if (unlikely(size == 0))
> return 1;
> /* ... */
> return val ? 0 : 1;
Just do:
if (unlikely(size == 0))
return 1;
/* ... */
return (val == 0);
You don't need to change the last return.
Also, as I said in a previous mail: Please wait for rc1 (that's just two
days) to be out so you can base your patchset on that as there are
changes in mainline that cause a merge conflict with your changes.
Thanks!
Christian
^ permalink raw reply
* Re: [PATCH ghak90 V7 06/21] audit: contid limit of 32k imposed to avoid DoS
From: Neil Horman @ 2019-09-27 12:51 UTC (permalink / raw)
To: Richard Guy Briggs
Cc: containers, linux-api, Linux-Audit Mailing List, linux-fsdevel,
LKML, netdev, netfilter-devel, Paul Moore, sgrubb, omosnace,
dhowells, simo, eparis, serge, ebiederm, dwalsh, mpatel
In-Reply-To: <230e91cd3e50a3d8015daac135c24c4c58cf0a21.1568834524.git.rgb@redhat.com>
On Wed, Sep 18, 2019 at 09:22:23PM -0400, Richard Guy Briggs wrote:
> Set an arbitrary limit on the number of audit container identifiers to
> limit abuse.
>
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
> kernel/audit.c | 8 ++++++++
> kernel/audit.h | 4 ++++
> 2 files changed, 12 insertions(+)
>
> diff --git a/kernel/audit.c b/kernel/audit.c
> index 53d13d638c63..329916534dd2 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -139,6 +139,7 @@ struct audit_net {
> struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
> /* Hash for contid-based rules */
> struct list_head audit_contid_hash[AUDIT_CONTID_BUCKETS];
> +int audit_contid_count = 0;
>
> static struct kmem_cache *audit_buffer_cache;
>
> @@ -2384,6 +2385,7 @@ void audit_cont_put(struct audit_cont *cont)
> put_task_struct(cont->owner);
> list_del_rcu(&cont->list);
> kfree_rcu(cont, rcu);
> + audit_contid_count--;
> }
> }
>
> @@ -2456,6 +2458,11 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> goto conterror;
> }
> }
> + /* Set max contids */
> + if (audit_contid_count > AUDIT_CONTID_COUNT) {
> + rc = -ENOSPC;
> + goto conterror;
> + }
You should check for audit_contid_count == AUDIT_CONTID_COUNT here, no?
or at least >=, since you increment it below. Otherwise its possible
that you will exceed it by one in the full condition.
> if (!newcont) {
> newcont = kmalloc(sizeof(struct audit_cont), GFP_ATOMIC);
> if (newcont) {
> @@ -2465,6 +2472,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> newcont->owner = current;
> refcount_set(&newcont->refcount, 1);
> list_add_rcu(&newcont->list, &audit_contid_hash[h]);
> + audit_contid_count++;
> } else {
> rc = -ENOMEM;
> goto conterror;
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 162de8366b32..543f1334ba47 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -219,6 +219,10 @@ static inline int audit_hash_contid(u64 contid)
> return (contid & (AUDIT_CONTID_BUCKETS-1));
> }
>
> +extern int audit_contid_count;
> +
> +#define AUDIT_CONTID_COUNT 1 << 16
> +
Just to ask the question, since it wasn't clear in the changelog, what
abuse are you avoiding here? Ostensibly you should be able to create as
many container ids as you have space for, and the simple creation of
container ids doesn't seem like the resource strain I would be concerned
about here, given that an orchestrator can still create as many
containers as the system will otherwise allow, which will consume
significantly more ram/disk/etc.
> /* Indicates that audit should log the full pathname. */
> #define AUDIT_NAME_FULL -1
>
> --
> 1.8.3.1
>
>
^ permalink raw reply
* Re: [PATCH v5 1/1] random: getrandom(2): warn on large CRNG waits, introduce new flags
From: Ahmed S. Darwish @ 2019-09-28 9:30 UTC (permalink / raw)
To: Andy Lutomirski
Cc: Linus Torvalds, Theodore Y. Ts'o, Florian Weimer,
Willy Tarreau, Matthew Garrett, Lennart Poettering,
Eric W. Biederman, Alexander E. Patrakov, Michael Kerrisk, lkml,
linux-ext4, linux-api, linux-man
In-Reply-To: <9a9715dc-e30b-24fb-a754-464449cafb2f@kernel.org>
On Thu, Sep 26, 2019 at 02:39:44PM -0700, Andy Lutomirski wrote:
> On 9/26/19 1:44 PM, Ahmed S. Darwish wrote:
> > Since Linux v3.17, getrandom(2) has been created as a new and more
> > secure interface for pseudorandom data requests. It attempted to
> > solve three problems, as compared to /dev/urandom:
> >
> > 1. the need to access filesystem paths, which can fail, e.g. under a
> > chroot
> >
> > 2. the need to open a file descriptor, which can fail under file
> > descriptor exhaustion attacks
> >
> > 3. the possibility of getting not-so-random data from /dev/urandom,
> > due to an incompletely initialized kernel entropy pool
> >
> > To solve the third point, getrandom(2) was made to block until a
> > proper amount of entropy has been accumulated to initialize the CRNG
> > ChaCha20 cipher. This made the system call have no guaranteed
> > upper-bound for its initial waiting time.
> >
> > Thus when it was introduced at c6e9d6f38894 ("random: introduce
> > getrandom(2) system call"), it came with a clear warning: "Any
> > userspace program which uses this new functionality must take care to
> > assure that if it is used during the boot process, that it will not
> > cause the init scripts or other portions of the system startup to hang
> > indefinitely."
> >
> > Unfortunately, due to multiple factors, including not having this
> > warning written in a scary-enough language in the manpages, and due to
> > glibc since v2.25 implementing a BSD-like getentropy(3) in terms of
> > getrandom(2), modern user-space is calling getrandom(2) in the boot
> > path everywhere (e.g. Qt, GDM, etc.)
> >
> > Embedded Linux systems were first hit by this, and reports of embedded
> > systems "getting stuck at boot" began to be common. Over time, the
> > issue began to even creep into consumer-level x86 laptops: mainstream
> > distributions, like Debian Buster, began to recommend installing
> > haveged as a duct-tape workaround... just to let the system boot.
> >
> > Moreover, filesystem optimizations in EXT4 and XFS, e.g. b03755ad6f33
> > ("ext4: make __ext4_get_inode_loc plug"), which merged directory
> > lookup code inode table IO, and very fast systemd boots, further
> > exaggerated the problem by limiting interrupt-based entropy sources.
> > This led to large delays until the kernel's cryptographic random
> > number generator (CRNG) got initialized.
> >
> > On a Thinkpad E480 x86 laptop and an ArchLinux user-space, the ext4
> > commit earlier mentioned reliably blocked the system on GDM boot.
> > Mitigate the problem, as a first step, in two ways:
> >
> > 1. Issue a big WARN_ON when any process gets stuck on getrandom(2)
> > for more than CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC seconds.
> >
> > 2. Introduce new getrandom(2) flags, with clear semantics that can
> > hopefully guide user-space in doing the right thing.
> >
> > Set CONFIG_GETRANDOM_WAIT_THRESHOLD_SEC to a heuristic 30-second
> > default value. System integrators and distribution builders are deeply
> > encouraged not to increase it much: during system boot, you either
> > have entropy, or you don't. And if you didn't have entropy, it will
> > stay like this forever, because if you had, you wouldn't have blocked
> > in the first place. It's an atomic "either/or" situation, with no
> > middle ground. Please think twice.
>
> So what do we expect glibc's getentropy() to do? If it just adds the new
> flag to shut up the warning, we haven't really accomplished much.
Yes, if glibc adds GRND_SECURE_UNBOUNDED_INITIAL_WAIT to gentropy(3),
then this exercise would indeed be invalidated. Hopefully,
coordination with glibc will be done so it won't happen... @Florian?
Afterwards, a sane approach would be for gentropy(3) to be deprecated,
and to add getentropy_secure_unbounded_initial_wait(3) and
getentropy_insecure(3).
Note that this V5 patch does not claim to fully solve the problem, but
it will:
1. Pinpoint to the processes causing system boots to block
2. Tell people what correct alternative to use when facing problem
#1 above, through the proposed getrandom_wait(7) manpage. That
manpage page will fully describe the problem, and advise
user-space to either use the new getrandom flags, or the new
glibc gentropy_*() variants.
thanks,
--
Ahmed Darwish
^ permalink raw reply
* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
From: Steven Rostedt @ 2019-09-28 23:37 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Andy Lutomirski, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
LSM List, James Morris, Jann Horn, Peter Zijlstra,
Masami Hiramatsu, David S. Miller, Daniel Borkmann,
Network Development, bpf, kernel-team, Linux API
In-Reply-To: <20190829040721.ef6rumbaunkavyrr@ast-mbp.dhcp.thefacebook.com>
On Wed, 28 Aug 2019 21:07:24 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> >
> > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> >
> > 2. Improve it a bit do all the privileged ops are wrapped by capset().
> >
> > Does this make sense? I’m a security person on occasion. I find
> > vulnerabilities and exploit them deliberately and I break things by
> > accident on a regular basis. In my considered opinion, CAP_TRACING
> > alone, even extended to cover part of BPF as I’ve described, is
> > decently safe. Getting root with just CAP_TRACING will be decently
> > challenging, especially if I don’t get to read things like sshd’s
> > memory, and improvements to mitigate even that could be added. I
> > am quite confident that attacks starting with CAP_TRACING will have
> > clear audit signatures if auditing is on. I am also confident that
> > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > will only get more likely as BPF gets more widely used. And, if
> > BPF-based auditing ever becomes a thing, writing to the audit
> > daemon’s maps will be a great way to cover one’s tracks.
>
> CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> I think Steven and Massami prefer that as well.
> That includes kprobe with probe_kernel_read.
> That also means mini-DoS by installing kprobes everywhere or running
> too much ftrace.
I was talking with Kees at Plumbers about this, and we were talking
about just using simple file permissions. I started playing with some
patches to allow the tracefs be visible but by default it would only be
visible by root.
rwx------
Then a start up script (or perhaps mount options) could change the
group owner, and change this to:
rwxrwx---
Where anyone in the group assigned (say "tracing") gets full access to
the file system.
The more I was playing with this, the less I see the need for
CAP_TRACING for ftrace and reading the format files.
-- Steve
^ permalink raw reply
* Re: [PATCH -next] treewide: remove unused argument in lock_release()
From: Peter Zijlstra @ 2019-09-30 7:29 UTC (permalink / raw)
To: Qian Cai
Cc: juri.lelli, airlied, dri-devel, mhocko, alexander.levin,
joseph.qi, netdev, st, will, duyuyang, daniel, mark, mingo,
vdavydov.dev, jslaby, linux-ext4, intel-gfx, jack, mripard,
linux-fsdevel, viro, cgroups, bpf, tytso, linux-mm, linux-api,
linux-kernel, hannes, jlbec, gregkh, tj, akpm, davem, ocfs2-devel
In-Reply-To: <1568909380-32199-1-git-send-email-cai@lca.pw>
On Thu, Sep 19, 2019 at 12:09:40PM -0400, Qian Cai wrote:
> Since the commit b4adfe8e05f1 ("locking/lockdep: Remove unused argument
> in __lock_release"), @nested is no longer used in lock_release(), so
> remove it from all lock_release() calls and friends.
Right; I never did this cleanup for not wanting the churn, but as long
as it applies I'll take it.
> Signed-off-by: Qian Cai <cai@lca.pw>
> ---
> drivers/gpu/drm/drm_connector.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 6 +++---
> drivers/gpu/drm/i915/gt/intel_engine_pm.c | 2 +-
> drivers/gpu/drm/i915/i915_request.c | 2 +-
> drivers/tty/tty_ldsem.c | 8 ++++----
> fs/dcache.c | 2 +-
> fs/jbd2/transaction.c | 4 ++--
> fs/kernfs/dir.c | 4 ++--
> fs/ocfs2/dlmglue.c | 2 +-
> include/linux/jbd2.h | 2 +-
> include/linux/lockdep.h | 21 ++++++++++-----------
> include/linux/percpu-rwsem.h | 4 ++--
> include/linux/rcupdate.h | 2 +-
> include/linux/rwlock_api_smp.h | 16 ++++++++--------
> include/linux/seqlock.h | 4 ++--
> include/linux/spinlock_api_smp.h | 8 ++++----
> include/linux/ww_mutex.h | 2 +-
> include/net/sock.h | 2 +-
> kernel/bpf/stackmap.c | 2 +-
> kernel/cpu.c | 2 +-
> kernel/locking/lockdep.c | 3 +--
> kernel/locking/mutex.c | 4 ++--
> kernel/locking/rtmutex.c | 6 +++---
> kernel/locking/rwsem.c | 10 +++++-----
> kernel/printk/printk.c | 10 +++++-----
> kernel/sched/core.c | 2 +-
> lib/locking-selftest.c | 24 ++++++++++++------------
> mm/memcontrol.c | 2 +-
> net/core/sock.c | 2 +-
> tools/lib/lockdep/include/liblockdep/common.h | 3 +--
> tools/lib/lockdep/include/liblockdep/mutex.h | 2 +-
> tools/lib/lockdep/include/liblockdep/rwlock.h | 2 +-
> tools/lib/lockdep/preload.c | 16 ++++++++--------
> 33 files changed, 90 insertions(+), 93 deletions(-)
Thanks!
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply
* [PATCH v3 0/4] lib: introduce copy_struct_from_user() helper
From: Aleksa Sarai @ 2019-09-30 18:28 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner, Kees Cook
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
Patch changelog:
v3:
* Rename is_zeroed_user() to check_zeroed_user(). [Christian Brauner]
* Various minor cleanups. [Christian Brauner]
* Add tests for check_zeroed_user() and copy_struct_from_user() to
lib/test_user_copy.ko (and thus EXPORT_SYMBOL them both).
v2: <https://lore.kernel.org/lkml/20190925230332.18690-1-cyphar@cyphar.com/>
v1: <https://lore.kernel.org/lkml/20190925165915.8135-1-cyphar@cyphar.com/>
This series was split off from the openat2(2) syscall discussion[1].
However, the copy_struct_to_user() helper has been dropped, because
after some discussion it appears that there is no really obvious
semantics for how copy_struct_to_user() should work on mixed-vintages
(for instance, whether [2] is the correct semantics for all syscalls).
A common pattern for syscall extensions is increasing the size of a
struct passed from userspace, such that the zero-value of the new fields
result in the old kernel behaviour (allowing for a mix of userspace and
kernel vintages to operate on one another in most cases).
Previously there was no common lib/ function that implemented
the necessary extension-checking semantics (and different syscalls
implemented them slightly differently or incompletely[3]). This series
implements the helper and ports several syscalls to use it.
Some in-kernel selftests are included in this patch. More complete
self-tests for copy_struct_from_user() are included in the openat2()
patchset.
[1]: https://lore.kernel.org/lkml/20190904201933.10736-1-cyphar@cyphar.com/
[2]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
robustify sched_read_attr() ABI logic and code")
[3]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
similar checks to copy_struct_from_user() while rt_sigprocmask(2)
always rejects differently-sized struct arguments.
Aleksa Sarai (4):
lib: introduce copy_struct_from_user() helper
clone3: switch to copy_struct_from_user()
sched_setattr: switch to copy_struct_from_user()
perf_event_open: switch to copy_struct_from_user()
include/linux/bitops.h | 7 ++
include/linux/uaccess.h | 4 ++
include/uapi/linux/sched.h | 2 +
kernel/events/core.c | 47 +++----------
kernel/fork.c | 34 ++--------
kernel/sched/core.c | 43 ++----------
lib/strnlen_user.c | 8 +--
lib/test_user_copy.c | 133 +++++++++++++++++++++++++++++++++++--
lib/usercopy.c | 123 ++++++++++++++++++++++++++++++++++
9 files changed, 287 insertions(+), 114 deletions(-)
--
2.23.0
^ permalink raw reply
* [PATCH v3 1/4] lib: introduce copy_struct_from_user() helper
From: Aleksa Sarai @ 2019-09-30 18:28 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner, Kees Cook
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190930182810.6090-1-cyphar@cyphar.com>
A common pattern for syscall extensions is increasing the size of a
struct passed from userspace, such that the zero-value of the new fields
result in the old kernel behaviour (allowing for a mix of userspace and
kernel vintages to operate on one another in most cases).
While this interface exists for communication in both directions, only
one interface is straightforward to have reasonable semantics for
(userspace passing a struct to the kernel). For kernel returns to
userspace, what the correct semantics are (whether there should be an
error if userspace is unaware of a new extension) is very
syscall-dependent and thus probably cannot be unified between syscalls
(a good example of this problem is [1]).
Previously there was no common lib/ function that implemented
the necessary extension-checking semantics (and different syscalls
implemented them slightly differently or incompletely[2]). Future
patches replace common uses of this pattern to make use of
copy_struct_from_user().
Some in-kernel selftests that insure that the handling of alignment and
various byte patterns are all handled identically to memchr_inv() usage.
[1]: commit 1251201c0d34 ("sched/core: Fix uclamp ABI bug, clean up and
robustify sched_read_attr() ABI logic and code")
[2]: For instance {sched_setattr,perf_event_open,clone3}(2) all do do
similar checks to copy_struct_from_user() while rt_sigprocmask(2)
always rejects differently-sized struct arguments.
Suggested-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
include/linux/bitops.h | 7 +++
include/linux/uaccess.h | 4 ++
lib/strnlen_user.c | 8 +--
lib/test_user_copy.c | 133 ++++++++++++++++++++++++++++++++++++++--
lib/usercopy.c | 123 +++++++++++++++++++++++++++++++++++++
5 files changed, 262 insertions(+), 13 deletions(-)
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index cf074bce3eb3..c94a9ff9f082 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -4,6 +4,13 @@
#include <asm/types.h>
#include <linux/bits.h>
+/* Set bits in the first 'n' bytes when loaded from memory */
+#ifdef __LITTLE_ENDIAN
+# define aligned_byte_mask(n) ((1UL << 8*(n))-1)
+#else
+# define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
+#endif
+
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 70bbdc38dc37..94f20e6ec6ab 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -231,6 +231,10 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
#endif /* ARCH_HAS_NOCACHE_UACCESS */
+extern int check_zeroed_user(const void __user *from, size_t size);
+extern int copy_struct_from_user(void *dst, size_t ksize,
+ const void __user *src, size_t usize);
+
/*
* probe_kernel_read(): safely attempt to read from a location
* @dst: pointer to the buffer that shall take the data
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 28ff554a1be8..6c0005d5dd5c 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -3,16 +3,10 @@
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
+#include <linux/bitops.h>
#include <asm/word-at-a-time.h>
-/* Set bits in the first 'n' bytes when loaded from memory */
-#ifdef __LITTLE_ENDIAN
-# define aligned_byte_mask(n) ((1ul << 8*(n))-1)
-#else
-# define aligned_byte_mask(n) (~0xfful << (BITS_PER_LONG - 8 - 8*(n)))
-#endif
-
/*
* Do a strnlen, return length of string *with* final '\0'.
* 'count' is the user-supplied count, while 'max' is the
diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
index 67bcd5dfd847..3a17f71029bb 100644
--- a/lib/test_user_copy.c
+++ b/lib/test_user_copy.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/random.h>
/*
* Several 32-bit architectures support 64-bit {get,put}_user() calls.
@@ -31,14 +32,129 @@
# define TEST_U64
#endif
-#define test(condition, msg) \
-({ \
- int cond = (condition); \
- if (cond) \
- pr_warn("%s\n", msg); \
- cond; \
+#define test(condition, msg, ...) \
+({ \
+ int cond = (condition); \
+ if (cond) \
+ pr_warn("[%d] " msg "\n", __LINE__, ##__VA_ARGS__); \
+ cond; \
})
+static bool is_zeroed(void *from, size_t size)
+{
+ return memchr_inv(from, 0x0, size) == NULL;
+}
+
+static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size)
+{
+ int ret = 0;
+ size_t start, end, i;
+ size_t zero_start = size / 4;
+ size_t zero_end = size - zero_start;
+
+ /*
+ * We conduct a series of check_nonzero_user() tests on a block of memory
+ * with the following byte-pattern (trying every possible [start,end]
+ * pair):
+ *
+ * [ 00 ff 00 ff ... 00 00 00 00 ... ff 00 ff 00 ]
+ *
+ * And we verify that check_nonzero_user() acts identically to memchr_inv().
+ */
+
+ memset(kmem, 0x0, size);
+ for (i = 1; i < zero_start; i += 2)
+ kmem[i] = 0xff;
+ for (i = zero_end; i < size; i += 2)
+ kmem[i] = 0xff;
+
+ ret |= test(copy_to_user(umem, kmem, size),
+ "legitimate copy_to_user failed");
+
+ for (start = 0; start <= size; start++) {
+ for (end = start; end <= size; end++) {
+ size_t len = end - start;
+ int retval = check_zeroed_user(umem + start, len);
+ int expected = is_zeroed(kmem + start, len);
+
+ ret |= test(retval != expected,
+ "check_nonzero_user(=%d) != memchr_inv(=%d) mismatch (start=%zu, end=%zu)",
+ retval, expected, start, end);
+ }
+ }
+
+ return ret;
+}
+
+static int test_copy_struct_from_user(char *kmem, char __user *umem,
+ size_t size)
+{
+ int ret = 0;
+ char *rand = NULL, *expected = NULL;
+ size_t ksize, usize;
+
+ rand = kmalloc(size, GFP_KERNEL);
+ if (ret |= test(rand == NULL, "kmalloc failed"))
+ goto out_free;
+
+ expected = kmalloc(size, GFP_KERNEL);
+ if (ret |= test(expected == NULL, "kmalloc failed"))
+ goto out_free;
+
+ /* Fill umem with random bytes. */
+ memset(kmem, 0x0, size);
+ prandom_bytes(rand, size);
+ ret |= test(copy_to_user(umem, rand, size),
+ "legitimate copy_to_user failed");
+
+ /* Check basic case -- (usize == ksize). */
+ ksize = size;
+ usize = size;
+ memcpy(expected, rand, ksize);
+
+ ret |= test(copy_struct_from_user(kmem, ksize, umem, usize),
+ "copy_struct_from_user(usize == ksize) failed");
+ ret |= test(memcmp(kmem, expected, ksize),
+ "copy_struct_from_user(usize == ksize) gives unexpected copy");
+
+ /* Old userspace case -- (usize < ksize). */
+ ksize = size;
+ usize = ksize / 2;
+
+ memcpy(expected, rand, usize);
+ memset(expected + usize, 0x0, ksize - usize);
+
+ ret |= test(copy_struct_from_user(kmem, ksize, umem, usize),
+ "copy_struct_from_user(usize < ksize) failed");
+ ret |= test(memcmp(kmem, expected, ksize),
+ "copy_struct_from_user(usize < ksize) gives unexpected copy");
+
+ /* New userspace (-E2BIG) case -- (usize > ksize). */
+ usize = size;
+ ksize = usize / 2;
+
+ ret |= test(copy_struct_from_user(kmem, ksize, umem, usize) != -E2BIG,
+ "copy_struct_from_user(usize > ksize) didn't give E2BIG");
+
+ /* New userspace (success) case -- (usize > ksize). */
+ usize = size;
+ ksize = usize / 2;
+
+ memcpy(expected, rand, ksize);
+
+ ret |= test(clear_user(umem + ksize, usize - ksize),
+ "legitimate clear_user failed");
+ ret |= test(copy_struct_from_user(kmem, ksize, umem, usize),
+ "copy_struct_from_user(usize > ksize) failed");
+ ret |= test(memcmp(kmem, expected, ksize),
+ "copy_struct_from_user(usize > ksize) gives unexpected copy");
+
+out_free:
+ kfree(expected);
+ kfree(rand);
+ return ret;
+}
+
static int __init test_user_copy_init(void)
{
int ret = 0;
@@ -106,6 +222,11 @@ static int __init test_user_copy_init(void)
#endif
#undef test_legit
+ /* Test usage of check_nonzero_user(). */
+ ret |= test_check_nonzero_user(kmem, usermem, 2 * PAGE_SIZE);
+ /* Test usage of copy_struct_from_user(). */
+ ret |= test_copy_struct_from_user(kmem, usermem, 2 * PAGE_SIZE);
+
/*
* Invalid usage: none of these copies should succeed.
*/
diff --git a/lib/usercopy.c b/lib/usercopy.c
index c2bfbcaeb3dc..cf7f854ed9c8 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/uaccess.h>
+#include <linux/bitops.h>
/* out-of-line parts */
@@ -31,3 +32,125 @@ unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
}
EXPORT_SYMBOL(_copy_to_user);
#endif
+
+/**
+ * check_zeroed_user: check if a userspace buffer only contains zero bytes
+ * @from: Source address, in userspace.
+ * @size: Size of buffer.
+ *
+ * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for
+ * userspace addresses (and is more efficient because we don't care where the
+ * first non-zero byte is).
+ *
+ * Returns:
+ * * 0: There were non-zero bytes present in the buffer.
+ * * 1: The buffer was full of zero bytes.
+ * * -EFAULT: access to userspace failed.
+ */
+int check_zeroed_user(const void __user *from, size_t size)
+{
+ unsigned long val;
+ uintptr_t align = (uintptr_t) from % sizeof(unsigned long);
+
+ if (unlikely(size == 0))
+ return 1;
+
+ from -= align;
+ size += align;
+
+ if (!user_access_begin(from, size))
+ return -EFAULT;
+
+ unsafe_get_user(val, (unsigned long __user *) from, err_fault);
+ if (align)
+ val &= ~aligned_byte_mask(align);
+
+ while (size > sizeof(unsigned long)) {
+ if (unlikely(val))
+ goto done;
+
+ from += sizeof(unsigned long);
+ size -= sizeof(unsigned long);
+
+ unsafe_get_user(val, (unsigned long __user *) from, err_fault);
+ }
+
+ if (size < sizeof(unsigned long))
+ val &= aligned_byte_mask(size);
+
+done:
+ user_access_end();
+ return (val == 0);
+err_fault:
+ user_access_end();
+ return -EFAULT;
+}
+EXPORT_SYMBOL(check_zeroed_user);
+
+/**
+ * copy_struct_from_user: copy a struct from userspace
+ * @dst: Destination address, in kernel space. This buffer must be @ksize
+ * bytes long.
+ * @ksize: Size of @dst struct.
+ * @src: Source address, in userspace.
+ * @usize: (Alleged) size of @src struct.
+ *
+ * Copies a struct from userspace to kernel space, in a way that guarantees
+ * backwards-compatibility for struct syscall arguments (as long as future
+ * struct extensions are made such that all new fields are *appended* to the
+ * old struct, and zeroed-out new fields have the same meaning as the old
+ * struct).
+ *
+ * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
+ * The recommended usage is something like the following:
+ *
+ * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
+ * {
+ * int err;
+ * struct foo karg = {};
+ *
+ * if (usize > PAGE_SIZE)
+ * return -E2BIG;
+ * if (usize < FOO_SIZE_VER0)
+ * return -EINVAL;
+ *
+ * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ * if (err)
+ * return err;
+ *
+ * // ...
+ * }
+ *
+ * There are three cases to consider:
+ * * If @usize == @ksize, then it's copied verbatim.
+ * * If @usize < @ksize, then the userspace has passed an old struct to a
+ * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
+ * are to be zero-filled.
+ * * If @usize > @ksize, then the userspace has passed a new struct to an
+ * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
+ * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
+ *
+ * Returns (in all cases, some data may have been copied):
+ * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src.
+ * * -EFAULT: access to userspace failed.
+ */
+int copy_struct_from_user(void *dst, size_t ksize,
+ const void __user *src, size_t usize)
+{
+ size_t size = min(ksize, usize);
+ size_t rest = max(ksize, usize) - size;
+
+ /* Deal with trailing bytes. */
+ if (usize < ksize) {
+ memset(dst + size, 0, rest);
+ } else if (usize > ksize) {
+ int ret = check_zeroed_user(src + size, rest);
+ if (ret <= 0)
+ return ret ?: -E2BIG;
+ }
+ /* Copy the interoperable parts of the struct. */
+ if (copy_from_user(dst, src, size))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(copy_struct_from_user);
--
2.23.0
^ permalink raw reply related
* [PATCH v3 2/4] clone3: switch to copy_struct_from_user()
From: Aleksa Sarai @ 2019-09-30 18:28 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra, Alexander Shishkin, Jiri Olsa,
Namhyung Kim, Christian Brauner, Kees Cook
Cc: Aleksa Sarai, Rasmus Villemoes, Al Viro, Linus Torvalds,
libc-alpha, linux-api, linux-kernel
In-Reply-To: <20190930182810.6090-1-cyphar@cyphar.com>
The change is very straightforward, and helps unify the syscall
interface for struct-from-userspace syscalls. Additionally, explicitly
define CLONE_ARGS_SIZE_VER0 to match the other users of the
struct-extension pattern.
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
include/uapi/linux/sched.h | 2 ++
kernel/fork.c | 34 +++++++---------------------------
2 files changed, 9 insertions(+), 27 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..0945805982b4 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -47,6 +47,8 @@ struct clone_args {
__aligned_u64 tls;
};
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+
/*
* Scheduling policies
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index f9572f416126..2ef529869c64 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2525,39 +2525,19 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
#ifdef __ARCH_WANT_SYS_CLONE3
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
- size_t size)
+ size_t usize)
{
+ int err;
struct clone_args args;
- if (unlikely(size > PAGE_SIZE))
+ if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
-
- if (unlikely(size < sizeof(struct clone_args)))
+ if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
return -EINVAL;
- if (unlikely(!access_ok(uargs, size)))
- return -EFAULT;
-
- if (size > sizeof(struct clone_args)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uargs + sizeof(struct clone_args);
- end = (void __user *)uargs + size;
-
- for (; addr < end; addr++) {
- if (get_user(val, addr))
- return -EFAULT;
- if (val)
- return -E2BIG;
- }
-
- size = sizeof(struct clone_args);
- }
-
- if (copy_from_user(&args, uargs, size))
- return -EFAULT;
+ err = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (err)
+ return err;
/*
* Verify that higher 32bits of exit_signal are unset and that
--
2.23.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox