Linux userland API discussions
 help / color / mirror / Atom feed
* [RFC PATCH 20/21] selinux: Implement the watch_key security hook
From: David Howells @ 2019-10-15 21:50 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Implement the watch_key security hook to make sure that a key grants the
caller View permission in order to set a watch on a key.

For the moment, the watch_devices security hook is left unimplemented as
it's not obvious what the object should be since the queue is global and
didn't previously exist.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
---

 security/selinux/hooks.c |   14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 9625b99e677f..53637dccee00 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -6579,6 +6579,17 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 	*_buffer = context;
 	return rc;
 }
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+static int selinux_watch_key(struct key *key)
+{
+	struct key_security_struct *ksec = key->security;
+	u32 sid = current_sid();
+
+	return avc_has_perm(&selinux_state,
+			    sid, ksec->sid, SECCLASS_KEY, KEY_NEED_VIEW, NULL);
+}
+#endif
 #endif
 
 #ifdef CONFIG_SECURITY_INFINIBAND
@@ -7012,6 +7023,9 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(key_free, selinux_key_free),
 	LSM_HOOK_INIT(key_permission, selinux_key_permission),
 	LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	LSM_HOOK_INIT(watch_key, selinux_watch_key),
+#endif
 #endif
 
 #ifdef CONFIG_AUDIT

^ permalink raw reply related

* [RFC PATCH 21/21] smack: Implement the watch_key and post_notification hooks
From: David Howells @ 2019-10-15 21:50 UTC (permalink / raw)
  To: torvalds
  Cc: dhowells, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Implement the watch_key security hook in Smack to make sure that a key
grants the caller Read permission in order to set a watch on a key.

Also implement the post_notification security hook to make sure that the
notification source is granted Write permission by the watch queue.

For the moment, the watch_devices security hook is left unimplemented as
it's not obvious what the object should be since the queue is global and
didn't previously exist.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
---

 include/linux/lsm_audit.h  |    1 +
 security/smack/smack_lsm.c |   82 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 915330abf6e5..734d67889826 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -74,6 +74,7 @@ struct common_audit_data {
 #define LSM_AUDIT_DATA_FILE	12
 #define LSM_AUDIT_DATA_IBPKEY	13
 #define LSM_AUDIT_DATA_IBENDPORT 14
+#define LSM_AUDIT_DATA_NOTIFICATION 15
 	union 	{
 		struct path path;
 		struct dentry *dentry;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index ecea41ce919b..71b6f37d49c1 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4273,7 +4273,7 @@ static int smack_key_permission(key_ref_t key_ref,
 	if (tkp == NULL)
 		return -EACCES;
 
-	if (smack_privileged_cred(CAP_MAC_OVERRIDE, cred))
+	if (smack_privileged(CAP_MAC_OVERRIDE))
 		return 0;
 
 #ifdef CONFIG_AUDIT
@@ -4319,8 +4319,81 @@ static int smack_key_getsecurity(struct key *key, char **_buffer)
 	return length;
 }
 
+
+#ifdef CONFIG_KEY_NOTIFICATIONS
+/**
+ * smack_watch_key - Smack access to watch a key for notifications.
+ * @key: The key to be watched
+ *
+ * Return 0 if the @watch->cred has permission to read from the key object and
+ * an error otherwise.
+ */
+static int smack_watch_key(struct key *key)
+{
+	struct smk_audit_info ad;
+	struct smack_known *tkp = smk_of_current();
+	int rc;
+
+	if (key == NULL)
+		return -EINVAL;
+	/*
+	 * If the key hasn't been initialized give it access so that
+	 * it may do so.
+	 */
+	if (key->security == NULL)
+		return 0;
+	/*
+	 * This should not occur
+	 */
+	if (tkp == NULL)
+		return -EACCES;
+
+	if (smack_privileged_cred(CAP_MAC_OVERRIDE, current_cred()))
+		return 0;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+	ad.a.u.key_struct.key = key->serial;
+	ad.a.u.key_struct.key_desc = key->description;
+#endif
+	rc = smk_access(tkp, key->security, MAY_READ, &ad);
+	rc = smk_bu_note("key watch", tkp, key->security, MAY_READ, rc);
+	return rc;
+}
+#endif /* CONFIG_KEY_NOTIFICATIONS */
 #endif /* CONFIG_KEYS */
 
+#ifdef CONFIG_WATCH_QUEUE
+/**
+ * smack_post_notification - Smack access to post a notification to a queue
+ * @w_cred: The credentials of the watcher.
+ * @cred: The credentials of the event source (may be NULL).
+ * @n: The notification message to be posted.
+ */
+static int smack_post_notification(const struct cred *w_cred,
+				   const struct cred *cred,
+				   struct watch_notification *n)
+{
+	struct smk_audit_info ad;
+	struct smack_known *subj, *obj;
+	int rc;
+
+	/* Always let maintenance notifications through. */
+	if (n->type == WATCH_TYPE_META)
+		return 0;
+
+	if (!cred)
+		return 0;
+	subj = smk_of_task(smack_cred(cred));
+	obj = smk_of_task(smack_cred(w_cred));
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NOTIFICATION);
+	rc = smk_access(subj, obj, MAY_WRITE, &ad);
+	rc = smk_bu_note("notification", subj, obj, MAY_WRITE, rc);
+	return rc;
+}
+#endif /* CONFIG_WATCH_QUEUE */
+
 /*
  * Smack Audit hooks
  *
@@ -4709,8 +4782,15 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(key_free, smack_key_free),
 	LSM_HOOK_INIT(key_permission, smack_key_permission),
 	LSM_HOOK_INIT(key_getsecurity, smack_key_getsecurity),
+#ifdef CONFIG_KEY_NOTIFICATIONS
+	LSM_HOOK_INIT(watch_key, smack_watch_key),
+#endif
 #endif /* CONFIG_KEYS */
 
+#ifdef CONFIG_WATCH_QUEUE
+	LSM_HOOK_INIT(post_notification, smack_post_notification),
+#endif
+
  /* Audit hooks */
 #ifdef CONFIG_AUDIT
 	LSM_HOOK_INIT(audit_rule_init, smack_audit_rule_init),

^ permalink raw reply related

* Re: [RFC PATCH 00/21] pipe: Keyrings, Block and USB notifications
From: James Morris @ 2019-10-15 22:11 UTC (permalink / raw)
  To: David Howells
  Cc: torvalds, Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

On Tue, 15 Oct 2019, David Howells wrote:

> 
> Here's a set of patches to add a general notification queue concept and to
> add event sources such as:

On the next iteration, please include the rationale for this per your 
previous posting: https://lkml.org/lkml/2019/9/5/857


-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: Linus Torvalds @ 2019-10-15 22:14 UTC (permalink / raw)
  To: David Howells, Tim Chen, Kan Liang
  Cc: Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	Nicolas Dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, LSM List, linux-fsdevel, Linux API,
	Linux Kernel Mailing List
In-Reply-To: <157117608708.15019.1998141309054662114.stgit@warthog.procyon.org.uk>

On Tue, Oct 15, 2019 at 2:48 PM David Howells <dhowells@redhat.com> wrote:
>
> Add a wakeup call for a case whereby the caller already has the waitqueue
> spinlock held.

That naming is crazy.

We already have helper functions like this, and they are just called
"wake_up_locked()".

So the "prelocked" naming is just odd. Make it be
wake_up_interruptible_sync_poll_locked().

The helper function should likely be

  void __wake_up_locked_sync_key(struct wait_queue_head *wq_head,
unsigned int mode, void *key)
  {
        __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL);
  }
  EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key);

to match the other naming patterns there.

[ Unrelated ]

Looking at that mess of functions, I also wonder if we should try to
just remove the bookmark code again. It was cute, and it was useful,
but I think the problem with the page lock list may have been fixed by
commit 9a1ea439b16b ("mm: put_and_wait_on_page_locked() while page is
migrated") which avoids the retry condition with
migrate_page_move_mapping().

Tim/Kan? Do you have the problematic load still?

              Linus

^ permalink raw reply

* Re: [RFC PATCH 08/21] pipe: Check for ring full inside of the spinlock in pipe_write()
From: Linus Torvalds @ 2019-10-15 22:20 UTC (permalink / raw)
  To: David Howells
  Cc: Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	Nicolas Dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, LSM List, linux-fsdevel, Linux API,
	Linux Kernel Mailing List
In-Reply-To: <157117614109.15019.15677943675625422728.stgit@warthog.procyon.org.uk>

On Tue, Oct 15, 2019 at 2:49 PM David Howells <dhowells@redhat.com> wrote:
>
> +                       if (head - pipe->tail == buffers) {

Can we just have helper inline functions for these things?

You describe them in the commit message of 03/21 (good), but it would
be even better if the code was just self-describing..

           Linus

^ permalink raw reply

* Re: [RFC PATCH 00/21] pipe: Keyrings, Block and USB notifications
From: Linus Torvalds @ 2019-10-15 22:32 UTC (permalink / raw)
  To: David Howells
  Cc: Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	Nicolas Dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, LSM List, linux-fsdevel, Linux API,
	Linux Kernel Mailing List
In-Reply-To: <157117606853.15019.15459271147790470307.stgit@warthog.procyon.org.uk>

Aside from the two small comments, the pipe side looked reasonable,
but I stopped looking when the patches moved on to the notificaiton
part, and maybe I missed something in the earlier ones too.

Which does bring me to the meat of this email: can we please keep the
pipe cleanups and prepwork (and benchmarking) as a separate patch
series? I'd like that to be done separately from the notification
code, since it's re-organization and cleanup - while the eventual goal
is to be able to add messages to the pipe atomically, I think the
series makes sense (and should make sense) on its own.

          Linus

^ permalink raw reply

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: David Howells @ 2019-10-15 22:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Tim Chen, Kan Liang, Casey Schaufler, Stephen Smalley,
	Greg Kroah-Hartman, Nicolas Dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, LSM List, linux-fsdevel,
	Linux API, Linux Kernel Mailing List
In-Reply-To: <CAHk-=whiz1sHXu8SVZKEC2dup=r5JMrftPtEt6ff9Ea8dyH8yQ@mail.gmail.com>

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> > Add a wakeup call for a case whereby the caller already has the waitqueue
> > spinlock held.
> 
> That naming is crazy.

Sorry, yeah.  This is a bit hacked together at the moment and needs some more
tidying up.  I've had a lot of problems with performance regressions of up to
40% from seemingly simple changes involving moving locks around - it turns out
that the problem was that I was running with kasan enabled.

David

^ permalink raw reply

* Re: [PATCH 0/7] Harden userfaultfd
From: James Morris @ 2019-10-16  0:02 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: linux-api, linux-kernel, lokeshgidra, nnk, nosh, timmurray
In-Reply-To: <20191012191602.45649-1-dancol@google.com>

On Sat, 12 Oct 2019, Daniel Colascione wrote:

>  Documentation/admin-guide/sysctl/vm.rst | 19 +++++-
>  fs/anon_inodes.c                        | 89 +++++++++++++++++--------
>  fs/userfaultfd.c                        | 47 +++++++++++--
>  include/linux/anon_inodes.h             | 27 ++++++--
>  include/linux/lsm_hooks.h               |  8 +++
>  include/linux/security.h                |  2 +
>  include/linux/userfaultfd_k.h           |  3 +
>  include/uapi/linux/userfaultfd.h        | 14 ++++
>  kernel/sysctl.c                         |  9 +++
>  security/security.c                     |  8 +++
>  security/selinux/hooks.c                | 68 +++++++++++++++++++
>  security/selinux/include/classmap.h     |  2 +
>  12 files changed, 256 insertions(+), 40 deletions(-)

For any changes to security/ please include the linux-security-module 
list.

-- 
James Morris
<jmorris@namei.org>

^ permalink raw reply

* Re: [RFC PATCH 03/21] pipe: Use head and tail pointers for the ring, not cursor and length
From: Rasmus Villemoes @ 2019-10-16  7:46 UTC (permalink / raw)
  To: David Howells, torvalds
  Cc: Casey Schaufler, Stephen Smalley, Greg Kroah-Hartman,
	nicolas.dichtel, raven, Christian Brauner, keyrings, linux-usb,
	linux-block, linux-security-module, linux-fsdevel, linux-api,
	linux-kernel
In-Reply-To: <157117609543.15019.17103851546424902507.stgit@warthog.procyon.org.uk>

On 15/10/2019 23.48, David Howells wrote:
> Convert pipes to use head and tail pointers for the buffer ring rather than
> pointer and length as the latter requires two atomic ops to update (or a
> combined op) whereas the former only requires one.
> 
>  (1) The head pointer is the point at which production occurs and points to
>      the slot in which the next buffer will be placed.  This is equivalent
>      to pipe->curbuf + pipe->nrbufs.
> 
>      The head pointer belongs to the write-side.
> 
>  (2) The tail pointer is the point at which consumption occurs.  It points
>      to the next slot to be consumed.  This is equivalent to pipe->curbuf.
> 
>      The tail pointer belongs to the read-side.
> 
>  (3) head and tail are allowed to run to UINT_MAX and wrap naturally.  They
>      are only masked off when the array is being accessed, e.g.:
> 
> 	pipe->bufs[head & mask]
> 
>      This means that it is not necessary to have a dead slot in the ring as
>      head == tail isn't ambiguous.
> 
>  (4) The ring is empty if "head == tail".
> 
>  (5) The occupancy of the ring is "head - tail".
> 
>  (6) The number of free slots in the ring is "(tail + pipe->ring_size) -
>      head".

Seems an odd way of writing pipe->ring_size - (head - tail) ; i.e.
obviously #free slots is #size minus #occupancy.

>  (7) The ring is full if "head >= (tail + pipe->ring_size)", which can also
>      be written as "head - tail >= pipe->ring_size".
>

No it cannot, it _must_ be written in the latter form. Assuming
sizeof(int)==1 for simplicity, consider ring_size = 16, tail = 240.
Regardless whether head is 240, 241, ..., 255, 0, tail + ring_size wraps
to 0, so the former expression states the ring is full in all cases.

Better spell out somewhere that while head and tail are free-running, at
any point in time they satisfy the invariant head - tail <= pipe_size
(and also 0 <= head - tail, but that's a tautology for unsigned
ints...). Then it's a matter of taste if one wants to write "full" as
head-tail == pipe_size or head-tail >= pipe_size.

> Also split pipe->buffers into pipe->ring_size (which indicates the size of the
> ring) and pipe->max_usage (which restricts the amount of ring that write() is
> allowed to fill).  This allows for a pipe that is both writable by the kernel
> notification facility and by userspace, allowing plenty of ring space for
> notifications to be added whilst preventing userspace from being able to use
> up too much buffer space.

That seems like something that should be added in a separate patch -
adding ->max_usage and switching appropriate users of ->ring_size over,
so it's more clear where you're using one or the other.

> @@ -1949,8 +1950,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>  
>  	pipe_lock(pipe);
>  
> -	bufs = kvmalloc_array(pipe->nrbufs, sizeof(struct pipe_buffer),
> -			      GFP_KERNEL);
> +	head = pipe->head;
> +	tail = pipe->tail;
> +	mask = pipe->ring_size - 1;
> +	count = head - tail;
> +
> +	bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
>  	if (!bufs) {
>  		pipe_unlock(pipe);
>  		return -ENOMEM;
> @@ -1958,8 +1963,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>  
>  	nbuf = 0;
>  	rem = 0;
> -	for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
> -		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
> +	for (idx = tail; idx < head && rem < len; idx++)
> +		rem += pipe->bufs[idx & mask].len;
>  
>  	ret = -EINVAL;
>  	if (rem < len)
> @@ -1970,16 +1975,16 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
>  		struct pipe_buffer *ibuf;
>  		struct pipe_buffer *obuf;
>  
> -		BUG_ON(nbuf >= pipe->buffers);
> -		BUG_ON(!pipe->nrbufs);
> -		ibuf = &pipe->bufs[pipe->curbuf];
> +		BUG_ON(nbuf >= pipe->ring_size);
> +		BUG_ON(tail == head);
> +		ibuf = &pipe->bufs[tail];

I don't see where tail gets masked between tail = pipe->tail; above and
here, but I may be missing it. In any case, how about seeding head and
tail with something like 1<<20 when creating the pipe so bugs like that
are hit more quickly.

> @@ -515,17 +525,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
>  static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  {
>  	struct pipe_inode_info *pipe = filp->private_data;
> -	int count, buf, nrbufs;
> +	int count, head, tail, mask;
>  
>  	switch (cmd) {
>  		case FIONREAD:
>  			__pipe_lock(pipe);
>  			count = 0;
> -			buf = pipe->curbuf;
> -			nrbufs = pipe->nrbufs;
> -			while (--nrbufs >= 0) {
> -				count += pipe->bufs[buf].len;
> -				buf = (buf+1) & (pipe->buffers - 1);
> +			head = pipe->head;
> +			tail = pipe->tail;
> +			mask = pipe->ring_size - 1;
> +
> +			while (tail < head) {
> +				count += pipe->bufs[tail & mask].len;
> +				tail++;
>  			}

This is broken if head has wrapped but tail has not. It has to be "while
(head - tail)" or perhaps just "while (tail != head)" or something along
those lines.

> @@ -1086,17 +1104,21 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
>  	}
>  
>  	/*
> -	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
> -	 * expect a lot of shrink+grow operations, just free and allocate
> -	 * again like we would do for growing. If the pipe currently
> +	 * We can shrink the pipe, if arg is greater than the ring occupancy.
> +	 * Since we don't expect a lot of shrink+grow operations, just free and
> +	 * allocate again like we would do for growing.  If the pipe currently
>  	 * contains more buffers than arg, then return busy.
>  	 */
> -	if (nr_pages < pipe->nrbufs) {
> +	mask = pipe->ring_size - 1;
> +	head = pipe->head & mask;
> +	tail = pipe->tail & mask;
> +	n = pipe->head - pipe->tail;

I think it's confusing to "premask" head and tail here. Can you either
drop that (pipe_set_size should hardly be a hot path?), or perhaps call
them something else to avoid a future reader seeing an unmasked
bufs[head] and thinking that's a bug?

> @@ -1254,9 +1290,10 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
>  		   struct page **pages, size_t maxsize, unsigned maxpages,
>  		   size_t *start)
>  {
> +	unsigned int p_tail;
> +	unsigned int i_head;
>  	unsigned npages;
>  	size_t capacity;
> -	int idx;
>  
>  	if (!maxsize)
>  		return 0;
> @@ -1264,12 +1301,15 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
>  	if (!sanity(i))
>  		return -EFAULT;
>  
> -	data_start(i, &idx, start);
> -	/* some of this one + all after this one */
> -	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
> -	capacity = min(npages,maxpages) * PAGE_SIZE - *start;
> +	data_start(i, &i_head, start);
> +	p_tail = i->pipe->tail;
> +	/* Amount of free space: some of this one + all after this one */
> +	npages = (p_tail + i->pipe->ring_size) - i_head;

Hm, it's not clear that this is equivalent to the old computation. Since
it seems repeated in a few places, could it be factored to a little
helper (before this patch) and the "some of this one + all after this
one" comment perhaps expanded to explain what is going on?

Rasmus

^ permalink raw reply

* Re: [RFC PATCH v2 3/5] btrfs: generalize btrfs_lookup_bio_sums_dio()
From: Nikolay Borisov @ 2019-10-16  9:22 UTC (permalink / raw)
  To: Omar Sandoval, linux-btrfs, linux-fsdevel
  Cc: kernel-team, Dave Chinner, Jann Horn, linux-api
In-Reply-To: <01fdb646d7572f7d0d123937835db5c605e25a5e.1571164762.git.osandov@fb.com>



On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> This isn't actually dio-specific; it just looks up the csums starting at
> the given offset instead of using the page index. Rename it to
> btrfs_lookup_bio_sums_at_offset() and add the dst parameter. We might
> even want to expose __btrfs_lookup_bio_sums() as the public API instead
> of having two trivial wrappers, but I'll leave that for another day.

IMO exposing btrfs_lookup_bio_sums and adding proper kernel doc for its
parameters is the correct way forward. Consider doing this if the
general direction of this patchset is accepted and before sending the
final revision.

^ permalink raw reply

* Re: [RFC PATCH v2 2/5] fs: add RWF_ENCODED for reading/writing compressed data
From: Nikolay Borisov @ 2019-10-16  9:50 UTC (permalink / raw)
  To: Omar Sandoval, linux-btrfs, linux-fsdevel
  Cc: kernel-team, Dave Chinner, Jann Horn, linux-api
In-Reply-To: <7f98cf5409cf2b583cd5b3451fc739fd3428873b.1571164762.git.osandov@fb.com>



On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> Btrfs supports transparent compression: data written by the user can be
> compressed when written to disk and decompressed when read back.
> However, we'd like to add an interface to write pre-compressed data
> directly to the filesystem, and the matching interface to read
> compressed data without decompressing it. This adds support for
> so-called "encoded I/O" via preadv2() and pwritev2().
> 
> A new RWF_ENCODED flags indicates that a read or write is "encoded". If
> this flag is set, iov[0].iov_base points to a struct encoded_iov which
> is used for metadata: namely, the compression algorithm, unencoded
> (i.e., decompressed) length, and what subrange of the unencoded data

In the future when encryption is also supported. What should be the
mechanism to enforce ordering of encoding operations i.e. first compress
then encrypt => uncoded_len should be the resulting size after the
encrypt operation. To me (not being a cryptographer) this seems the
sensible thing to do since compression will be effective that way.
However, what if , for whatever reasons, a different filesystem wants to
support this interface but chooses to do it the other around -> encrypt,
then compress?

> should be used (needed for truncated or hole-punched extents and when
> reading in the middle of an extent). For reads, the filesystem returns
> this information; for writes, the caller provides it to the filesystem.
> iov[0].iov_len must be set to sizeof(struct encoded_iov), which can be
> used to extend the interface in the future. The remaining iovecs contain
> the encoded extent.
> 
> Filesystems must indicate that they support encoded writes by setting
> FMODE_ENCODED_IO in ->file_open().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  include/linux/fs.h      | 14 +++++++
>  include/uapi/linux/fs.h | 26 ++++++++++++-
>  mm/filemap.c            | 82 ++++++++++++++++++++++++++++++++++-------
>  3 files changed, 108 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index e0d909d35763..54681f21e05e 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
>  /* File does not contribute to nr_files count */
>  #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
>  
> +/* File supports encoded IO */
> +#define FMODE_ENCODED_IO	((__force fmode_t)0x40000000)
> +
>  /*
>   * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
>   * that indicates that they should check the contents of the iovec are
> @@ -314,6 +317,7 @@ enum rw_hint {
>  #define IOCB_SYNC		(1 << 5)
>  #define IOCB_WRITE		(1 << 6)
>  #define IOCB_NOWAIT		(1 << 7)
> +#define IOCB_ENCODED		(1 << 8)
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -3088,6 +3092,11 @@ extern int sb_min_blocksize(struct super_block *, int);
>  extern int generic_file_mmap(struct file *, struct vm_area_struct *);
>  extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
>  extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
> +struct encoded_iov;
> +extern int generic_encoded_write_checks(struct kiocb *, struct encoded_iov *);
> +extern ssize_t check_encoded_read(struct kiocb *, struct iov_iter *);
> +extern int import_encoded_write(struct kiocb *, struct encoded_iov *,
> +				struct iov_iter *);
>  extern int generic_remap_checks(struct file *file_in, loff_t pos_in,
>  				struct file *file_out, loff_t pos_out,
>  				loff_t *count, unsigned int remap_flags);
> @@ -3403,6 +3412,11 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
>  			return -EOPNOTSUPP;
>  		ki->ki_flags |= IOCB_NOWAIT;
>  	}
> +	if (flags & RWF_ENCODED) {
> +		if (!(ki->ki_filp->f_mode & FMODE_ENCODED_IO))
> +			return -EOPNOTSUPP;
> +		ki->ki_flags |= IOCB_ENCODED;
> +	}
>  	if (flags & RWF_HIPRI)
>  		ki->ki_flags |= IOCB_HIPRI;
>  	if (flags & RWF_DSYNC)
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index 379a612f8f1d..ed92a8a257cb 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -284,6 +284,27 @@ struct fsxattr {
>  
>  typedef int __bitwise __kernel_rwf_t;
>  
> +enum {
> +	ENCODED_IOV_COMPRESSION_NONE,
> +	ENCODED_IOV_COMPRESSION_ZLIB,
> +	ENCODED_IOV_COMPRESSION_LZO,
> +	ENCODED_IOV_COMPRESSION_ZSTD,
> +	ENCODED_IOV_COMPRESSION_TYPES = ENCODED_IOV_COMPRESSION_ZSTD,
> +};
> +
> +enum {
> +	ENCODED_IOV_ENCRYPTION_NONE,
> +	ENCODED_IOV_ENCRYPTION_TYPES = ENCODED_IOV_ENCRYPTION_NONE,
> +};
> +
> +struct encoded_iov {
> +	__u64 len;
> +	__u64 unencoded_len;
> +	__u64 unencoded_offset;
> +	__u32 compression;
> +	__u32 encryption;
> +};
> +
>  /* high priority request, poll if possible */
>  #define RWF_HIPRI	((__force __kernel_rwf_t)0x00000001)
>  
> @@ -299,8 +320,11 @@ typedef int __bitwise __kernel_rwf_t;
>  /* per-IO O_APPEND */
>  #define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
>  
> +/* encoded (e.g., compressed or encrypted) IO */

nit: s/or/and\/or/ or both are exclusive?

> +#define RWF_ENCODED	((__force __kernel_rwf_t)0x00000020)
> +
>  /* mask of flags supported by the kernel */
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
> -			 RWF_APPEND)
> +			 RWF_APPEND | RWF_ENCODED)
>  
>  #endif /* _UAPI_LINUX_FS_H */
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 1146fcfa3215..d2e6d9caf353 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2948,24 +2948,15 @@ static int generic_write_check_limits(struct file *file, loff_t pos,
>  	return 0;
>  }
>  
> -/*
> - * Performs necessary checks before doing a write
> - *
> - * Can adjust writing position or amount of bytes to write.
> - * Returns appropriate error code that caller should return or
> - * zero in case that write should be allowed.
> - */
> -inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +static int generic_write_checks_common(struct kiocb *iocb, loff_t *count)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file->f_mapping->host;
> -	loff_t count;
> -	int ret;
>  
>  	if (IS_SWAPFILE(inode))
>  		return -ETXTBSY;
>  
> -	if (!iov_iter_count(from))
> +	if (!*count)
>  		return 0;
>  
>  	/* FIXME: this is for backwards compatibility with 2.4 */
> @@ -2975,8 +2966,21 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
>  		return -EINVAL;
>  
> -	count = iov_iter_count(from);
> -	ret = generic_write_check_limits(file, iocb->ki_pos, &count);
> +	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
> +}
> +
> +/*
> + * Performs necessary checks before doing a write
> + *
> + * Can adjust writing position or amount of bytes to write.
> + * Returns a negative errno or the new number of bytes to write.
> + */
> +inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	loff_t count = iov_iter_count(from);
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);
>  	if (ret)
>  		return ret;
>  
> @@ -2985,6 +2989,58 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  }
>  EXPORT_SYMBOL(generic_write_checks);
>  
> +int generic_encoded_write_checks(struct kiocb *iocb,
> +				 struct encoded_iov *encoded)
> +{
> +	loff_t count = encoded->unencoded_len;
> +	int ret;
> +
> +	ret = generic_write_checks_common(iocb, &count);

That's a bit confusing. You will only ever write encoded len bytes, yet
you check the unencoded len. Presumably that's to ensure the data can be
read back successfully? Still it feels a bit odd. IMO this warrants a
comment.

When you use this function in patch 5 all the checks are performed
against unencoded_len yet you do :

count = encoded.len;

> +	if (ret)
> +		return ret;
> +
> +	if (count != encoded->unencoded_len) {
> +		/*
> +		 * The write got truncated by generic_write_checks_common(). We
> +		 * can't do a partial encoded write.
> +		 */
> +		return -EFBIG;
> +	}
> +	return 0;
> +}
> +EXPORT_SYMBOL(generic_encoded_write_checks);
> +
> +ssize_t check_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(iter) != sizeof(struct encoded_iov))
> +		return -EINVAL;
> +	return iov_iter_count(iter) - sizeof(struct encoded_iov);
> +}
> +EXPORT_SYMBOL(check_encoded_read);
> +
> +int import_encoded_write(struct kiocb *iocb, struct encoded_iov *encoded,

nit: This might be just me but 'import' doesn't sound right, how about
parse_encoded_write ?


> +			 struct iov_iter *from)
> +{
> +	if (!(iocb->ki_filp->f_flags & O_ENCODED))
> +		return -EPERM;
> +	if (iov_iter_single_seg_count(from) != sizeof(*encoded))
> +		return -EINVAL;
> +	if (copy_from_iter(encoded, sizeof(*encoded), from) != sizeof(*encoded))
> +		return -EFAULT;
> +	if (encoded->compression == ENCODED_IOV_COMPRESSION_NONE &&
> +	    encoded->encryption == ENCODED_IOV_ENCRYPTION_NONE)
> +		return -EINVAL;
> +	if (encoded->compression > ENCODED_IOV_COMPRESSION_TYPES ||
> +	    encoded->encryption > ENCODED_IOV_ENCRYPTION_TYPES)
> +		return -EINVAL;
> +	if (encoded->unencoded_offset >= encoded->unencoded_len)
> +		return -EINVAL;
> +	return 0;
> +}
> +EXPORT_SYMBOL(import_encoded_write);
> +
>  /*
>   * Performs necessary checks before doing a clone.
>   *
> 

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Vincenzo Frascino @ 2019-10-16 10:27 UTC (permalink / raw)
  To: Dmitry Safonov, linux-kernel
  Cc: Dmitry Safonov, Andrei Vagin, Adrian Reber, Andy Lutomirski,
	Arnd Bergmann, Christian Brauner, Cyrill Gorcunov,
	Eric W. Biederman, H. Peter Anvin, Ingo Molnar, Jann Horn,
	Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	Thomas Gleixner, containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <20191011012341.846266-2-dima@arista.com>

Hi Andrei and Dmitry,

On 10/11/19 2:23 AM, Dmitry Safonov wrote:
> From: Andrei Vagin <avagin@openvz.org>
> 
> Time Namespace isolates clock values.
> 
> The kernel provides access to several clocks CLOCK_REALTIME,
> CLOCK_MONOTONIC, CLOCK_BOOTTIME, etc.
> 
> CLOCK_REALTIME
>       System-wide clock that measures real (i.e., wall-clock) time.
> 
> CLOCK_MONOTONIC
>       Clock that cannot be set and represents monotonic time since
>       some unspecified starting point.
> 
> CLOCK_BOOTTIME
>       Identical to CLOCK_MONOTONIC, except it also includes any time
>       that the system is suspended.
> 
> For many users, the time namespace means the ability to changes date and
> time in a container (CLOCK_REALTIME).
> 
> But in a context of the checkpoint/restore functionality, monotonic and
> bootime clocks become interesting. Both clocks are monotonic with
> unspecified staring points. These clocks are widely used to measure time
> slices and set timers. After restoring or migrating processes, we have to
> guarantee that they never go backward. In an ideal case, the behavior of
> these clocks should be the same as for a case when a whole system is
> suspended. All this means that we need to be able to set CLOCK_MONOTONIC
> and CLOCK_BOOTTIME clocks, what can be done by adding per-namespace
> offsets for clocks.
> 
> A time namespace is similar to a pid namespace in a way how it is
> created: unshare(CLONE_NEWTIME) system call creates a new time namespace,
> but doesn't set it to the current process. Then all children of
> the process will be born in the new time namespace, or a process can
> use the setns() system call to join a namespace.
> 
> This scheme allows setting clock offsets for a namespace, before any
> processes appear in it.
> 
> All available clone flags have been used, so CLONE_NEWTIME uses the
> highest bit of CSIGNAL. It means that we can use it with the unshare()
> system call only. Rith now, this works for us, because time namespace
> offsets can be set only when a new time namespace is not populated. In a
> future, we will have the clone3() system call [1] which will allow to use
> the CSIGNAL mask for clone flags.
> 
> [1]: httmps://lkml.kernel.org/r/20190604160944.4058-1-christian@brauner.io
> 
> Link: https://criu.org/Time_namespace
> Link: https://lists.openvz.org/pipermail/criu/2018-June/041504.html
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> Co-developed-by: Dmitry Safonov <dima@arista.com>
> Signed-off-by: Dmitry Safonov <dima@arista.com>
> ---
>  MAINTAINERS                    |   2 +
>  fs/proc/namespaces.c           |   4 +
>  include/linux/nsproxy.h        |   2 +
>  include/linux/proc_ns.h        |   3 +
>  include/linux/time_namespace.h |  66 ++++++++++
>  include/linux/user_namespace.h |   1 +
>  include/uapi/linux/sched.h     |   6 +
>  init/Kconfig                   |   7 ++
>  kernel/fork.c                  |  16 ++-
>  kernel/nsproxy.c               |  41 +++++--
>  kernel/time/Makefile           |   1 +
>  kernel/time/namespace.c        | 217 +++++++++++++++++++++++++++++++++
>  12 files changed, 356 insertions(+), 10 deletions(-)
>  create mode 100644 include/linux/time_namespace.h
>  create mode 100644 kernel/time/namespace.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index d44d6732510d..cabe7bddbf69 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -13009,6 +13009,8 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
>  S:	Maintained
>  F:	fs/timerfd.c
>  F:	include/linux/timer*
> +F:	include/linux/time_namespace.h
> +F:	kernel/time_namespace.c
>  F:	kernel/time/*timer*
>  
>  POWER MANAGEMENT CORE
> diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
> index dd2b35f78b09..8b5c720fe5d7 100644
> --- a/fs/proc/namespaces.c
> +++ b/fs/proc/namespaces.c
> @@ -33,6 +33,10 @@ static const struct proc_ns_operations *ns_entries[] = {
>  #ifdef CONFIG_CGROUPS
>  	&cgroupns_operations,
>  #endif
> +#ifdef CONFIG_TIME_NS
> +	&timens_operations,
> +	&timens_for_children_operations,
> +#endif
>  };
>  
>  static const char *proc_ns_get_link(struct dentry *dentry,
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 2ae1b1a4d84d..074f395b9ad2 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -35,6 +35,8 @@ struct nsproxy {
>  	struct mnt_namespace *mnt_ns;
>  	struct pid_namespace *pid_ns_for_children;
>  	struct net 	     *net_ns;
> +	struct time_namespace *time_ns;
> +	struct time_namespace *time_ns_for_children;
>  	struct cgroup_namespace *cgroup_ns;
>  };
>  extern struct nsproxy init_nsproxy;
> diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
> index d31cb6215905..d312e6281e69 100644
> --- a/include/linux/proc_ns.h
> +++ b/include/linux/proc_ns.h
> @@ -32,6 +32,8 @@ extern const struct proc_ns_operations pidns_for_children_operations;
>  extern const struct proc_ns_operations userns_operations;
>  extern const struct proc_ns_operations mntns_operations;
>  extern const struct proc_ns_operations cgroupns_operations;
> +extern const struct proc_ns_operations timens_operations;
> +extern const struct proc_ns_operations timens_for_children_operations;
>  
>  /*
>   * We always define these enumerators
> @@ -43,6 +45,7 @@ enum {
>  	PROC_USER_INIT_INO	= 0xEFFFFFFDU,
>  	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
>  	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
> +	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
>  };
>  
>  #ifdef CONFIG_PROC_FS
> diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h
> new file mode 100644
> index 000000000000..873b908c9ba8
> --- /dev/null
> +++ b/include/linux/time_namespace.h
> @@ -0,0 +1,66 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_TIMENS_H
> +#define _LINUX_TIMENS_H
> +
> +
> +#include <linux/sched.h>
> +#include <linux/kref.h>
> +#include <linux/nsproxy.h>
> +#include <linux/ns_common.h>
> +#include <linux/err.h>
> +
> +struct user_namespace;
> +extern struct user_namespace init_user_ns;
> +
> +struct time_namespace {
> +	struct kref kref;
> +	struct user_namespace *user_ns;
> +	struct ucounts *ucounts;
> +	struct ns_common ns;
> +} __randomize_layout;
> +extern struct time_namespace init_time_ns;
> +
> +#ifdef CONFIG_TIME_NS
> +static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
> +{
> +	kref_get(&ns->kref);
> +	return ns;
> +}
> +
> +extern struct time_namespace *copy_time_ns(unsigned long flags,
> +	struct user_namespace *user_ns, struct time_namespace *old_ns);
> +extern void free_time_ns(struct kref *kref);
> +extern int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
> +
> +static inline void put_time_ns(struct time_namespace *ns)
> +{
> +	kref_put(&ns->kref, free_time_ns);
> +}
> +
> +#else
> +static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
> +{
> +	return NULL;
> +}
> +
> +static inline void put_time_ns(struct time_namespace *ns)
> +{
> +}
> +
> +static inline struct time_namespace *copy_time_ns(unsigned long flags,
> +	struct user_namespace *user_ns, struct time_namespace *old_ns)
> +{
> +	if (flags & CLONE_NEWTIME)
> +		return ERR_PTR(-EINVAL);
> +
> +	return old_ns;
> +}
> +
> +static inline int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
> +{
> +	return 0;
> +}
> +
> +#endif
> +
> +#endif /* _LINUX_TIMENS_H */
> diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
> index fb9f4f799554..6ef1c7109fc4 100644
> --- a/include/linux/user_namespace.h
> +++ b/include/linux/user_namespace.h
> @@ -45,6 +45,7 @@ enum ucount_type {
>  	UCOUNT_NET_NAMESPACES,
>  	UCOUNT_MNT_NAMESPACES,
>  	UCOUNT_CGROUP_NAMESPACES,
> +	UCOUNT_TIME_NAMESPACES,
>  #ifdef CONFIG_INOTIFY_USER
>  	UCOUNT_INOTIFY_INSTANCES,
>  	UCOUNT_INOTIFY_WATCHES,
> diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
> index 99335e1f4a27..5b7511ac2005 100644
> --- a/include/uapi/linux/sched.h
> +++ b/include/uapi/linux/sched.h
> @@ -33,6 +33,12 @@
>  #define CLONE_NEWNET		0x40000000	/* New network namespace */
>  #define CLONE_IO		0x80000000	/* Clone io context */
>  
> +/*
> + * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
> + * syscalls only:
> + */
> +#define CLONE_NEWTIME	0x00000080	/* New time namespace */
> +
>  #ifndef __ASSEMBLY__
>  /**
>   * struct clone_args - arguments for the clone3 syscall
> diff --git a/init/Kconfig b/init/Kconfig
> index b4daad2bac23..bc2ee93408ad 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1096,6 +1096,13 @@ config UTS_NS
>  	  In this namespace tasks see different info provided with the
>  	  uname() system call
>  
> +config TIME_NS
> +	bool "TIME namespace"
> +	default y

Having CONFIG_TIME_NS "default y" makes so that the option is selected even on
the architectures that have no support for time namespaces.
The direct consequence is that the fallbacks defined in this patch are never
selected and this ends up in kernel compilation errors due to missing symbols.

The error below shows what happens on arm64 (similar behavior on other
architectures):

aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_on_fork':
kernel/time/namespace.c:321: undefined reference to `vdso_join_timens'

aarch64-linux-gnu-ld: kernel/time/namespace.o: in function
`timens_set_vvar_page': kernel/time/namespace.c:218: undefined reference to
`arch_get_vdso_
data'

aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_install':
/data1/Projects/LinuxKernel/linux/kernel/time/namespace.c:295: undefined
reference to `vdso_join_time
ns'
/data1/Projects/LinuxKernel/linux/Makefile:1074: recipe for target 'vmlinux'
failed
make[1]: *** [vmlinux] Error 1
make[1]: Leaving directory '/data1/Projects/LinuxKernel/linux-out'
                    Makefile:179: recipe for target 'sub-make' failed
make: *** [sub-make] Error 2

My proposal is to keep TIME_NS "default n" (just remove "default y"), let the
architectures that enable time namespaces select it and make CONFIG_TIME_NS
select GENERIC_VDSO_TIME_NS if arch has HAVE_GENERIC_VDSO.

What do you think?

Thanks,
Vincenzo

> +	help
> +	  In this namespace boottime and monotonic clocks can be set.
> +	  The time will keep going with the same pace.
> +
>  config IPC_NS
>  	bool "IPC namespace"
>  	depends on (SYSVIPC || POSIX_MQUEUE)
> diff --git a/kernel/fork.c b/kernel/fork.c
> index bcdf53125210..aa65333dbc45 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1772,6 +1772,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	struct multiprocess_signals delayed;
>  	struct file *pidfile = NULL;
>  	u64 clone_flags = args->flags;
> +	struct nsproxy *nsp = current->nsproxy;
>  
>  	/*
>  	 * Don't allow sharing the root directory with processes in a different
> @@ -1814,8 +1815,16 @@ static __latent_entropy struct task_struct *copy_process(
>  	 */
>  	if (clone_flags & CLONE_THREAD) {
>  		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
> -		    (task_active_pid_ns(current) !=
> -				current->nsproxy->pid_ns_for_children))
> +		    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
> +			return ERR_PTR(-EINVAL);
> +	}
> +
> +	/*
> +	 * If the new process will be in a different time namespace
> +	 * do not allow it to share VM or a thread group with the forking task.
> +	 */
> +	if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
> +		if (nsp->time_ns != nsp->time_ns_for_children)
>  			return ERR_PTR(-EINVAL);
>  	}
>  
> @@ -2703,7 +2712,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
>  	if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
>  				CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
>  				CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
> -				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
> +				CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
> +				CLONE_NEWTIME))
>  		return -EINVAL;
>  	/*
>  	 * Not implemented, but pretend it works if there is nothing
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index c815f58e6bc0..ed9882108cd2 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -18,6 +18,7 @@
>  #include <linux/pid_namespace.h>
>  #include <net/net_namespace.h>
>  #include <linux/ipc_namespace.h>
> +#include <linux/time_namespace.h>
>  #include <linux/proc_ns.h>
>  #include <linux/file.h>
>  #include <linux/syscalls.h>
> @@ -40,6 +41,10 @@ struct nsproxy init_nsproxy = {
>  #ifdef CONFIG_CGROUPS
>  	.cgroup_ns		= &init_cgroup_ns,
>  #endif
> +#ifdef CONFIG_TIME_NS
> +	.time_ns		= &init_time_ns,
> +	.time_ns_for_children	= &init_time_ns,
> +#endif
>  };
>  
>  static inline struct nsproxy *create_nsproxy(void)
> @@ -106,8 +111,18 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
>  		goto out_net;
>  	}
>  
> +	new_nsp->time_ns_for_children = copy_time_ns(flags, user_ns,
> +					tsk->nsproxy->time_ns_for_children);
> +	if (IS_ERR(new_nsp->time_ns_for_children)) {
> +		err = PTR_ERR(new_nsp->time_ns_for_children);
> +		goto out_time;
> +	}
> +	new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
> +
>  	return new_nsp;
>  
> +out_time:
> +	put_net(new_nsp->net_ns);
>  out_net:
>  	put_cgroup_ns(new_nsp->cgroup_ns);
>  out_cgroup:
> @@ -136,15 +151,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  	struct nsproxy *old_ns = tsk->nsproxy;
>  	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>  	struct nsproxy *new_ns;
> +	int ret;
>  
>  	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>  			      CLONE_NEWPID | CLONE_NEWNET |
> -			      CLONE_NEWCGROUP)))) {
> -		get_nsproxy(old_ns);
> -		return 0;
> -	}
> -
> -	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
> +			      CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
> +		if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
> +			get_nsproxy(old_ns);
> +			return 0;
> +		}
> +	} else if (!ns_capable(user_ns, CAP_SYS_ADMIN))
>  		return -EPERM;
>  
>  	/*
> @@ -162,6 +178,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  	if (IS_ERR(new_ns))
>  		return  PTR_ERR(new_ns);
>  
> +	ret = timens_on_fork(new_ns, tsk);
> +	if (ret) {
> +		free_nsproxy(new_ns);
> +		return ret;
> +	}
> +
>  	tsk->nsproxy = new_ns;
>  	return 0;
>  }
> @@ -176,6 +198,10 @@ void free_nsproxy(struct nsproxy *ns)
>  		put_ipc_ns(ns->ipc_ns);
>  	if (ns->pid_ns_for_children)
>  		put_pid_ns(ns->pid_ns_for_children);
> +	if (ns->time_ns)
> +		put_time_ns(ns->time_ns);
> +	if (ns->time_ns_for_children)
> +		put_time_ns(ns->time_ns_for_children);
>  	put_cgroup_ns(ns->cgroup_ns);
>  	put_net(ns->net_ns);
>  	kmem_cache_free(nsproxy_cachep, ns);
> @@ -192,7 +218,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
>  	int err = 0;
>  
>  	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> -			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
> +			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
> +			       CLONE_NEWTIME)))
>  		return 0;
>  
>  	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
> diff --git a/kernel/time/Makefile b/kernel/time/Makefile
> index 1867044800bb..c8f00168afe8 100644
> --- a/kernel/time/Makefile
> +++ b/kernel/time/Makefile
> @@ -19,3 +19,4 @@ obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o tick-sched.o
>  obj-$(CONFIG_HAVE_GENERIC_VDSO)			+= vsyscall.o
>  obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
>  obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
> +obj-$(CONFIG_TIME_NS)				+= namespace.o
> diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
> new file mode 100644
> index 000000000000..2662a69e0382
> --- /dev/null
> +++ b/kernel/time/namespace.c
> @@ -0,0 +1,217 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Author: Andrei Vagin <avagin@openvz.org>
> + * Author: Dmitry Safonov <dima@arista.com>
> + */
> +
> +#include <linux/time_namespace.h>
> +#include <linux/user_namespace.h>
> +#include <linux/sched/signal.h>
> +#include <linux/sched/task.h>
> +#include <linux/proc_ns.h>
> +#include <linux/export.h>
> +#include <linux/time.h>
> +#include <linux/slab.h>
> +#include <linux/cred.h>
> +#include <linux/err.h>
> +
> +static struct ucounts *inc_time_namespaces(struct user_namespace *ns)
> +{
> +	return inc_ucount(ns, current_euid(), UCOUNT_TIME_NAMESPACES);
> +}
> +
> +static void dec_time_namespaces(struct ucounts *ucounts)
> +{
> +	dec_ucount(ucounts, UCOUNT_TIME_NAMESPACES);
> +}
> +
> +/**
> + * clone_time_ns - Clone a time namespace
> + * @user_ns:	User namespace which owns a new namespace.
> + * @old_ns:	Namespace to clone
> + *
> + * Clone @old_ns and set the clone refcount to 1
> + *
> + * Return: The new namespace or ERR_PTR.
> + */
> +static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
> +					  struct time_namespace *old_ns)
> +{
> +	struct time_namespace *ns;
> +	struct ucounts *ucounts;
> +	int err;
> +
> +	err = -ENOSPC;
> +	ucounts = inc_time_namespaces(user_ns);
> +	if (!ucounts)
> +		goto fail;
> +
> +	err = -ENOMEM;
> +	ns = kmalloc(sizeof(*ns), GFP_KERNEL);
> +	if (!ns)
> +		goto fail_dec;
> +
> +	kref_init(&ns->kref);
> +
> +	err = ns_alloc_inum(&ns->ns);
> +	if (err)
> +		goto fail_free;
> +
> +	ns->ucounts = ucounts;
> +	ns->ns.ops = &timens_operations;
> +	ns->user_ns = get_user_ns(user_ns);
> +	return ns;
> +
> +fail_free:
> +	kfree(ns);
> +fail_dec:
> +	dec_time_namespaces(ucounts);
> +fail:
> +	return ERR_PTR(err);
> +}
> +
> +/**
> + * copy_time_ns - Create timens_for_children from @old_ns
> + * @flags:	Cloning flags
> + * @user_ns:	User namespace which owns a new namespace.
> + * @old_ns:	Namespace to clone
> + *
> + * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children;
> + * adds a refcounter to @old_ns otherwise.
> + *
> + * Return: timens_for_children namespace or ERR_PTR.
> + */
> +struct time_namespace *copy_time_ns(unsigned long flags,
> +	struct user_namespace *user_ns, struct time_namespace *old_ns)
> +{
> +	if (!(flags & CLONE_NEWTIME))
> +		return get_time_ns(old_ns);
> +
> +	return clone_time_ns(user_ns, old_ns);
> +}
> +
> +void free_time_ns(struct kref *kref)
> +{
> +	struct time_namespace *ns;
> +
> +	ns = container_of(kref, struct time_namespace, kref);
> +	dec_time_namespaces(ns->ucounts);
> +	put_user_ns(ns->user_ns);
> +	ns_free_inum(&ns->ns);
> +	kfree(ns);
> +}
> +
> +static struct time_namespace *to_time_ns(struct ns_common *ns)
> +{
> +	return container_of(ns, struct time_namespace, ns);
> +}
> +
> +static struct ns_common *timens_get(struct task_struct *task)
> +{
> +	struct time_namespace *ns = NULL;
> +	struct nsproxy *nsproxy;
> +
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> +	if (nsproxy) {
> +		ns = nsproxy->time_ns;
> +		get_time_ns(ns);
> +	}
> +	task_unlock(task);
> +
> +	return ns ? &ns->ns : NULL;
> +}
> +
> +static struct ns_common *timens_for_children_get(struct task_struct *task)
> +{
> +	struct time_namespace *ns = NULL;
> +	struct nsproxy *nsproxy;
> +
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> +	if (nsproxy) {
> +		ns = nsproxy->time_ns_for_children;
> +		get_time_ns(ns);
> +	}
> +	task_unlock(task);
> +
> +	return ns ? &ns->ns : NULL;
> +}
> +
> +static void timens_put(struct ns_common *ns)
> +{
> +	put_time_ns(to_time_ns(ns));
> +}
> +
> +static int timens_install(struct nsproxy *nsproxy, struct ns_common *new)
> +{
> +	struct time_namespace *ns = to_time_ns(new);
> +
> +	if (!current_is_single_threaded())
> +		return -EUSERS;
> +
> +	if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
> +	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	get_time_ns(ns);
> +	put_time_ns(nsproxy->time_ns);
> +	nsproxy->time_ns = ns;
> +
> +	get_time_ns(ns);
> +	put_time_ns(nsproxy->time_ns_for_children);
> +	nsproxy->time_ns_for_children = ns;
> +	return 0;
> +}
> +
> +int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
> +{
> +	struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
> +	struct time_namespace *ns = to_time_ns(nsc);
> +
> +	/* create_new_namespaces() already incremented the ref counter */
> +	if (nsproxy->time_ns == nsproxy->time_ns_for_children)
> +		return 0;
> +
> +	get_time_ns(ns);
> +	put_time_ns(nsproxy->time_ns);
> +	nsproxy->time_ns = ns;
> +
> +	return 0;
> +}
> +
> +static struct user_namespace *timens_owner(struct ns_common *ns)
> +{
> +	return to_time_ns(ns)->user_ns;
> +}
> +
> +const struct proc_ns_operations timens_operations = {
> +	.name		= "time",
> +	.type		= CLONE_NEWTIME,
> +	.get		= timens_get,
> +	.put		= timens_put,
> +	.install	= timens_install,
> +	.owner		= timens_owner,
> +};
> +
> +const struct proc_ns_operations timens_for_children_operations = {
> +	.name		= "time_for_children",
> +	.type		= CLONE_NEWTIME,
> +	.get		= timens_for_children_get,
> +	.put		= timens_put,
> +	.install	= timens_install,
> +	.owner		= timens_owner,
> +};
> +
> +struct time_namespace init_time_ns = {
> +	.kref		= KREF_INIT(3),
> +	.user_ns	= &init_user_ns,
> +	.ns.inum	= PROC_TIME_INIT_INO,
> +	.ns.ops		= &timens_operations,
> +};
> +
> +static int __init time_ns_init(void)
> +{
> +	return 0;
> +}
> +subsys_initcall(time_ns_init);
> 

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Thomas Gleixner @ 2019-10-16 10:39 UTC (permalink / raw)
  To: Vincenzo Frascino
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Andrei Vagin,
	Adrian Reber, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <80af93da-d497-81de-2a2a-179bb3bc852d@arm.com>

On Wed, 16 Oct 2019, Vincenzo Frascino wrote:

< Trim 250+ lines ( 3+ pages) of pointlessly wasted electrons >

> > --- a/init/Kconfig
> > +++ b/init/Kconfig
> > @@ -1096,6 +1096,13 @@ config UTS_NS
> >  	  In this namespace tasks see different info provided with the
> >  	  uname() system call
> >  
> > +config TIME_NS
> > +	bool "TIME namespace"
> > +	default y
> 
> Having CONFIG_TIME_NS "default y" makes so that the option is selected even on
> the architectures that have no support for time namespaces.
> The direct consequence is that the fallbacks defined in this patch are never
> selected and this ends up in kernel compilation errors due to missing symbols.
> 
> The error below shows what happens on arm64 (similar behavior on other
> architectures):
> 
> aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_on_fork':
> kernel/time/namespace.c:321: undefined reference to `vdso_join_timens'
> 
> My proposal is to keep TIME_NS "default n" (just remove "default y"), let the
> architectures that enable time namespaces select it and make CONFIG_TIME_NS
> select GENERIC_VDSO_TIME_NS if arch has HAVE_GENERIC_VDSO.

Nah.

config TIME_NS
	bool "TIME namespace"
	depends on GENERIC_VDSO_TIME_NS
	default y

and in lib/vdso/Kconfig

config GENERIC_VDSO_TIME_NS
	bool

and let architectures which have support for the VDSO bits select it.

< Trim another gazillion of useless lines >

See: https://people.kernel.org/tglx/notes-about-netiquette

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Vincenzo Frascino @ 2019-10-16 10:44 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Dmitry Safonov, linux-kernel, Dmitry Safonov, Andrei Vagin,
	Adrian Reber, Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <alpine.DEB.2.21.1910161230070.2046@nanos.tec.linutronix.de>


On 10/16/19 11:39 AM, Thomas Gleixner wrote:
> On Wed, 16 Oct 2019, Vincenzo Frascino wrote:
> 
> < Trim 250+ lines ( 3+ pages) of pointlessly wasted electrons >
> 
>>> --- a/init/Kconfig
>>> +++ b/init/Kconfig
>>> @@ -1096,6 +1096,13 @@ config UTS_NS
>>>  	  In this namespace tasks see different info provided with the
>>>  	  uname() system call
>>>  
>>> +config TIME_NS
>>> +	bool "TIME namespace"
>>> +	default y
>>
>> Having CONFIG_TIME_NS "default y" makes so that the option is selected even on
>> the architectures that have no support for time namespaces.
>> The direct consequence is that the fallbacks defined in this patch are never
>> selected and this ends up in kernel compilation errors due to missing symbols.
>>
>> The error below shows what happens on arm64 (similar behavior on other
>> architectures):
>>
>> aarch64-linux-gnu-ld: kernel/time/namespace.o: in function `timens_on_fork':
>> kernel/time/namespace.c:321: undefined reference to `vdso_join_timens'
>>
>> My proposal is to keep TIME_NS "default n" (just remove "default y"), let the
>> architectures that enable time namespaces select it and make CONFIG_TIME_NS
>> select GENERIC_VDSO_TIME_NS if arch has HAVE_GENERIC_VDSO.
> 
> Nah.
> 
> config TIME_NS
> 	bool "TIME namespace"
> 	depends on GENERIC_VDSO_TIME_NS
> 	default y
> 
> and in lib/vdso/Kconfig
> 
> config GENERIC_VDSO_TIME_NS
> 	bool
> 
> and let architectures which have support for the VDSO bits select it.
>

Agreed, this is even better.

> < Trim another gazillion of useless lines >
> 
> See: https://people.kernel.org/tglx/notes-about-netiquette
> 
> Thanks,
> 
> 	tglx
> 

-- 
Regards,
Vincenzo

^ permalink raw reply

* Re: [RFC PATCH v2 5/5] btrfs: implement RWF_ENCODED writes
From: Nikolay Borisov @ 2019-10-16 10:44 UTC (permalink / raw)
  To: Omar Sandoval, linux-btrfs, linux-fsdevel
  Cc: kernel-team, Dave Chinner, Jann Horn, linux-api
In-Reply-To: <904de93d9bbe630aff7f725fd587810c6eb48344.1571164762.git.osandov@fb.com>



On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> The implementation resembles direct I/O: we have to flush any ordered
> extents, invalidate the page cache, and do the io tree/delalloc/extent
> map/ordered extent dance. From there, we can reuse the compression code
> with a minor modification to distinguish the write from writeback.
> 
> Now that read and write are implemented, this also sets the
> FMODE_ENCODED_IO flag in btrfs_file_open().
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/btrfs/compression.c |   6 +-
>  fs/btrfs/compression.h |   5 +-
>  fs/btrfs/ctree.h       |   2 +
>  fs/btrfs/file.c        |  40 +++++++--
>  fs/btrfs/inode.c       | 197 ++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 237 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index b05b361e2062..6632dd8d2e4d 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -276,7 +276,8 @@ static void end_compressed_bio_write(struct bio *bio)
>  			bio->bi_status == BLK_STS_OK);
>  	cb->compressed_pages[0]->mapping = NULL;
>  
> -	end_compressed_writeback(inode, cb);
> +	if (cb->writeback)
> +		end_compressed_writeback(inode, cb);
>  	/* note, our inode could be gone now */
>  
>  	/*
> @@ -311,7 +312,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
>  				 unsigned long compressed_len,
>  				 struct page **compressed_pages,
>  				 unsigned long nr_pages,
> -				 unsigned int write_flags)
> +				 unsigned int write_flags, bool writeback)

I don't see this function being called with true in this patch set,
meaning it essentially eliminates end_compressed_writeback call in
end_compressed_bio_write? Am I missing anything?

>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>  	struct bio *bio = NULL;
> @@ -336,6 +337,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
>  	cb->mirror_num = 0;
>  	cb->compressed_pages = compressed_pages;
>  	cb->compressed_len = compressed_len;
> +	cb->writeback = writeback;
>  	cb->orig_bio = NULL;
>  	cb->nr_pages = nr_pages;
>  
> diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
> index 4cb8be9ff88b..d4176384ec15 100644
> --- a/fs/btrfs/compression.h
> +++ b/fs/btrfs/compression.h
> @@ -47,6 +47,9 @@ struct compressed_bio {
>  	/* the compression algorithm for this bio */
>  	int compress_type;
>  
> +	/* Whether this is a write for writeback. */
> +	bool writeback;
> +
>  	/* number of compressed pages in the array */
>  	unsigned long nr_pages;
>  
> @@ -93,7 +96,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start,
>  				  unsigned long compressed_len,
>  				  struct page **compressed_pages,
>  				  unsigned long nr_pages,
> -				  unsigned int write_flags);
> +				  unsigned int write_flags, bool writeback);
>  blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
>  				 int mirror_num, unsigned long bio_flags);
>  
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 3b2aa1c7218c..9e1719e82cc8 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2907,6 +2907,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
>  void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
>  					  u64 end, int uptodate);
>  ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter);
> +ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
> +			    struct encoded_iov *encoded);
>  
>  extern const struct dentry_operations btrfs_dentry_operations;
>  
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 51740cee39fc..8de6ac9b4b9c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1893,8 +1893,7 @@ static void update_time_for_write(struct inode *inode)
>  		inode_inc_iversion(inode);
>  }
>  
> -static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> -				    struct iov_iter *from)
> +static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -1904,14 +1903,22 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  	u64 end_pos;
>  	ssize_t num_written = 0;
>  	const bool sync = iocb->ki_flags & IOCB_DSYNC;
> +	struct encoded_iov encoded;
>  	ssize_t err;
>  	loff_t pos;
>  	size_t count;
>  	loff_t oldsize;
>  	int clean_page = 0;
>  
> -	if (!(iocb->ki_flags & IOCB_DIRECT) &&
> -	    (iocb->ki_flags & IOCB_NOWAIT))
> +	if (iocb->ki_flags & IOCB_ENCODED) {
> +		err = import_encoded_write(iocb, &encoded, from);
> +		if (err)
> +			return err;
> +	}
> +
> +	if ((iocb->ki_flags & IOCB_NOWAIT) &&
> +	    (!(iocb->ki_flags & IOCB_DIRECT) ||
> +	     (iocb->ki_flags & IOCB_ENCODED)))
>  		return -EOPNOTSUPP;
>  
>  	if (!inode_trylock(inode)) {
> @@ -1920,14 +1927,27 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  		inode_lock(inode);
>  	}
>  
> -	err = generic_write_checks(iocb, from);
> -	if (err <= 0) {
> +	if (iocb->ki_flags & IOCB_ENCODED) {
> +		err = generic_encoded_write_checks(iocb, &encoded);
> +		if (err) {
> +			inode_unlock(inode);
> +			return err;
> +		}
> +		count = encoded.len;
> +	} else {
> +		err = generic_write_checks(iocb, from);
> +		if (err < 0) {
> +			inode_unlock(inode);
> +			return err;
> +		}
> +		count = iov_iter_count(from);
> +	}
> +	if (count == 0) {
>  		inode_unlock(inode);
>  		return err;
>  	}
>  
>  	pos = iocb->ki_pos;
> -	count = iov_iter_count(from);
>  	if (iocb->ki_flags & IOCB_NOWAIT) {
>  		/*
>  		 * We will allocate space in case nodatacow is not set,
> @@ -1986,7 +2006,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  	if (sync)
>  		atomic_inc(&BTRFS_I(inode)->sync_writers);
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (iocb->ki_flags & IOCB_ENCODED) {
> +		num_written = btrfs_encoded_write(iocb, from, &encoded);
> +	} else if (iocb->ki_flags & IOCB_DIRECT) {
>  		num_written = __btrfs_direct_write(iocb, from);
>  	} else {
>  		num_written = btrfs_buffered_write(iocb, from);
> @@ -3461,7 +3483,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
>  
>  static int btrfs_file_open(struct inode *inode, struct file *filp)
>  {
> -	filp->f_mode |= FMODE_NOWAIT;
> +	filp->f_mode |= FMODE_NOWAIT | FMODE_ENCODED_IO;
>  	return generic_file_open(inode, filp);
>  }
>  
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 174d0738d2c9..bcc5a2bed22b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -865,7 +865,7 @@ static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
>  				    ins.objectid,
>  				    ins.offset, async_extent->pages,
>  				    async_extent->nr_pages,
> -				    async_chunk->write_flags)) {
> +				    async_chunk->write_flags, true)) {
>  			struct page *p = async_extent->pages[0];
>  			const u64 start = async_extent->start;
>  			const u64 end = start + async_extent->ram_size - 1;
> @@ -11055,6 +11055,201 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
>  	return ret;
>  }
>  
> +ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
> +			    struct encoded_iov *encoded)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct btrfs_root *root = BTRFS_I(inode)->root;
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct extent_changeset *data_reserved = NULL;
> +	struct extent_state *cached_state = NULL;
> +	int compression;
> +	size_t orig_count;
> +	u64 disk_num_bytes, num_bytes;
> +	u64 start, end;
> +	unsigned long nr_pages, i;
> +	struct page **pages;
> +	struct btrfs_key ins;
> +	struct extent_map *em;
> +	ssize_t ret;
> +
> +	switch (encoded->compression) {
> +	case ENCODED_IOV_COMPRESSION_ZLIB:
> +		compression = BTRFS_COMPRESS_ZLIB;
> +		break;
> +	case ENCODED_IOV_COMPRESSION_LZO:
> +		compression = BTRFS_COMPRESS_LZO;
> +		break;
> +	case ENCODED_IOV_COMPRESSION_ZSTD:
> +		compression = BTRFS_COMPRESS_ZSTD;
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	disk_num_bytes = orig_count = iov_iter_count(from);
> +
> +	/* For now, it's too hard to support bookend extents. */
> +	if (encoded->unencoded_len != encoded->len ||
> +	    encoded->unencoded_offset != 0)
> +		return -EINVAL;
> +
> +	/* The extent size must be sane. */
> +	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
> +	    disk_num_bytes > BTRFS_MAX_COMPRESSED || disk_num_bytes == 0)
> +		return -EINVAL;
> +
> +	/*
> +	 * The compressed data on disk must be sector-aligned. For convenience,
> +	 * we extend it with zeroes if it isn't.
> +	 */
> +	disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
> +
> +	/*
> +	 * The extent in the file must also be sector-aligned. However, we allow
> +	 * a write which ends at or extends i_size to have an unaligned length;
> +	 * we round up the extent size and set i_size to the given length.
> +	 */
> +	start = iocb->ki_pos;
> +	if (!IS_ALIGNED(start, fs_info->sectorsize))
> +		return -EINVAL;
> +	if (start + encoded->len >= inode->i_size) {
> +		num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
> +	} else {
> +		num_bytes = encoded->len;
> +		if (!IS_ALIGNED(num_bytes, fs_info->sectorsize))
> +			return -EINVAL;
> +	}
> +
> +	/*
> +	 * It's valid to have compressed data which is larger than or the same
> +	 * size as the decompressed data. However, for buffered I/O, we fall
> +	 * back to writing the decompressed data if compression didn't shrink
> +	 * it. So, for now, let's not allow creating such extents.
> +	 *
> +	 * Note that for now this also implicitly prevents writing data that
> +	 * would fit in an inline extent.
> +	 */
> +	if (disk_num_bytes >= num_bytes)
> +		return -EINVAL;
> +
> +	end = start + num_bytes - 1;
> +
> +	nr_pages = (disk_num_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;

nit: nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE)

> +	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_USER);

This could be a simple GFP_KERNEL  allocation

> +	if (!pages)
> +		return -ENOMEM;
> +	for (i = 0; i < nr_pages; i++) {
> +		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
> +		char *kaddr;
> +
> +		pages[i] = alloc_page(GFP_HIGHUSER);

Why GFP_HIGHUSER? You are reading from userspace,  not writing to it. A
plain, NOFS allocation should suffice (of course using the newer
memalloc_nofs_save api)?


> +		if (!pages[i]) {
> +			ret = -ENOMEM;
> +			goto out_pages;
> +		}
> +		kaddr = kmap(pages[i]);
> +		if (copy_from_iter(kaddr, bytes, from) != bytes) {
> +			kunmap(pages[i]);
> +			ret = -EFAULT;
> +			goto out_pages;
> +		}
> +		if (bytes < PAGE_SIZE)
> +			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
> +		kunmap(pages[i]);
> +	}
> +
> +	for (;;) {
> +		struct btrfs_ordered_extent *ordered;
> +
> +		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
> +		if (ret)
> +			goto out_pages;
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +						    start >> PAGE_SHIFT,
> +						    end >> PAGE_SHIFT);
> +		if (ret)
> +			goto out_pages;
> +		lock_extent_bits(io_tree, start, end, &cached_state);
> +		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
> +						     end - start + 1);
> +		if (!ordered &&
> +		    !filemap_range_has_page(inode->i_mapping, start, end))
> +			break;
> +		if (ordered)
> +			btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(io_tree, start, end, &cached_state);
> +		cond_resched();
> +	}
> +
> +	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start,
> +					   num_bytes);
> +	if (ret)
> +		goto out_unlock;
> +
> +	ret = btrfs_reserve_extent(root, num_bytes, disk_num_bytes,
> +				   disk_num_bytes, 0, 0, &ins, 1, 1);
> +	if (ret)
> +		goto out_delalloc_release;
> +
> +	em = create_io_em(inode, start, num_bytes, start, ins.objectid,
> +			  ins.offset, ins.offset, num_bytes, compression,
> +			  BTRFS_ORDERED_COMPRESSED);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out_free_reserve;
> +	}
> +	free_extent_map(em);
> +
> +	ret = btrfs_add_ordered_extent_compress(inode, start, ins.objectid,
> +						num_bytes, ins.offset,
> +						BTRFS_ORDERED_COMPRESSED,
> +						compression);
> +	if (ret) {
> +		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
> +		goto out_free_reserve;
> +	}
> +	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
> +
> +	if (start + encoded->len > inode->i_size)
> +		i_size_write(inode, start + encoded->len);

Don't we want the inode size to be updated once data hits disk and
btrfs_finish_ordered_io is called?

> +
> +	unlock_extent_cached(io_tree, start, end, &cached_state);
> +
> +	btrfs_delalloc_release_extents(BTRFS_I(inode), num_bytes, false);
> +
> +	if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
> +					  ins.offset, pages, nr_pages, 0,
> +					  false)) {
> +		struct page *page = pages[0];
> +
> +		page->mapping = inode->i_mapping;
> +		btrfs_writepage_endio_finish_ordered(page, start, end, 0);
> +		page->mapping = NULL;
> +		ret = -EIO;
> +		goto out_pages;
> +	}
> +	iocb->ki_pos += encoded->len;
> +	return orig_count;
> +
> +out_free_reserve:
> +	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
> +	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
> +out_delalloc_release:
> +	btrfs_delalloc_release_space(inode, data_reserved, start, num_bytes,
> +				     true);
> +out_unlock:
> +	unlock_extent_cached(io_tree, start, end, &cached_state);
> +out_pages:
> +	for (i = 0; i < nr_pages; i++) {
> +		if (pages[i])
> +			put_page(pages[i]);
> +	}
> +	kvfree(pages);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_SWAP
>  /*
>   * Add an entry indicating a block group or device which is pinned by a
> 

^ permalink raw reply

* Re: [RFC PATCH v2 4/5] btrfs: implement RWF_ENCODED reads
From: Nikolay Borisov @ 2019-10-16 11:10 UTC (permalink / raw)
  To: Omar Sandoval, linux-btrfs, linux-fsdevel
  Cc: kernel-team, Dave Chinner, Jann Horn, linux-api
In-Reply-To: <338d3b28dd31249053620b83e6ff190ad965fadc.1571164762.git.osandov@fb.com>



On 15.10.19 г. 21:42 ч., Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> There are 4 main cases:
> 
> 1. Inline extents: we copy the data straight out of the extent buffer.
> 2. Hole/preallocated extents: we indicate the size of the extent
>    starting from the read position; we don't need to copy zeroes.
> 3. Regular, uncompressed extents: we read the sectors we need directly
>    from disk.
> 4. Regular, compressed extents: we read the entire compressed extent
>    from disk and indicate what subset of the decompressed extent is in
>    the file.
> 
> This initial implementation simplifies a few things that can be improved
> in the future:
> 
> - We hold the inode lock during the operation.
> - Cases 1, 3, and 4 allocate temporary memory to read into before
>   copying out to userspace.
> - Cases 3 and 4 do not implement repair yet.
> 
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/btrfs/ctree.h |   2 +
>  fs/btrfs/file.c  |  12 +-
>  fs/btrfs/inode.c | 462 +++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 475 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 71552b2ca340..3b2aa1c7218c 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2906,6 +2906,8 @@ int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
>  int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
>  void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
>  					  u64 end, int uptodate);
> +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter);
> +
>  extern const struct dentry_operations btrfs_dentry_operations;
>  
>  /* ioctl.c */
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 27e5b269e729..51740cee39fc 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -390,6 +390,16 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
>  	return 0;
>  }
>  
> +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	if (iocb->ki_flags & IOCB_ENCODED) {
> +		if (iocb->ki_flags & IOCB_NOWAIT)
> +			return -EOPNOTSUPP;
> +		return btrfs_encoded_read(iocb, iter);
> +	}
> +	return generic_file_read_iter(iocb, iter);
> +}
> +
>  /* simple helper to fault in pages and copy.  This should go away
>   * and be replaced with calls into generic code.
>   */
> @@ -3457,7 +3467,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
>  
>  const struct file_operations btrfs_file_operations = {
>  	.llseek		= btrfs_file_llseek,
> -	.read_iter      = generic_file_read_iter,
> +	.read_iter      = btrfs_file_read_iter,
>  	.splice_read	= generic_file_splice_read,
>  	.write_iter	= btrfs_file_write_iter,
>  	.mmap		= btrfs_file_mmap,
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 8bce46122ef7..174d0738d2c9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -10593,6 +10593,468 @@ void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
>  	}
>  }
>  
> +static int encoded_iov_compression_from_btrfs(struct encoded_iov *encoded,
> +					      unsigned int compress_type)
> +{
> +	switch (compress_type) {
> +	case BTRFS_COMPRESS_NONE:
> +		encoded->compression = ENCODED_IOV_COMPRESSION_NONE;
> +		break;
> +	case BTRFS_COMPRESS_ZLIB:
> +		encoded->compression = ENCODED_IOV_COMPRESSION_ZLIB;
> +		break;
> +	case BTRFS_COMPRESS_LZO:
> +		encoded->compression = ENCODED_IOV_COMPRESSION_LZO;
> +		break;
> +	case BTRFS_COMPRESS_ZSTD:
> +		encoded->compression = ENCODED_IOV_COMPRESSION_ZSTD;
> +		break;
> +	default:
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
> +static ssize_t btrfs_encoded_read_inline(struct kiocb *iocb,
> +					 struct iov_iter *iter, u64 start,
> +					 u64 lockend,
> +					 struct extent_state **cached_state,
> +					 u64 extent_start, size_t count,
> +					 struct encoded_iov *encoded,
> +					 bool *unlocked)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct btrfs_path *path;
> +	struct extent_buffer *leaf;
> +	struct btrfs_file_extent_item *item;
> +	u64 ram_bytes;
> +	unsigned long ptr;
> +	void *tmp;
> +	ssize_t ret;
> +
> +	path = btrfs_alloc_path();
> +	if (!path) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path,
> +				       btrfs_ino(BTRFS_I(inode)), extent_start,
> +				       0);
> +	if (ret) {
> +		if (ret > 0) {
> +			/* The extent item disappeared? */
> +			ret = -EIO;
> +		}
> +		goto out;
> +	}
> +	leaf = path->nodes[0];
> +	item = btrfs_item_ptr(leaf, path->slots[0],
> +			      struct btrfs_file_extent_item);
> +
> +	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
> +	ptr = btrfs_file_extent_inline_start(item);
> +
> +	encoded->len = (min_t(u64, extent_start + ram_bytes, inode->i_size) -
> +			iocb->ki_pos);
> +	ret = encoded_iov_compression_from_btrfs(encoded,
> +				 btrfs_file_extent_compression(leaf, item));
> +	if (ret)
> +		goto out;
> +	if (encoded->compression) {
> +		size_t inline_size;
> +
> +		inline_size = btrfs_file_extent_inline_item_len(leaf,
> +						btrfs_item_nr(path->slots[0]));
> +		if (inline_size > count) {
> +			ret = -EFBIG;
> +			goto out;
> +		}
> +		count = inline_size;
> +		encoded->unencoded_len = ram_bytes;
> +		encoded->unencoded_offset = iocb->ki_pos - extent_start;
> +	} else {
> +		encoded->len = encoded->unencoded_len = count =
> +			min_t(u64, count, encoded->len);
> +		ptr += iocb->ki_pos - extent_start;
> +	}
> +
> +	tmp = kmalloc(count, GFP_NOFS);
> +	if (!tmp) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +	read_extent_buffer(leaf, tmp, ptr, count);
> +	btrfs_free_path(path);
> +	path = NULL;
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock(inode);
> +	*unlocked = true;
> +	if (copy_to_iter(encoded, sizeof(*encoded), iter) == sizeof(*encoded) &&
> +	    copy_to_iter(tmp, count, iter) == count)
> +		ret = count;
> +	else
> +		ret = -EFAULT;
> +	kfree(tmp);
> +
> +out:
> +	btrfs_free_path(path);
> +	return ret;
> +}
> +
> +struct btrfs_encoded_read_private {
> +	struct inode *inode;
> +	wait_queue_head_t wait;
> +	atomic_t pending;
> +	bool uptodate;
> +	bool skip_csum;
> +};
> +
> +static bool btrfs_encoded_read_check_csums(struct btrfs_io_bio *io_bio)
> +{
> +	struct btrfs_encoded_read_private *priv = io_bio->bio.bi_private;
> +	struct inode *inode = priv->inode;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	u32 sectorsize = fs_info->sectorsize;
> +	struct bio_vec *bvec;
> +	struct bvec_iter_all iter_all;
> +	u64 offset = 0;
> +
> +	if (priv->skip_csum)
> +		return true;
> +	bio_for_each_segment_all(bvec, &io_bio->bio, iter_all) {
> +		unsigned int i, nr_sectors, pgoff;
> +
> +		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
> +		pgoff = bvec->bv_offset;
> +		for (i = 0; i < nr_sectors; i++) {
> +			int csum_pos;
> +
> +			csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
> +			if (__readpage_endio_check(inode, io_bio, csum_pos,
> +						   bvec->bv_page, pgoff,
> +						   io_bio->logical + offset,
> +						   sectorsize))
> +				return false;
> +			offset += sectorsize;
> +			pgoff += sectorsize;
> +		}
> +	}
> +	return true;
> +}
> +
> +static void btrfs_encoded_read_endio(struct bio *bio)
> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> +
> +	if (bio->bi_status || !btrfs_encoded_read_check_csums(io_bio))
> +		priv->uptodate = false;
> +	if (!atomic_dec_return(&priv->pending))
> +		wake_up(&priv->wait);
> +	btrfs_io_bio_free_csum(io_bio);
> +	bio_put(bio);
> +}
> +
> +static bool btrfs_submit_encoded_read(struct bio *bio)
> +{
> +	struct btrfs_encoded_read_private *priv = bio->bi_private;
> +	struct inode *inode = priv->inode;
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	blk_status_t status;
> +
> +	atomic_inc(&priv->pending);
> +
> +	if (!priv->skip_csum) {
> +		status = btrfs_lookup_bio_sums_at_offset(inode, bio,
> +							 btrfs_io_bio(bio)->logical,
> +							 NULL);
> +		if (status)
> +			goto out;
> +	}
> +
> +	status = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
> +	if (status)
> +		goto out;
> +
> +	status = btrfs_map_bio(fs_info, bio, 0, 0);
> +out:
> +	if (status) {
> +		bio->bi_status = status;
> +		bio_endio(bio);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
> +					  struct iov_iter *iter,
> +					  u64 start, u64 lockend,
> +					  struct extent_state **cached_state,
> +					  struct block_device *bdev,
> +					  u64 offset, u64 disk_io_size,
> +					  size_t count,
> +					  const struct encoded_iov *encoded,
> +					  bool *unlocked)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	struct btrfs_encoded_read_private priv = {
> +		.inode = inode,
> +		.wait = __WAIT_QUEUE_HEAD_INITIALIZER(priv.wait),
> +		.pending = ATOMIC_INIT(1),
> +		.uptodate = true,
> +		.skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM,
> +	};
> +	struct page **pages;
> +	unsigned long nr_pages, i;
> +	struct bio *bio = NULL;
> +	u64 cur;
> +	size_t page_offset;
> +	ssize_t ret;
> +
> +	nr_pages = (disk_io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> +	if (!pages)
> +		return -ENOMEM;
> +	for (i = 0; i < nr_pages; i++) {
> +		pages[i] = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
> +		if (!pages[i]) {
> +			ret = -ENOMEM;
> +			goto out;
> +		}
> +	}
> +
> +	i = 0;
> +	cur = 0;
> +	while (cur < disk_io_size) {
> +		size_t bytes = min_t(u64, disk_io_size - cur,
> +				     PAGE_SIZE);
> +
> +		if (!bio) {
> +			bio = btrfs_bio_alloc(offset + cur);
> +			bio_set_dev(bio, bdev);
> +			bio->bi_end_io = btrfs_encoded_read_endio;
> +			bio->bi_private = &priv;
> +			bio->bi_opf = REQ_OP_READ;
> +			btrfs_io_bio(bio)->logical = start + cur;
> +		}
> +
> +		if (bio_add_page(bio, pages[i], bytes, 0) < bytes) {
> +			bool success;
> +
> +			success = btrfs_submit_encoded_read(bio);
> +			bio = NULL;
> +			if (!success)
> +				break;
> +			continue;
> +		}
> +		i++;
> +		cur += bytes;
> +	}
> +
> +	if (bio)
> +		btrfs_submit_encoded_read(bio);
> +	if (atomic_dec_return(&priv.pending))
> +		wait_event(priv.wait, !atomic_read(&priv.pending));
> +	if (!priv.uptodate) {
> +		ret = -EIO;
> +		goto out;
> +	}
> +
> +	unlock_extent_cached(io_tree, start, lockend, cached_state);
> +	inode_unlock(inode);
> +	*unlocked = true;
> +
> +	if (copy_to_iter(encoded, sizeof(*encoded), iter) != sizeof(*encoded)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +	if (encoded->compression) {
> +		i = 0;
> +		page_offset = 0;
> +	} else {
> +		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
> +		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
> +	}
> +	cur = 0;
> +	while (cur < count) {
> +		size_t bytes = min_t(size_t, count - cur,
> +				     PAGE_SIZE - page_offset);
> +
> +		if (copy_page_to_iter(pages[i], page_offset, bytes,
> +				      iter) != bytes) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		i++;
> +		cur += bytes;
> +		page_offset = 0;
> +	}
> +	ret = count;
> +out:
> +	for (i = 0; i < nr_pages; i++) {
> +		if (pages[i])
> +			put_page(pages[i]);
> +	}
> +	kfree(pages);
> +	return ret;
> +}
> +
> +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> +	ssize_t ret;
> +	size_t count;
> +	struct block_device *em_bdev;
> +	u64 start, lockend, offset, disk_io_size;
> +	struct extent_state *cached_state = NULL;
> +	struct extent_map *em;
> +	struct encoded_iov encoded = {};
> +	bool unlocked = false;
> +
> +	ret = check_encoded_read(iocb, iter);
> +	if (ret < 0)
> +		return ret;
> +	if (ret == 0) {
> +empty:
> +		if (copy_to_iter(&encoded, sizeof(encoded), iter) ==
> +		    sizeof(encoded))
> +			return 0;
> +		else
> +			return -EFAULT;

nit: Just put the label at the end of the function since it's a simple
error handler.

> +	}
> +	count = ret;
> +
> +	file_accessed(iocb->ki_filp);
> +
> +	inode_lock(inode);
> +
> +	if (iocb->ki_pos >= inode->i_size) {
> +		inode_unlock(inode);
> +		goto empty;

That way you won't have to jump backwards here ...

> +	}
> +	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
> +	/*
> +	 * We don't know how long the extent containing iocb->ki_pos is, but if
> +	 * it's compressed we know that it won't be longer than this.
> +	 */
> +	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
> +
> +	for (;;) {
> +		struct btrfs_ordered_extent *ordered;
> +
> +		ret = btrfs_wait_ordered_range(inode, start,
> +					       lockend - start + 1);
> +		if (ret)
> +			goto out_unlock_inode;
> +		lock_extent_bits(io_tree, start, lockend, &cached_state);
> +		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
> +						     lockend - start + 1);
> +		if (!ordered)
> +			break;
> +		btrfs_put_ordered_extent(ordered);
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		cond_resched();
> +	}
> +
> +	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start,
> +			      lockend - start + 1, 0);
> +	if (IS_ERR(em)) {
> +		ret = PTR_ERR(em);
> +		goto out_unlock_extent;
> +	}
> +	em_bdev = em->bdev;
> +
> +	if (em->block_start == EXTENT_MAP_INLINE) {
> +		u64 extent_start = em->start;
> +
> +		/*
> +		 * For inline extents we get everything we need out of the
> +		 * extent item.
> +		 */
> +		free_extent_map(em);
> +		em = NULL;
> +		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
> +						&cached_state, extent_start,
> +						count, &encoded, &unlocked);
> +		goto out;
> +	}
> +
> +	/*
> +	 * We only want to return up to EOF even if the extent extends beyond
> +	 * that.
> +	 */
> +	encoded.len = (min_t(u64, extent_map_end(em), inode->i_size) -
> +		       iocb->ki_pos);
> +	if (em->block_start == EXTENT_MAP_HOLE ||
> +	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
> +		offset = EXTENT_MAP_HOLE;
> +	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
> +		offset = em->block_start;
> +		/*
> +		 * Bail if the buffer isn't large enough to return the whole
> +		 * compressed extent.
> +		 */
> +		if (em->block_len > count) {
> +			ret = -EFBIG;
> +			goto out_em;
> +		}
> +		disk_io_size = count = em->block_len;
> +		encoded.unencoded_len = em->ram_bytes;
> +		encoded.unencoded_offset = iocb->ki_pos - em->orig_start;
> +		ret = encoded_iov_compression_from_btrfs(&encoded,
> +							 em->compress_type);
> +		if (ret)
> +			goto out_em;
> +	} else {
> +		offset = em->block_start + (start - em->start);
> +		if (encoded.len > count)
> +			encoded.len = count;
> +		/*
> +		 * Don't read beyond what we locked. This also limits the page
> +		 * allocations that we'll do.
> +		 */
> +		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded.len) - start;
> +		encoded.len = encoded.unencoded_len = count =
> +			start + disk_io_size - iocb->ki_pos;
> +		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
> +	}
> +	free_extent_map(em);
> +	em = NULL;
> +
> +	if (offset == EXTENT_MAP_HOLE) {
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +		inode_unlock(inode);
> +		unlocked = true;
> +		if (copy_to_iter(&encoded, sizeof(encoded), iter) ==
> +		    sizeof(encoded))
> +			ret = 0;
> +		else
> +			ret = -EFAULT;
> +	} else {
> +		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
> +						 &cached_state, em_bdev, offset,
> +						 disk_io_size, count, &encoded,
> +						 &unlocked);
> +	}
> +
> +out:
> +	if (ret >= 0)
> +		iocb->ki_pos += encoded.len;
> +out_em:
> +	free_extent_map(em);
> +out_unlock_extent:
> +	if (!unlocked)
> +		unlock_extent_cached(io_tree, start, lockend, &cached_state);
> +out_unlock_inode:
> +	if (!unlocked)
> +		inode_unlock(inode);
> +	return ret;
> +}
> +
>  #ifdef CONFIG_SWAP
>  /*
>   * Add an entry indicating a block group or device which is pinned by a
> 

^ permalink raw reply

* Re: [PATCHv7 18/33] lib/vdso: Add unlikely() hint into vdso_read_begin()
From: Vincenzo Frascino @ 2019-10-16 11:24 UTC (permalink / raw)
  To: Dmitry Safonov, linux-kernel
  Cc: Dmitry Safonov, Andrei Vagin, Adrian Reber, Andrei Vagin,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	Thomas Gleixner, containers, criu, linux-api, x86
In-Reply-To: <20191011012341.846266-19-dima@arista.com>

On 10/11/19 2:23 AM, Dmitry Safonov wrote:
> From: Andrei Vagin <avagin@gmail.com>
> 
> Place the branch with no concurrent write before contended case.
> 
> Performance numbers for Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
> (more clock_gettime() cycles - the better):
>         | before    | after
> -----------------------------------
>         | 150252214 | 153242367
>         | 150301112 | 153324800
>         | 150392773 | 153125401
>         | 150373957 | 153399355
>         | 150303157 | 153489417
>         | 150365237 | 153494270
> -----------------------------------
> avg     | 150331408 | 153345935
> diff %  | 2	    | 0
> -----------------------------------
> stdev % | 0.3	    | 0.1
> 
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> Co-developed-by: Dmitry Safonov <dima@arista.com>
> Signed-off-by: Dmitry Safonov <dima@arista.com>

Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Tested-by: Vincenzo Frascino <vincenzo.frascino@arm.com>

> ---
>  include/vdso/helpers.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h
> index 01641dbb68ef..9a2af9fca45e 100644
> --- a/include/vdso/helpers.h
> +++ b/include/vdso/helpers.h
> @@ -10,7 +10,7 @@ static __always_inline u32 vdso_read_begin(const struct vdso_data *vd)
>  {
>  	u32 seq;
>  
> -	while ((seq = READ_ONCE(vd->seq)) & 1)
> +	while (unlikely((seq = READ_ONCE(vd->seq)) & 1))
>  		cpu_relax();
>  
>  	smp_rmb();
> 

-- 
Regards,
Vincenzo

^ permalink raw reply

* [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Michael Ellerman @ 2019-10-16 12:27 UTC (permalink / raw)
  To: cyphar
  Cc: mingo, peterz, alexander.shishkin, jolsa, namhyung, christian,
	keescook, linux, viro, torvalds, linux-api, linux-kernel
In-Reply-To: <20191011022447.24249-1-mpe@ellerman.id.au>

On a machine with a 64K PAGE_SIZE, the nested for loops in
test_check_nonzero_user() can lead to soft lockups, eg:

  watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
  Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
  CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
  ...
  NIP __might_sleep+0x20/0xc0
  LR  __might_fault+0x40/0x60
  Call Trace:
    check_zeroed_user+0x12c/0x200
    test_user_copy_init+0x67c/0x1210 [test_user_copy]
    do_one_initcall+0x60/0x340
    do_init_module+0x7c/0x2f0
    load_module+0x2d94/0x30e0
    __do_sys_finit_module+0xc8/0x150
    system_call+0x5c/0x68

Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
tweak it to only scan a 1024 byte region, but make it cross the
page boundary.

Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 lib/test_user_copy.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

v2: Rework calculation to just use PAGE_SIZE directly.
    Rebase onto Christian's tree.

diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
index ad2372727b1b..5ff04d8fe971 100644
--- a/lib/test_user_copy.c
+++ b/lib/test_user_copy.c
@@ -47,9 +47,25 @@ static bool is_zeroed(void *from, size_t size)
 static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size)
 {
 	int ret = 0;
-	size_t start, end, i;
-	size_t zero_start = size / 4;
-	size_t zero_end = size - zero_start;
+	size_t start, end, i, zero_start, zero_end;
+
+	if (test(size < 2 * PAGE_SIZE, "buffer too small"))
+		return -EINVAL;
+
+	/*
+	 * We want to cross a page boundary to exercise the code more
+	 * effectively. We also don't want to make the size we scan too large,
+	 * otherwise the test can take a long time and cause soft lockups. So
+	 * scan a 1024 byte region across the page boundary.
+	 */
+	size = 1024;
+	start = PAGE_SIZE - (size / 2);
+
+	kmem += start;
+	umem += start;
+
+	zero_start = size / 4;
+	zero_end = size - zero_start;
 
 	/*
 	 * We conduct a series of check_nonzero_user() tests on a block of
-- 
2.21.0

^ permalink raw reply related

* Re: [PATCH] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Michael Ellerman @ 2019-10-16 12:28 UTC (permalink / raw)
  To: Christian Brauner, Aleksa Sarai
  Cc: mingo, peterz, alexander.shishkin, jolsa, namhyung, keescook,
	linux, viro, torvalds, libc-alpha, linux-api, linux-kernel
In-Reply-To: <20191011094333.7hovhhacrvlf6uq6@wittgenstein>

Christian Brauner <christian.brauner@ubuntu.com> writes:
> On Fri, Oct 11, 2019 at 02:48:10PM +1100, Aleksa Sarai wrote:
>> On 2019-10-11, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> > On a machine with a 64K PAGE_SIZE, the nested for loops in
>> > test_check_nonzero_user() can lead to soft lockups, eg:
>> > 
>> >   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
>> >   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
>> >   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
>> >   ...
>> >   NIP __might_sleep+0x20/0xc0
>> >   LR  __might_fault+0x40/0x60
>> >   Call Trace:
>> >     check_zeroed_user+0x12c/0x200
>> >     test_user_copy_init+0x67c/0x1210 [test_user_copy]
>> >     do_one_initcall+0x60/0x340
>> >     do_init_module+0x7c/0x2f0
>> >     load_module+0x2d94/0x30e0
>> >     __do_sys_finit_module+0xc8/0x150
>> >     system_call+0x5c/0x68
>> > 
>> > Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
>> > tweak it to only scan a 1024 byte region, but make it cross the
>> > page boundary.
>> > 
>> > Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
>> > Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
>> > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
>> > ---
>> >  lib/test_user_copy.c | 23 ++++++++++++++++++++---
>> >  1 file changed, 20 insertions(+), 3 deletions(-)
>> > 
>> > How does this look? It runs in < 1s on my machine here.
>> > 
>> > cheers
>> > 
>> > diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
>> > index 950ee88cd6ac..9fb6bc609d4c 100644
>> > --- a/lib/test_user_copy.c
>> > +++ b/lib/test_user_copy.c
>> > @@ -47,9 +47,26 @@ static bool is_zeroed(void *from, size_t size)
>> >  static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size)
>> >  {
>> >  	int ret = 0;
>> > -	size_t start, end, i;
>> > -	size_t zero_start = size / 4;
>> > -	size_t zero_end = size - zero_start;
>> > +	size_t start, end, i, zero_start, zero_end;
>> > +
>> > +	if (test(size < 1024, "buffer too small"))
>> > +		return -EINVAL;
>> > +
>> > +	/*
>> > +	 * We want to cross a page boundary to exercise the code more
>> > +	 * effectively. We assume the buffer we're passed has a page boundary at
>> > +	 * size / 2. We also don't want to make the size we scan too large,
>> > +	 * otherwise the test can take a long time and cause soft lockups. So
>> > +	 * scan a 1024 byte region across the page boundary.
>> > +	 */
>> > +	start = size / 2 - 512;
>> > +	size = 1024;
>> 
>> I don't think it's necessary to do "size / 2" here -- you can just use
>> PAGE_SIZE directly and check above that "size == 2*PAGE_SIZE" (not that
>> this check is exceptionally necessary -- since there's only one caller
>> of this function and it's in the same file).
>
> Michael, in case you resend, can you make my life a little easier and do
> it on top of
> https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=copy_struct_from_user
> please. I have a fix from Aleksa sitting in there laready that _might_
> cause a conflict otherwise.

No worries, done.

cheers

^ permalink raw reply

* Re: [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Aleksa Sarai @ 2019-10-16 12:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: mingo, peterz, alexander.shishkin, jolsa, namhyung, christian,
	keescook, linux, viro, torvalds, linux-api, linux-kernel
In-Reply-To: <20191016122732.13467-1-mpe@ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 2726 bytes --]

On 2019-10-16, Michael Ellerman <mpe@ellerman.id.au> wrote:
> On a machine with a 64K PAGE_SIZE, the nested for loops in
> test_check_nonzero_user() can lead to soft lockups, eg:
> 
>   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
>   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
>   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
>   ...
>   NIP __might_sleep+0x20/0xc0
>   LR  __might_fault+0x40/0x60
>   Call Trace:
>     check_zeroed_user+0x12c/0x200
>     test_user_copy_init+0x67c/0x1210 [test_user_copy]
>     do_one_initcall+0x60/0x340
>     do_init_module+0x7c/0x2f0
>     load_module+0x2d94/0x30e0
>     __do_sys_finit_module+0xc8/0x150
>     system_call+0x5c/0x68
> 
> Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
> tweak it to only scan a 1024 byte region, but make it cross the
> page boundary.
> 
> Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
> Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Thanks Michael.

Reviewed-by: Aleksa Sarai <cyphar@cyphar.com>

> ---
>  lib/test_user_copy.c | 22 +++++++++++++++++++---
>  1 file changed, 19 insertions(+), 3 deletions(-)
> 
> v2: Rework calculation to just use PAGE_SIZE directly.
>     Rebase onto Christian's tree.
> 
> diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
> index ad2372727b1b..5ff04d8fe971 100644
> --- a/lib/test_user_copy.c
> +++ b/lib/test_user_copy.c
> @@ -47,9 +47,25 @@ static bool is_zeroed(void *from, size_t size)
>  static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size)
>  {
>  	int ret = 0;
> -	size_t start, end, i;
> -	size_t zero_start = size / 4;
> -	size_t zero_end = size - zero_start;
> +	size_t start, end, i, zero_start, zero_end;
> +
> +	if (test(size < 2 * PAGE_SIZE, "buffer too small"))
> +		return -EINVAL;
> +
> +	/*
> +	 * We want to cross a page boundary to exercise the code more
> +	 * effectively. We also don't want to make the size we scan too large,
> +	 * otherwise the test can take a long time and cause soft lockups. So
> +	 * scan a 1024 byte region across the page boundary.
> +	 */
> +	size = 1024;
> +	start = PAGE_SIZE - (size / 2);
> +
> +	kmem += start;
> +	umem += start;
> +
> +	zero_start = size / 4;
> +	zero_end = size - zero_start;
>  
>  	/*
>  	 * We conduct a series of check_nonzero_user() tests on a block of
> -- 
> 2.21.0
> 


-- 
Aleksa Sarai
Senior Software Engineer (Containers)
SUSE Linux GmbH
<https://www.cyphar.com/>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Christian Brauner @ 2019-10-16 12:45 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Aleksa Sarai, mingo, peterz, alexander.shishkin, jolsa, namhyung,
	keescook, linux, viro, torvalds, libc-alpha, linux-api,
	linux-kernel
In-Reply-To: <87a7a0ub6j.fsf@mpe.ellerman.id.au>

On Wed, Oct 16, 2019 at 11:28:20PM +1100, Michael Ellerman wrote:
> Christian Brauner <christian.brauner@ubuntu.com> writes:
> > On Fri, Oct 11, 2019 at 02:48:10PM +1100, Aleksa Sarai wrote:
> >> On 2019-10-11, Michael Ellerman <mpe@ellerman.id.au> wrote:
> >> > On a machine with a 64K PAGE_SIZE, the nested for loops in
> >> > test_check_nonzero_user() can lead to soft lockups, eg:
> >> > 
> >> >   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
> >> >   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
> >> >   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
> >> >   ...
> >> >   NIP __might_sleep+0x20/0xc0
> >> >   LR  __might_fault+0x40/0x60
> >> >   Call Trace:
> >> >     check_zeroed_user+0x12c/0x200
> >> >     test_user_copy_init+0x67c/0x1210 [test_user_copy]
> >> >     do_one_initcall+0x60/0x340
> >> >     do_init_module+0x7c/0x2f0
> >> >     load_module+0x2d94/0x30e0
> >> >     __do_sys_finit_module+0xc8/0x150
> >> >     system_call+0x5c/0x68
> >> > 
> >> > Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
> >> > tweak it to only scan a 1024 byte region, but make it cross the
> >> > page boundary.
> >> > 
> >> > Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
> >> > Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
> >> > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> >> > ---
> >> >  lib/test_user_copy.c | 23 ++++++++++++++++++++---
> >> >  1 file changed, 20 insertions(+), 3 deletions(-)
> >> > 
> >> > How does this look? It runs in < 1s on my machine here.
> >> > 
> >> > cheers
> >> > 
> >> > diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
> >> > index 950ee88cd6ac..9fb6bc609d4c 100644
> >> > --- a/lib/test_user_copy.c
> >> > +++ b/lib/test_user_copy.c
> >> > @@ -47,9 +47,26 @@ static bool is_zeroed(void *from, size_t size)
> >> >  static int test_check_nonzero_user(char *kmem, char __user *umem, size_t size)
> >> >  {
> >> >  	int ret = 0;
> >> > -	size_t start, end, i;
> >> > -	size_t zero_start = size / 4;
> >> > -	size_t zero_end = size - zero_start;
> >> > +	size_t start, end, i, zero_start, zero_end;
> >> > +
> >> > +	if (test(size < 1024, "buffer too small"))
> >> > +		return -EINVAL;
> >> > +
> >> > +	/*
> >> > +	 * We want to cross a page boundary to exercise the code more
> >> > +	 * effectively. We assume the buffer we're passed has a page boundary at
> >> > +	 * size / 2. We also don't want to make the size we scan too large,
> >> > +	 * otherwise the test can take a long time and cause soft lockups. So
> >> > +	 * scan a 1024 byte region across the page boundary.
> >> > +	 */
> >> > +	start = size / 2 - 512;
> >> > +	size = 1024;
> >> 
> >> I don't think it's necessary to do "size / 2" here -- you can just use
> >> PAGE_SIZE directly and check above that "size == 2*PAGE_SIZE" (not that
> >> this check is exceptionally necessary -- since there's only one caller
> >> of this function and it's in the same file).
> >
> > Michael, in case you resend, can you make my life a little easier and do
> > it on top of
> > https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=copy_struct_from_user
> > please. I have a fix from Aleksa sitting in there laready that _might_
> > cause a conflict otherwise.
> 
> No worries, done.

Thank you!
Christian

^ permalink raw reply

* Re: [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Christian Brauner @ 2019-10-16 12:50 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: cyphar, mingo, peterz, alexander.shishkin, jolsa, namhyung,
	christian, keescook, linux, viro, torvalds, linux-api,
	linux-kernel
In-Reply-To: <20191016122732.13467-1-mpe@ellerman.id.au>

On Wed, Oct 16, 2019 at 11:27:32PM +1100, Michael Ellerman wrote:
> On a machine with a 64K PAGE_SIZE, the nested for loops in
> test_check_nonzero_user() can lead to soft lockups, eg:
> 
>   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
>   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
>   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
>   ...
>   NIP __might_sleep+0x20/0xc0
>   LR  __might_fault+0x40/0x60
>   Call Trace:
>     check_zeroed_user+0x12c/0x200
>     test_user_copy_init+0x67c/0x1210 [test_user_copy]
>     do_one_initcall+0x60/0x340
>     do_init_module+0x7c/0x2f0
>     load_module+0x2d94/0x30e0
>     __do_sys_finit_module+0xc8/0x150
>     system_call+0x5c/0x68
> 
> Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
> tweak it to only scan a 1024 byte region, but make it cross the
> page boundary.
> 
> Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
> Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

Thanks!
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>

^ permalink raw reply

* Re: [PATCH v2] usercopy: Avoid soft lockups in test_check_nonzero_user()
From: Christian Brauner @ 2019-10-16 13:03 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: cyphar, mingo, peterz, alexander.shishkin, jolsa, namhyung,
	christian, keescook, linux, viro, torvalds, linux-api,
	linux-kernel
In-Reply-To: <20191016122732.13467-1-mpe@ellerman.id.au>

On Wed, Oct 16, 2019 at 11:27:32PM +1100, Michael Ellerman wrote:
> On a machine with a 64K PAGE_SIZE, the nested for loops in
> test_check_nonzero_user() can lead to soft lockups, eg:
> 
>   watchdog: BUG: soft lockup - CPU#4 stuck for 22s! [modprobe:611]
>   Modules linked in: test_user_copy(+) vmx_crypto gf128mul crc32c_vpmsum virtio_balloon ip_tables x_tables autofs4
>   CPU: 4 PID: 611 Comm: modprobe Tainted: G             L    5.4.0-rc1-gcc-8.2.0-00001-gf5a1a536fa14-dirty #1151
>   ...
>   NIP __might_sleep+0x20/0xc0
>   LR  __might_fault+0x40/0x60
>   Call Trace:
>     check_zeroed_user+0x12c/0x200
>     test_user_copy_init+0x67c/0x1210 [test_user_copy]
>     do_one_initcall+0x60/0x340
>     do_init_module+0x7c/0x2f0
>     load_module+0x2d94/0x30e0
>     __do_sys_finit_module+0xc8/0x150
>     system_call+0x5c/0x68
> 
> Even with a 4K PAGE_SIZE the test takes multiple seconds. Instead
> tweak it to only scan a 1024 byte region, but make it cross the
> page boundary.
> 
> Fixes: f5a1a536fa14 ("lib: introduce copy_struct_from_user() helper")
> Suggested-by: Aleksa Sarai <cyphar@cyphar.com>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

With Aleksa's Reviewed-by I've picked this up:
https://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git/log/?h=copy_struct_from_user

Thanks!
Christian

^ permalink raw reply

* Re: [PATCHv7 01/33] ns: Introduce Time Namespace
From: Dmitry Safonov @ 2019-10-16 13:57 UTC (permalink / raw)
  To: Vincenzo Frascino, Thomas Gleixner
  Cc: Dmitry Safonov, linux-kernel, Andrei Vagin, Adrian Reber,
	Andy Lutomirski, Arnd Bergmann, Christian Brauner,
	Cyrill Gorcunov, Eric W. Biederman, H. Peter Anvin, Ingo Molnar,
	Jann Horn, Jeff Dike, Oleg Nesterov, Pavel Emelyanov, Shuah Khan,
	containers, criu, linux-api, x86, Andrei Vagin
In-Reply-To: <5f4c2f29-ca68-b19c-017f-d23730f6e871@arm.com>

On 10/16/19 11:44 AM, Vincenzo Frascino wrote:
> On 10/16/19 11:39 AM, Thomas Gleixner wrote:
[..]
>> config TIME_NS
>> 	bool "TIME namespace"
>> 	depends on GENERIC_VDSO_TIME_NS
>> 	default y
>>
>> and in lib/vdso/Kconfig
>>
>> config GENERIC_VDSO_TIME_NS
>> 	bool
>>
>> and let architectures which have support for the VDSO bits select it.
>>
> 
> Agreed, this is even better.

Thanks, will fix in v8,
          Dmitry

^ permalink raw reply

* Re: [RFC PATCH 02/21] Add a prelocked wake-up
From: David Howells @ 2019-10-16 14:26 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: dhowells, Tim Chen, Kan Liang, Casey Schaufler, Stephen Smalley,
	Greg Kroah-Hartman, Nicolas Dichtel, raven, Christian Brauner,
	keyrings, linux-usb, linux-block, LSM List, linux-fsdevel,
	Linux API, Linux Kernel Mailing List
In-Reply-To: <CAHk-=whiz1sHXu8SVZKEC2dup=r5JMrftPtEt6ff9Ea8dyH8yQ@mail.gmail.com>

Btw, is there any point in __wake_up_sync_key() taking a nr_exclusive
argument since it clears WF_SYNC if nr_exclusive != 1 and doesn't make sense
to be >1 anyway.

David
---
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 3eb7cae8206c..bb7676d396cd 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -201,9 +201,9 @@ void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
 		unsigned int mode, void *key, wait_queue_entry_t *bookmark);
-void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
 
 #define wake_up(x)			__wake_up(x, TASK_NORMAL, 1, NULL)
 #define wake_up_nr(x, nr)		__wake_up(x, TASK_NORMAL, nr, NULL)
@@ -214,7 +214,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 #define wake_up_interruptible(x)	__wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
 #define wake_up_interruptible_nr(x, nr)	__wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
 #define wake_up_interruptible_all(x)	__wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
-#define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE, 1)
+#define wake_up_interruptible_sync(x)	__wake_up_sync((x), TASK_INTERRUPTIBLE)
 
 /*
  * Wakeup macros to be used to report events to the targets.
@@ -228,7 +228,7 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr);
 #define wake_up_interruptible_poll(x, m)					\
 	__wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
 #define wake_up_interruptible_sync_poll(x, m)					\
-	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, poll_to_key(m))
+	__wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
 
 #define ___wait_cond_timeout(condition)						\
 ({										\
diff --git a/kernel/exit.c b/kernel/exit.c
index a46a50d67002..a1ff25ef050e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
 {
 	__wake_up_sync_key(&parent->signal->wait_chldexit,
-				TASK_INTERRUPTIBLE, 1, p);
+			   TASK_INTERRUPTIBLE, p);
 }
 
 static long do_wait(struct wait_opts *wo)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index c1e566a114ca..b4b52361dab7 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @wq_head: the waitqueue
  * @mode: which threads
- * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: opaque value to be passed to wakeup targets
  *
  * The sync wakeup differs that the waker knows that it will schedule
@@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark);
  * accessing the task state.
  */
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, void *key)
+			void *key)
 {
-	int wake_flags = 1; /* XXX WF_SYNC */
-
 	if (unlikely(!wq_head))
 		return;
 
-	if (unlikely(nr_exclusive != 1))
-		wake_flags = 0;
-
-	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
+	__wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
 
 /*
  * __wake_up_sync - see __wake_up_sync_key()
  */
-void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive)
+void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode)
 {
-	__wake_up_sync_key(wq_head, mode, nr_exclusive, NULL);
+	__wake_up_sync_key(wq_head, mode, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox