[PATCH 3/5] uffd: Add fork() event

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Mike Rapoport <rapoport@il.ibm.com>
To: Andrea Arcangeli <aarcange@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>,
	LKML <linux-kernel@vger.kernel.org>,
	linux-mm@kvack.org, Mike Rapoport <mike.rapoport@gmail.com>,
	Mike Rapoport <rapoport@il.ibm.com>
Subject: [PATCH 3/5] uffd: Add fork() event
Date: Sun, 20 Mar 2016 14:42:19 +0200	[thread overview]
Message-ID: <1458477741-6942-4-git-send-email-rapoport@il.ibm.com> (raw)
In-Reply-To: <1458477741-6942-1-git-send-email-rapoport@il.ibm.com>

From: Pavel Emelyanov <xemul@parallels.com>

When the mm with uffd-ed vmas fork()-s the respective vmas
notify their uffds with the event which contains a descriptor
with new uffd. This new descriptor can then be used to get
events from the child and populate its mm with data. Note,
that there can be different uffd-s controlling different
vmas within one mm, so first we should collect all those
uffds (and ctx-s) in a list and then notify them all one by
one but only once per fork().

The context is created at fork() time but the descriptor, file
struct and anon inode object is created at event read time. So
some trickery is added to the userfaultfd_ctx_read() to handle
the ctx queues' locking vs file creation.

Another thing worth noticing is that the task that fork()-s
waits for the uffd event to get processed WITHOUT the mmap sem.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Mike Rapoport <rapoport@il.ibm.com>
---
 fs/userfaultfd.c                 | 146 ++++++++++++++++++++++++++++++++++++++-
 include/linux/userfaultfd_k.h    |  12 ++++
 include/uapi/linux/userfaultfd.h |  13 ++--
 kernel/fork.c                    |  10 ++-
 4 files changed, 169 insertions(+), 12 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index c8e7039..565d8f2 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -65,6 +65,12 @@ struct userfaultfd_ctx {
 
 };
 
+struct userfaultfd_fork_ctx {
+	struct userfaultfd_ctx *orig;
+	struct userfaultfd_ctx *new;
+	struct list_head list;
+};
+
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_t wq;
@@ -431,9 +437,8 @@ out:
 	return ret;
 }
 
-static int __maybe_unused userfaultfd_event_wait_completion(
-		struct userfaultfd_ctx *ctx,
-		struct userfaultfd_wait_queue *ewq)
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+					     struct userfaultfd_wait_queue *ewq)
 {
 	int ret = 0;
 
@@ -484,6 +489,79 @@ static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
 	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
 }
 
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+	struct userfaultfd_ctx *ctx = NULL, *octx;
+	struct userfaultfd_fork_ctx *fctx;
+
+	octx = vma->vm_userfaultfd_ctx.ctx;
+	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		return 0;
+	}
+
+	list_for_each_entry(fctx, fcs, list)
+		if (fctx->orig == octx) {
+			ctx = fctx->new;
+			break;
+		}
+
+	if (!ctx) {
+		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+		if (!fctx)
+			return -ENOMEM;
+
+		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+		if (!ctx) {
+			kfree(fctx);
+			return -ENOMEM;
+		}
+
+		atomic_set(&ctx->refcount, 1);
+		ctx->flags = octx->flags;
+		ctx->state = UFFD_STATE_RUNNING;
+		ctx->features = octx->features;
+		ctx->released = false;
+		ctx->mm = vma->vm_mm;
+		atomic_inc(&ctx->mm->mm_users);
+
+		userfaultfd_ctx_get(octx);
+		fctx->orig = octx;
+		fctx->new = ctx;
+		list_add_tail(&fctx->list, fcs);
+	}
+
+	vma->vm_userfaultfd_ctx.ctx = ctx;
+	return 0;
+}
+
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+	struct userfaultfd_ctx *ctx = fctx->orig;
+	struct userfaultfd_wait_queue ewq;
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_FORK;
+	ewq.msg.arg.reserved.reserved1 = (__u64)fctx->new;
+
+	return userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+	int ret = 0;
+	struct userfaultfd_fork_ctx *fctx, *n;
+
+	list_for_each_entry_safe(fctx, n, fcs, list) {
+		if (!ret)
+			ret = dup_fctx(fctx);
+		list_del(&fctx->list);
+		kfree(fctx);
+	}
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
@@ -614,12 +692,49 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 	}
 }
 
+static const struct file_operations userfaultfd_fops;
+
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+				  struct userfaultfd_ctx *new,
+				  struct uffd_msg *msg)
+{
+	int fd;
+	struct file *file;
+	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+				  O_RDWR | flags);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	msg->arg.reserved.reserved1 = 0;
+	msg->arg.fork.ufd = fd;
+
+	return 0;
+}
+
 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 				    struct uffd_msg *msg)
 {
 	ssize_t ret;
 	DECLARE_WAITQUEUE(wait, current);
 	struct userfaultfd_wait_queue *uwq;
+	/*
+	 * Handling fork event requires sleeping operations, so
+	 * we drop the event_wqh lock, then do these ops, then
+	 * lock it back and wake up the waiter. While the lock is
+	 * dropped the ewq may go away so we keep track of it
+	 * carefully.
+	 */
+	LIST_HEAD(fork_event);
+	struct userfaultfd_ctx *fork_nctx = NULL;
 
 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
 	spin_lock(&ctx->fd_wqh.lock);
@@ -677,6 +792,14 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 		if (uwq) {
 			*msg = uwq->msg;
 
+			if (uwq->msg.event == UFFD_EVENT_FORK) {
+				fork_nctx = (struct userfaultfd_ctx *)uwq->msg.arg.reserved.reserved1;
+				list_move(&uwq->wq.task_list, &fork_event);
+				spin_unlock(&ctx->event_wqh.lock);
+				ret = 0;
+				break;
+			}
+
 			userfaultfd_event_complete(ctx, uwq);
 			spin_unlock(&ctx->event_wqh.lock);
 			ret = 0;
@@ -700,6 +823,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock(&ctx->fd_wqh.lock);
 
+	if (!ret && msg->event == UFFD_EVENT_FORK) {
+		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+		if (!ret) {
+			spin_lock(&ctx->event_wqh.lock);
+			if (!list_empty(&fork_event)) {
+				uwq = list_first_entry(&fork_event,
+						       typeof(*uwq),
+						       wq.task_list);
+				list_del(&uwq->wq.task_list);
+				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+				userfaultfd_event_complete(ctx, uwq);
+			}
+			spin_unlock(&ctx->event_wqh.lock);
+		}
+	}
+
 	return ret;
 }
 
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 587480a..0c7b723 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -53,6 +53,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
 }
 
+extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
+extern void dup_userfaultfd_complete(struct list_head *);
+
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
@@ -80,6 +83,15 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return false;
 }
 
+static inline int dup_userfaultfd(struct vm_area_struct *, struct list_head *)
+{
+	return 0;
+}
+
+static inline void dup_userfaultfd_complete(struct list_head *)
+{
+}
+
 #endif /* CONFIG_USERFAULTFD */
 
 #endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 9057d7a..d89eef6 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -14,10 +14,9 @@
 #define UFFD_API ((__u64)0xAA)
 /*
  * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- *			      UFFD_FEATURE_EVENT_FORK)
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP)
  */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK)
 #define UFFD_API_IOCTLS				\
 	((__u64)1 << _UFFDIO_REGISTER |		\
 	 (__u64)1 << _UFFDIO_UNREGISTER |	\
@@ -72,6 +71,10 @@ struct uffd_msg {
 		} pagefault;
 
 		struct {
+			__u32	ufd;
+		} fork;
+
+		struct {
 			/* unused reserved fields */
 			__u64	reserved1;
 			__u64	reserved2;
@@ -84,9 +87,7 @@ struct uffd_msg {
  * Start at 0x12 and not at 0 to be more strict against bugs.
  */
 #define UFFD_EVENT_PAGEFAULT	0x12
-#if 0 /* not available yet */
 #define UFFD_EVENT_FORK		0x13
-#endif
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
@@ -107,8 +108,8 @@ struct uffdio_api {
 	 */
 #if 0 /* not available yet */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
-#define UFFD_FEATURE_EVENT_FORK			(1<<1)
 #endif
+#define UFFD_FEATURE_EVENT_FORK			(1<<1)
 	__u64 features;
 
 	__u64 ioctls;
diff --git a/kernel/fork.c b/kernel/fork.c
index accb722..0624762 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -55,6 +55,7 @@
 #include <linux/rmap.h>
 #include <linux/ksm.h>
 #include <linux/acct.h>
+#include <linux/userfaultfd_k.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
@@ -408,6 +409,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
+	LIST_HEAD(uf);
 
 	uprobe_start_dup_mmap();
 	down_write(&oldmm->mmap_sem);
@@ -461,12 +463,13 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		if (retval)
 			goto fail_nomem_policy;
 		tmp->vm_mm = mm;
+		retval = dup_userfaultfd(tmp, &uf);
+		if (retval)
+			goto fail_nomem_anon_vma_fork;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &=
-			~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 		tmp->vm_next = tmp->vm_prev = NULL;
-		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
@@ -522,6 +525,7 @@ out:
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
+	dup_userfaultfd_complete(&uf);
 	uprobe_end_dup_mmap();
 	return retval;
 fail_nomem_anon_vma_fork:
-- 
1.9.1

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

next prev parent reply	other threads:[~2016-03-20 12:42 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-20 12:42 [PATCH 0/5] userfaultfd: extension for non cooperative uffd usage Mike Rapoport
2016-03-20 12:42 ` [PATCH 1/5] uffd: Split the find_userfault() routine Mike Rapoport
2016-03-20 12:42 ` [PATCH 2/5] uffd: Add ability to report non-PF events from uffd descriptor Mike Rapoport
2016-03-20 12:42 ` Mike Rapoport [this message]
2016-03-20 12:53   ` [PATCH 3/5] uffd: Add fork() event kbuild test robot
2016-03-20 12:42 ` [PATCH 4/5] uffd: Add mremap() event Mike Rapoport
2016-03-20 12:42 ` [PATCH 5/5] uffd: Add madvise() event for MADV_DONTNEED request Mike Rapoport
2016-03-21  9:54 ` [PATCH 0/5] userfaultfd: extension for non cooperative uffd usage Pavel Emelyanov
2016-04-06  6:14   ` Mike Rapoport
2016-04-20  9:44 ` Pavel Emelyanov
2016-04-22 16:05   ` Andrea Arcangeli
  -- strict thread matches above, loose matches on Subject: below --
2015-05-13 14:52 [PATCH 0/5] UserfaultFD: Extension for non cooperative uffd usage (v2) Pavel Emelyanov
2015-05-13 14:54 ` [PATCH 3/5] uffd: Add fork() event Pavel Emelyanov

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:c8e7039 dfblob:565d8f2 dfblob:587480a dfblob:0c7b723
dfblob:9057d7a dfblob:d89eef6 dfblob:accb722 dfblob:0624762 )
 OR (
bs:"[PATCH 3/5] uffd: Add fork() event" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1458477741-6942-4-git-send-email-rapoport@il.ibm.com \
    --to=rapoport@il.ibm.com \
    --cc=aarcange@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mike.rapoport@gmail.com \
    --cc=xemul@parallels.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).