linux-api.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Andrea Arcangeli <aarcange@redhat.com>
To: Andrew Morton <akpm@linux-foundation.org>,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	qemu-devel@nongnu.org, kvm@vger.kernel.org,
	linux-api@vger.kernel.org
Cc: Pavel Emelyanov <xemul@parallels.com>,
	Sanidhya Kashyap <sanidhya.gatech@gmail.com>,
	zhang.zhanghailiang@huawei.com,
	Linus Torvalds <torvalds@linux-foundation.org>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Andres Lagar-Cavilla <andreslc@google.com>,
	Dave Hansen <dave.hansen@intel.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Rik van Riel <riel@redhat.com>, Mel Gorman <mgorman@suse.de>,
	Andy Lutomirski <luto@amacapital.net>,
	Hugh Dickins <hughd@google.com>,
	Peter Feiner <pfeiner@google.com>,
	"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	"Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Subject: [PATCH 14/23] userfaultfd: wake pending userfaults
Date: Thu, 14 May 2015 19:31:11 +0200	[thread overview]
Message-ID: <1431624680-20153-15-git-send-email-aarcange@redhat.com> (raw)
In-Reply-To: <1431624680-20153-1-git-send-email-aarcange@redhat.com>

This is an optimization but it's a userland visible one and it affects
the API.

The downside of this optimization is that if you call poll() and you
get POLLIN, read(ufd) may still return -EAGAIN. The blocked userfault
may be woken by a different thread before read(ufd) comes
around. In short, this means that poll() isn't really usable if the
userfaultfd is opened in blocking mode.

Userfaults won't wait in "pending" state to be read anymore, and any
UFFDIO_WAKE or similar operation that has the objective of waking
userfaults after their resolution will wake all blocked userfaults
for the resolved range, including those that haven't been read() by
userland yet.

The behavior of poll() becomes non-standard, but this obviates the
need for "spurious" UFFDIO_WAKE and it lets the userland threads
restart immediately without requiring a UFFDIO_WAKE. This is even
more significant in case of repeated faults on the same address from
multiple threads.

This optimization is justified by the measurement that the number of
spurious UFFDIO_WAKE accounts for between 5% and 10% of the total
userfaults for heavy workloads, so it's worth optimizing those away.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 fs/userfaultfd.c | 65 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b45cefe..50edbd8 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -52,6 +52,10 @@ struct userfaultfd_ctx {
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_t wq;
+	/*
+	 * Only relevant when queued in fault_wqh and only used by the
+	 * read operation to avoid reading the same userfault twice.
+	 */
 	bool pending;
 	struct userfaultfd_ctx *ctx;
 };
@@ -71,9 +75,6 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
 
 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 	ret = 0;
-	/* don't wake the pending ones to avoid reads to block */
-	if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
-		goto out;
 	/* len == 0 means wake all */
 	start = range->start;
 	len = range->len;
@@ -183,12 +184,14 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	struct mm_struct *mm = vma->vm_mm;
 	struct userfaultfd_ctx *ctx;
 	struct userfaultfd_wait_queue uwq;
+	int ret;
 
 	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
+	ret = VM_FAULT_SIGBUS;
 	ctx = vma->vm_userfaultfd_ctx.ctx;
 	if (!ctx)
-		return VM_FAULT_SIGBUS;
+		goto out;
 
 	BUG_ON(ctx->mm != mm);
 
@@ -201,7 +204,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	 * caller of handle_userfault to release the mmap_sem.
 	 */
 	if (unlikely(ACCESS_ONCE(ctx->released)))
-		return VM_FAULT_SIGBUS;
+		goto out;
 
 	/*
 	 * Check that we can return VM_FAULT_RETRY.
@@ -227,15 +230,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 			dump_stack();
 		}
 #endif
-		return VM_FAULT_SIGBUS;
+		goto out;
 	}
 
 	/*
 	 * Handle nowait, not much to do other than tell it to retry
 	 * and wait.
 	 */
+	ret = VM_FAULT_RETRY;
 	if (flags & FAULT_FLAG_RETRY_NOWAIT)
-		return VM_FAULT_RETRY;
+		goto out;
 
 	/* take the reference before dropping the mmap_sem */
 	userfaultfd_ctx_get(ctx);
@@ -255,21 +259,23 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	 * through poll/read().
 	 */
 	__add_wait_queue(&ctx->fault_wqh, &uwq.wq);
-	for (;;) {
-		set_current_state(TASK_KILLABLE);
-		if (!uwq.pending || ACCESS_ONCE(ctx->released) ||
-		    fatal_signal_pending(current))
-			break;
-		spin_unlock(&ctx->fault_wqh.lock);
+	set_current_state(TASK_KILLABLE);
+	spin_unlock(&ctx->fault_wqh.lock);
 
+	if (likely(!ACCESS_ONCE(ctx->released) &&
+		   !fatal_signal_pending(current))) {
 		wake_up_poll(&ctx->fd_wqh, POLLIN);
 		schedule();
+		ret |= VM_FAULT_MAJOR;
+	}
 
+	__set_current_state(TASK_RUNNING);
+	/* see finish_wait() comment for why list_empty_careful() */
+	if (!list_empty_careful(&uwq.wq.task_list)) {
 		spin_lock(&ctx->fault_wqh.lock);
+		list_del_init(&uwq.wq.task_list);
+		spin_unlock(&ctx->fault_wqh.lock);
 	}
-	__remove_wait_queue(&ctx->fault_wqh, &uwq.wq);
-	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->fault_wqh.lock);
 
 	/*
 	 * ctx may go away after this if the userfault pseudo fd is
@@ -277,7 +283,8 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 	 */
 	userfaultfd_ctx_put(ctx);
 
-	return VM_FAULT_RETRY;
+out:
+	return ret;
 }
 
 static int userfaultfd_release(struct inode *inode, struct file *file)
@@ -391,6 +398,12 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 	case UFFD_STATE_WAIT_API:
 		return POLLERR;
 	case UFFD_STATE_RUNNING:
+		/*
+		 * poll() never guarantees that read won't block.
+		 * userfaults can be woken before they're read().
+		 */
+		if (unlikely(!(file->f_flags & O_NONBLOCK)))
+			return POLLERR;
 		spin_lock(&ctx->fault_wqh.lock);
 		ret = find_userfault(ctx, NULL);
 		spin_unlock(&ctx->fault_wqh.lock);
@@ -806,11 +819,19 @@ out:
 }
 
 /*
- * This is mostly needed to re-wakeup those userfaults that were still
- * pending when userland wake them up the first time. We don't wake
- * the pending one to avoid blocking reads to block, or non blocking
- * read to return -EAGAIN, if used with POLLIN, to avoid userland
- * doubts on why POLLIN wasn't reliable.
+ * userfaultfd_wake is needed in case a userfault is in flight by the
+ * time a UFFDIO_COPY (or other ioctl variants) completes. The page
+ * may well get mapped and the page fault if repeated wouldn't lead
+ * to a userfault anymore, but before scheduling in TASK_KILLABLE mode
+ * handle_userfault() doesn't recheck the pagetables and it doesn't
+ * serialize against UFFDIO_COPY (or other ioctl variants). Ultimately
+ * the knowledge of which pages are mapped is left to userland who is
+ * responsible for handling the race between read() userfaults and
+ * background UFFDIO_COPY (or other ioctl variants), if done by
+ * separate concurrent threads.
+ *
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
  */
 static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
 			    unsigned long arg)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

  parent reply	other threads:[~2015-05-14 17:31 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-05-14 17:30 [PATCH 00/23] userfaultfd v4 Andrea Arcangeli
2015-05-14 17:30 ` [PATCH 01/23] userfaultfd: linux/Documentation/vm/userfaultfd.txt Andrea Arcangeli
     [not found]   ` <1431624680-20153-2-git-send-email-aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-09-11  8:47     ` Michael Kerrisk (man-pages)
2015-12-04 15:50       ` Michael Kerrisk (man-pages)
     [not found]         ` <5661B62B.2020409-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2015-12-04 17:55           ` Andrea Arcangeli
2015-05-14 17:30 ` [PATCH 02/23] userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key Andrea Arcangeli
     [not found] ` <1431624680-20153-1-git-send-email-aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-05-14 17:31   ` [PATCH 03/23] userfaultfd: uAPI Andrea Arcangeli
2015-05-14 17:31   ` [PATCH 16/23] userfaultfd: allocate the userfaultfd_ctx cacheline aligned Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 04/23] userfaultfd: linux/userfaultfd_k.h Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 05/23] userfaultfd: add vm_userfaultfd_ctx to the vm_area_struct Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 06/23] userfaultfd: add VM_UFFD_MISSING and VM_UFFD_WP Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 07/23] userfaultfd: call handle_userfault() for userfaultfd_missing() faults Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 08/23] userfaultfd: teach vma_merge to merge across vma->vm_userfaultfd_ctx Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 09/23] userfaultfd: prevent khugepaged to merge if userfaultfd is armed Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 10/23] userfaultfd: add new syscall to provide memory externalization Andrea Arcangeli
2015-05-14 17:49   ` Linus Torvalds
2015-05-15 16:04     ` Andrea Arcangeli
2015-05-15 18:22       ` Linus Torvalds
     [not found]   ` <1431624680-20153-11-git-send-email-aarcange-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-06-23 19:00     ` Dave Hansen
     [not found]       ` <5589ACC3.3060401-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
2015-06-23 21:41         ` Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 11/23] userfaultfd: Rename uffd_api.bits into .features Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 12/23] userfaultfd: Rename uffd_api.bits into .features fixup Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 13/23] userfaultfd: change the read API to return a uffd_msg Andrea Arcangeli
2015-05-14 17:31 ` Andrea Arcangeli [this message]
2015-10-22 12:10   ` [PATCH 14/23] userfaultfd: wake pending userfaults Peter Zijlstra
     [not found]     ` <20151022121056.GB7520-ndre7Fmf5hadTX5a5knrm8zTDFooKrT+cvkQGrU6aU0@public.gmane.org>
2015-10-22 13:20       ` Andrea Arcangeli
2015-10-22 13:38         ` Peter Zijlstra
2015-10-22 14:18           ` Andrea Arcangeli
2015-10-22 15:15             ` Peter Zijlstra
2015-10-22 15:30               ` Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 15/23] userfaultfd: optimize read() and poll() to be O(1) Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 17/23] userfaultfd: solve the race between UFFDIO_COPY|ZEROPAGE and read Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 18/23] userfaultfd: buildsystem activation Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 19/23] userfaultfd: activate syscall Andrea Arcangeli
2015-08-11 10:07   ` [Qemu-devel] " Bharata B Rao
2015-08-11 13:48     ` Andrea Arcangeli
     [not found]       ` <20150811134826.GI4520-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-08-12  5:23         ` Bharata B Rao
     [not found]           ` <20150812052346.GC4587-xthvdsQ13ZrQT0dZR+AlfA@public.gmane.org>
2015-09-08  6:08             ` Michael Ellerman
     [not found]               ` <1441692486.14597.17.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>
2015-09-08  6:39                 ` Bharata B Rao
     [not found]                   ` <20150908063948.GB678-xthvdsQ13ZrQT0dZR+AlfA@public.gmane.org>
2015-09-08  7:14                     ` Michael Ellerman
2015-09-08 10:40                       ` Michael Ellerman
2015-09-08 12:28                         ` Dr. David Alan Gilbert
2015-09-08  8:59                   ` Dr. David Alan Gilbert
2015-09-08 10:00                     ` Bharata B Rao
2015-09-08 12:46                       ` Dr. David Alan Gilbert
2015-09-08 13:37                         ` Bharata B Rao
2015-09-08 14:13                           ` Dr. David Alan Gilbert
2015-05-14 17:31 ` [PATCH 20/23] userfaultfd: UFFDIO_COPY|UFFDIO_ZEROPAGE uAPI Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 21/23] userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 22/23] userfaultfd: avoid mmap_sem read recursion in mcopy_atomic Andrea Arcangeli
2015-05-22 20:18   ` Andrew Morton
2015-05-22 20:48     ` Andrea Arcangeli
2015-05-22 21:18       ` Andrew Morton
2015-05-23  1:04         ` Andrea Arcangeli
2015-05-14 17:31 ` [PATCH 23/23] userfaultfd: UFFDIO_COPY and UFFDIO_ZEROPAGE Andrea Arcangeli
2015-05-18 14:24 ` [PATCH 00/23] userfaultfd v4 Pavel Emelyanov
2015-05-19 21:38 ` Andrew Morton
2015-05-19 21:59   ` Richard Weinberger
2015-05-20 14:17     ` Andrea Arcangeli
2015-05-20 13:23   ` Andrea Arcangeli
2015-05-21 13:11 ` Kirill Smelkov
2015-05-21 15:52   ` Andrea Arcangeli
2015-05-22 16:35     ` Kirill Smelkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1431624680-20153-15-git-send-email-aarcange@redhat.com \
    --to=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=andreslc@google.com \
    --cc=dave.hansen@intel.com \
    --cc=dgilbert@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=kirill@shutemov.name \
    --cc=kvm@vger.kernel.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@amacapital.net \
    --cc=mgorman@suse.de \
    --cc=pbonzini@redhat.com \
    --cc=peter.huangpeng@huawei.com \
    --cc=pfeiner@google.com \
    --cc=qemu-devel@nongnu.org \
    --cc=riel@redhat.com \
    --cc=sanidhya.gatech@gmail.com \
    --cc=torvalds@linux-foundation.org \
    --cc=xemul@parallels.com \
    --cc=zhang.zhanghailiang@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).