From: Andrea Arcangeli <aarcange@redhat.com>
To: Andrew Morton <akpm@linux-foundation.org>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
qemu-devel@nongnu.org, kvm@vger.kernel.org,
linux-api@vger.kernel.org
Cc: Pavel Emelyanov <xemul@parallels.com>,
Sanidhya Kashyap <sanidhya.gatech@gmail.com>,
zhang.zhanghailiang@huawei.com,
Linus Torvalds <torvalds@linux-foundation.org>,
"Kirill A. Shutemov" <kirill@shutemov.name>,
Andres Lagar-Cavilla <andreslc@google.com>,
Dave Hansen <dave.hansen@intel.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Rik van Riel <riel@redhat.com>, Mel Gorman <mgorman@suse.de>,
Andy Lutomirski <luto@amacapital.net>,
Hugh Dickins <hughd@google.com>,
Peter Feiner <pfeiner@google.com>,
"Dr. David Alan Gilbert" <dgilbert@redhat.com>,
Johannes Weiner <hannes@cmpxchg.org>,
"Huangpeng (Peter)" <peter.huangpeng@huawei.com>
Subject: [PATCH 15/23] userfaultfd: optimize read() and poll() to be O(1)
Date: Thu, 14 May 2015 19:31:12 +0200
Message-ID: <1431624680-20153-16-git-send-email-aarcange@redhat.com>
In-Reply-To: <1431624680-20153-1-git-send-email-aarcange@redhat.com>

This makes read() O(1), and poll(), which was already O(1), becomes
lockless. Pending userfaults now sit on their own fault_pending_wqh
waitqueue: read() picks the oldest one straight from the tail of that
list and refiles it to fault_wqh, instead of walking a single list and
flagging entries as no longer pending.
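
To illustrate the refile scheme outside the kernel, here is a minimal
userspace sketch (hypothetical names and types, not the kernel code):
faults are inserted LIFO at the head of the pending list, read() pops
the tail in O(1) so userfaults are still served FIFO, and the popped
entry is moved to a second list instead of being marked with a flag.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct uwq {
    unsigned long address;          /* the faulting address in the msg */
    struct uwq *prev, *next;
};

/* circular doubly-linked list with a sentinel head, like list_head */
struct wqh { struct uwq head; };

static void wqh_init(struct wqh *w)
{
    w->head.prev = w->head.next = &w->head;
}

static bool wqh_empty(struct wqh *w)    /* waitqueue_active() analogue */
{
    return w->head.next == &w->head;
}

static void add_head(struct wqh *w, struct uwq *e)  /* LIFO insert */
{
    e->prev = &w->head;
    e->next = w->head.next;
    w->head.next->prev = e;
    w->head.next = e;
}

static void del(struct uwq *e)  /* list_del(), never list_del_init() */
{
    e->prev->next = e->next;
    e->next->prev = e->prev;
}

/* the read() core: O(1) whatever the queue length */
static struct uwq *read_one(struct wqh *pending, struct wqh *seen)
{
    struct uwq *e;

    if (wqh_empty(pending))
        return NULL;
    e = pending->head.prev;     /* tail = oldest fault = FIFO service */
    del(e);                     /* refile instead of clearing a flag */
    add_head(seen, e);
    return e;
}

int main(void)
{
    struct wqh pending, seen;
    struct uwq a = { .address = 0x1000 }, b = { .address = 0x2000 };
    struct uwq *e;

    wqh_init(&pending);
    wqh_init(&seen);
    add_head(&pending, &a);     /* first fault */
    add_head(&pending, &b);     /* second fault */
    while ((e = read_one(&pending, &seen)))
        printf("read userfault at 0x%lx\n", e->address);
    /* prints 0x1000 then 0x2000: FIFO despite LIFO insertion */
    return 0;
}

Because the refile is a plain list_del + list_add and never
list_del_init, an entry's prev/next pointers can never point to self
while it moves between the two lists, which is what keeps the lockless
list_empty_careful() check in handle_userfault() safe.
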
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
fs/userfaultfd.c | 172 +++++++++++++++++++++++++++++++------------------------
1 file changed, 98 insertions(+), 74 deletions(-)
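
A note on the lockless poll() in the hunk below: the explicit smp_mb()
pairs with the barrier implied by set_current_state() in
handle_userfault(). Each side first publishes its own write (the
add_wait_queue done by __pollwait on one side, the fault_pending_wqh
list_add on the other) and only then reads the other side's state, so
at least one of the two must observe the other and no event can be
lost. A minimal sketch of the pairing, with C11 fences and hypothetical
names standing in for the kernel primitives (illustrative only):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* stands in for "a uwq was list_add'ed to fault_pending_wqh" */
static atomic_bool fault_queued;
/* stands in for "a poller was add_wait_queue'd to fd_wqh" */
static atomic_bool poller_registered;

/* fault side: publish the queued fault, then look for a poller */
static bool fault_side(void)
{
    atomic_store_explicit(&fault_queued, true, memory_order_relaxed);
    /* the barrier implied by set_current_state() */
    atomic_thread_fence(memory_order_seq_cst);
    return atomic_load_explicit(&poller_registered, memory_order_relaxed);
}

/* poll side: register on the waitqueue, then check for pending faults */
static bool poll_side(void)
{
    atomic_store_explicit(&poller_registered, true, memory_order_relaxed);
    /* the explicit smp_mb() in userfaultfd_poll() */
    atomic_thread_fence(memory_order_seq_cst);
    return atomic_load_explicit(&fault_queued, memory_order_relaxed);
}

static void *poller(void *arg)
{
    *(bool *)arg = poll_side();
    return NULL;
}

int main(void)
{
    pthread_t t;
    bool poll_saw_fault, fault_saw_poller;

    pthread_create(&t, NULL, poller, &poll_saw_fault);
    fault_saw_poller = fault_side();
    pthread_join(t, NULL);
    /* never both false: the fence pairing rules out a lost event */
    printf("poll saw fault: %d, fault saw poller: %d\n",
           poll_saw_fault, fault_saw_poller);
    return 0;
}

The two seq_cst fences forbid both loads returning false: either
poll_side() sees the queued fault and reports POLLIN, or fault_side()
sees the registered poller and the wakeup path finds it. Drop either
fence and both reads could complete before the stores become visible,
losing the event.
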
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 50edbd8..3d26f41 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -35,7 +35,9 @@ enum userfaultfd_state {
struct userfaultfd_ctx {
/* pseudo fd refcounting */
atomic_t refcount;
- /* waitqueue head for the userfaultfd page faults */
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
wait_queue_head_t fault_wqh;
/* waitqueue head for the pseudo fd to wakeup poll/read */
wait_queue_head_t fd_wqh;
@@ -52,11 +54,6 @@ struct userfaultfd_ctx {
struct userfaultfd_wait_queue {
struct uffd_msg msg;
wait_queue_t wq;
- /*
- * Only relevant when queued in fault_wqh and only used by the
- * read operation to avoid reading the same userfault twice.
- */
- bool pending;
struct userfaultfd_ctx *ctx;
};
@@ -250,17 +247,21 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
uwq.msg = userfault_msg(address, flags, reason);
- uwq.pending = true;
uwq.ctx = ctx;
- spin_lock(&ctx->fault_wqh.lock);
+ spin_lock(&ctx->fault_pending_wqh.lock);
/*
* After the __add_wait_queue the uwq is visible to userland
* through poll/read().
*/
- __add_wait_queue(&ctx->fault_wqh, &uwq.wq);
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock from happening before the list_add
+ * in __add_wait_queue.
+ */
set_current_state(TASK_KILLABLE);
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
if (likely(!ACCESS_ONCE(ctx->released) &&
!fatal_signal_pending(current))) {
@@ -270,11 +271,28 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
}
__set_current_state(TASK_RUNNING);
- /* see finish_wait() comment for why list_empty_careful() */
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(); however, because we never run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let either of the two pointers point to
+ * self. So list_empty_careful won't risk seeing both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function, and there we don't re-list_add,
+ * so it's fine not to take the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
if (!list_empty_careful(&uwq.wq.task_list)) {
- spin_lock(&ctx->fault_wqh.lock);
- list_del_init(&uwq.wq.task_list);
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need for list_del_init(): the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
}
/*
@@ -332,59 +350,38 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
up_write(&mm->mmap_sem);
/*
- * After no new page faults can wait on this fault_wqh, flush
+ * After no new page faults can wait on this fault_*wqh, flush
* the last page faults that may have been already waiting on
- * the fault_wqh.
+ * the fault_*wqh.
*/
- spin_lock(&ctx->fault_wqh.lock);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
wake_up_poll(&ctx->fd_wqh, POLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
}
-/* fault_wqh.lock must be hold by the caller */
-static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wait_queue **uwq)
+/* fault_pending_wqh.lock must be held by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
{
wait_queue_t *wq;
- struct userfaultfd_wait_queue *_uwq;
- unsigned int ret = 0;
-
- VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock));
+ struct userfaultfd_wait_queue *uwq;
- list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
- _uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
- if (_uwq->pending) {
- ret = POLLIN;
- if (!uwq)
- /*
- * If there's at least a pending and
- * we don't care which one it is,
- * break immediately and leverage the
- * efficiency of the LIFO walk.
- */
- break;
- /*
- * If we need to find which one was pending we
- * keep walking until we find the first not
- * pending one, so we read() them in FIFO order.
- */
- *uwq = _uwq;
- } else
- /*
- * break the loop at the first not pending
- * one, there cannot be pending userfaults
- * after the first not pending one, because
- * all new pending ones are inserted at the
- * head and we walk it in LIFO.
- */
- break;
- }
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
- return ret;
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
}
static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
@@ -404,9 +401,20 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
*/
if (unlikely(!(file->f_flags & O_NONBLOCK)))
return POLLERR;
- spin_lock(&ctx->fault_wqh.lock);
- ret = find_userfault(ctx, NULL);
- spin_unlock(&ctx->fault_wqh.lock);
+ /*
+ * Lockless access to see if there are pending faults.
+ * __pollwait's last action is the add_wait_queue, but
+ * the spin_unlock would allow the waitqueue_active read
+ * to pass above the actual list_add inside the
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
return ret;
default:
BUG();
@@ -418,27 +426,34 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
{
ssize_t ret;
DECLARE_WAITQUEUE(wait, current);
- struct userfaultfd_wait_queue *uwq = NULL;
+ struct userfaultfd_wait_queue *uwq;
- /* always take the fd_wqh lock before the fault_wqh lock */
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
spin_lock(&ctx->fd_wqh.lock);
__add_wait_queue(&ctx->fd_wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- spin_lock(&ctx->fault_wqh.lock);
- if (find_userfault(ctx, &uwq)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
/*
- * The fault_wqh.lock prevents the uwq to
- * disappear from under us.
+ * The fault_pending_wqh.lock prevents the uwq
+ * from disappearing from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh; it's not
+ * pending anymore after we read it.
*/
- uwq->pending = false;
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
/* careful to always initialize msg if ret == 0 */
*msg = uwq->msg;
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
ret = 0;
break;
}
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -497,16 +512,21 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
start = range->start;
end = range->start + range->len;
- spin_lock(&ctx->fault_wqh.lock);
+ spin_lock(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
- __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
- spin_unlock(&ctx->fault_wqh.lock);
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
}
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
struct userfaultfd_wake_range *range)
{
- if (waitqueue_active(&ctx->fault_wqh))
+ if (waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh))
__wake_userfault(ctx, range);
}
@@ -932,14 +952,17 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
struct userfaultfd_wait_queue *uwq;
unsigned long pending = 0, total = 0;
- spin_lock(&ctx->fault_wqh.lock);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ pending++;
+ total++;
+ }
list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
- if (uwq->pending)
- pending++;
total++;
}
- spin_unlock(&ctx->fault_wqh.lock);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
/*
* If more protocols will be added, there will be all shown
@@ -999,6 +1022,7 @@ static struct file *userfaultfd_file_create(int flags)
goto out;
atomic_set(&ctx->refcount, 1);
+ init_waitqueue_head(&ctx->fault_pending_wqh);
init_waitqueue_head(&ctx->fault_wqh);
init_waitqueue_head(&ctx->fd_wqh);
ctx->flags = flags;
--