Re: [PATCH v3 8/8] mm: handle userfaults under VMA lock

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Xu <peterx@redhat.com>
To: Suren Baghdasaryan <surenb@google.com>
Cc: akpm@linux-foundation.org, willy@infradead.org,
	hannes@cmpxchg.org, mhocko@suse.com, josef@toxicpanda.com,
	jack@suse.cz, ldufour@linux.ibm.com, laurent.dufour@fr.ibm.com,
	michel@lespinasse.org, liam.howlett@oracle.com,
	jglisse@google.com, vbabka@suse.cz, minchan@google.com,
	dave@stgolabs.net, punit.agrawal@bytedance.com,
	lstoakes@gmail.com, hdanton@sina.com, apopple@nvidia.com,
	ying.huang@intel.com, david@redhat.com, yuzhao@google.com,
	dhowells@redhat.com, hughd@google.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, pasha.tatashin@soleen.com,
	linux-mm@kvack.org, linux-fsdevel@vger.kernel.org,
	linux-kernel@vger.kernel.org, kernel-team@android.com
Subject: Re: [PATCH v3 8/8] mm: handle userfaults under VMA lock
Date: Tue, 27 Jun 2023 11:54:24 -0400	[thread overview]
Message-ID: <ZJsGMDqcYopSW8QL@x1n> (raw)
In-Reply-To: <20230627042321.1763765-9-surenb@google.com>

On Mon, Jun 26, 2023 at 09:23:21PM -0700, Suren Baghdasaryan wrote:
> Enable handle_userfault to operate under VMA lock by releasing VMA lock
> instead of mmap_lock and retrying.

This looks mostly good to me (besides the new DROP flag, of course), thanks.
Still some nitpicks below.

> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  fs/userfaultfd.c | 42 ++++++++++++++++++++++--------------------
>  mm/memory.c      |  9 ---------
>  2 files changed, 22 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 4e800bb7d2ab..b88632c404b6 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -277,17 +277,17 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
>   * hugepmd ranges.
>   */
>  static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> -					 struct vm_area_struct *vma,
> -					 unsigned long address,
> -					 unsigned long flags,
> -					 unsigned long reason)
> +					      struct vm_fault *vmf,
> +					      unsigned long reason)
>  {
> +	struct vm_area_struct *vma = vmf->vma;
>  	pte_t *ptep, pte;
>  	bool ret = true;
>  
> -	mmap_assert_locked(ctx->mm);
> +	if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> +		mmap_assert_locked(ctx->mm);

Maybe we can have a helper that asserts the proper vma-protecting lock is
held (mmap_lock for !VMA_LOCK, and the vma read lock for VMA_LOCK)?  It
basically asserts that, in this context, the vma is still safe to access.

>  
> -	ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
> +	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
>  	if (!ptep)
>  		goto out;
>  
> @@ -308,10 +308,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
>  }
>  #else
>  static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
> -					 struct vm_area_struct *vma,
> -					 unsigned long address,
> -					 unsigned long flags,
> -					 unsigned long reason)
> +					      struct vm_fault *vmf,
> +					      unsigned long reason)
>  {
>  	return false;	/* should never get here */
>  }
> @@ -325,11 +323,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
>   * threads.
>   */
>  static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
> -					 unsigned long address,
> -					 unsigned long flags,
> +					 struct vm_fault *vmf,
>  					 unsigned long reason)
>  {
>  	struct mm_struct *mm = ctx->mm;
> +	unsigned long address = vmf->address;
>  	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
> @@ -337,7 +335,8 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
>  	pte_t *pte;
>  	bool ret = true;
>  
> -	mmap_assert_locked(mm);
> +	if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> +		mmap_assert_locked(mm);

(the assert helper can also be used here)

>  
>  	pgd = pgd_offset(mm, address);
>  	if (!pgd_present(*pgd))
> @@ -445,7 +444,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
>  	 * Coredumping runs without mmap_lock so we can only check that
>  	 * the mmap_lock is held, if PF_DUMPCORE was not set.
>  	 */
> -	mmap_assert_locked(mm);
> +	if (!(vmf->flags & FAULT_FLAG_VMA_LOCK))
> +		mmap_assert_locked(mm);
>  
>  	ctx = vma->vm_userfaultfd_ctx.ctx;
>  	if (!ctx)
> @@ -561,15 +561,17 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
>  	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
>  
>  	if (!is_vm_hugetlb_page(vma))
> -		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
> -						  reason);
> +		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
>  	else
> -		must_wait = userfaultfd_huge_must_wait(ctx, vma,
> -						       vmf->address,
> -						       vmf->flags, reason);
> +		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
>  	if (is_vm_hugetlb_page(vma))
>  		hugetlb_vma_unlock_read(vma);
> -	mmap_read_unlock(mm);
> +	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> +		/* WARNING: VMA can't be used after this */
> +		vma_end_read(vma);
> +	} else
> +		mmap_read_unlock(mm);

I also think maybe we should have a helper, mm_release_fault_lock(), that
just releases the right lock depending on whether VMA_LOCK is set.  It can
also be used in the other patch, the one touching folio_lock_or_retry().

> +	vmf->flags |= FAULT_FLAG_LOCK_DROPPED;
>  
>  	if (likely(must_wait && !READ_ONCE(ctx->released))) {
>  		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
> diff --git a/mm/memory.c b/mm/memory.c
> index bdf46fdc58d6..923c1576bd14 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5316,15 +5316,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  	if (!vma_start_read(vma))
>  		goto inval;
>  
> -	/*
> -	 * Due to the possibility of userfault handler dropping mmap_lock, avoid
> -	 * it for now and fall back to page fault handling under mmap_lock.
> -	 */
> -	if (userfaultfd_armed(vma)) {
> -		vma_end_read(vma);
> -		goto inval;
> -	}
> -
>  	/* Check since vm_start/vm_end might change before we lock the VMA */
>  	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
>  		vma_end_read(vma);
> -- 
> 2.41.0.178.g377b9f9a00-goog
> 

-- 
Peter Xu

next prev parent reply	other threads:[~2023-06-27 15:55 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-27  4:23 [PATCH v3 0/8] Per-VMA lock support for swap and userfaults Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 1/8] swap: remove remnants of polling from read_swap_cache_async Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 2/8] mm: add missing VM_FAULT_RESULT_TRACE name for VM_FAULT_COMPLETED Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 3/8] mm: drop per-VMA lock in handle_mm_fault if retrying or when finished Suren Baghdasaryan
2023-06-27 15:27   ` Peter Xu
2023-06-27 16:25     ` Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 4/8] mm: replace folio_lock_or_retry with folio_lock_fault Suren Baghdasaryan
2023-06-27 15:22   ` Peter Xu
2023-06-27 16:27     ` Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 5/8] mm: make folio_lock_fault indicate the state of mmap_lock upon return Suren Baghdasaryan
2023-06-27  8:06   ` Alistair Popple
2023-06-27 16:01     ` Suren Baghdasaryan
2023-06-27 15:32   ` Peter Xu
2023-06-27 16:00     ` Suren Baghdasaryan
2023-06-27  4:23 ` [PATCH v3 6/8] mm: handle swap page faults under per-VMA lock Suren Baghdasaryan
2023-06-27 15:41   ` Peter Xu
2023-06-27 16:05     ` Suren Baghdasaryan
2023-06-27 16:24       ` Peter Xu
2023-06-27  4:23 ` [PATCH v3 7/8] mm: drop VMA lock before waiting for migration Suren Baghdasaryan
2023-06-27  8:02   ` Alistair Popple
2023-06-27 15:35     ` Suren Baghdasaryan
2023-06-27 15:49   ` Peter Xu
2023-06-27 16:23     ` Suren Baghdasaryan
2023-06-28  3:22       ` Alistair Popple
2023-06-27  4:23 ` [PATCH v3 8/8] mm: handle userfaults under VMA lock Suren Baghdasaryan
2023-06-27 15:54   ` Peter Xu [this message]
2023-06-27 16:10     ` Suren Baghdasaryan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZJsGMDqcYopSW8QL@x1n \
    --to=peterx@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=apopple@nvidia.com \
    --cc=brauner@kernel.org \
    --cc=dave@stgolabs.net \
    --cc=david@redhat.com \
    --cc=dhowells@redhat.com \
    --cc=hannes@cmpxchg.org \
    --cc=hdanton@sina.com \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=jglisse@google.com \
    --cc=josef@toxicpanda.com \
    --cc=kernel-team@android.com \
    --cc=laurent.dufour@fr.ibm.com \
    --cc=ldufour@linux.ibm.com \
    --cc=liam.howlett@oracle.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lstoakes@gmail.com \
    --cc=mhocko@suse.com \
    --cc=michel@lespinasse.org \
    --cc=minchan@google.com \
    --cc=pasha.tatashin@soleen.com \
    --cc=punit.agrawal@bytedance.com \
    --cc=surenb@google.com \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    --cc=yuzhao@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.