From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
To: "Kirill A . Shutemov" <kirill@shutemov.name>,
Peter Zijlstra <peterz@infradead.org>
Cc: Linux MM <linux-mm@kvack.org>, Michal Hocko <mhocko@suse.cz>
Subject: [RFC PATCH v2 2/7] mm: Prepare for FAULT_FLAG_SPECULATIVE
Date: Fri, 18 Nov 2016 12:08:46 +0100
Message-ID: <a07d2d6952e6904ce6bbabfd549f397f3c1c631d.1479465699.git.ldufour@linux.vnet.ibm.com>
In-Reply-To: <cover.1479465699.git.ldufour@linux.vnet.ibm.com>
From: Peter Zijlstra <peterz@infradead.org>
When handling faults speculatively (without holding mmap_sem), we need
to validate that the VMA against which we loaded pages is still valid
by the time we are ready to install the new PTE.

Therefore, replace the pte_offset_map_lock() calls that (re)take the
PTL with pte_map_lock(), which can fail if we find the VMA has changed
since the fault started.
Instead of passing around an ever-growing list of function arguments,
replace the lot with a single structure so the context can change
without endless function-signature churn.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[port to 4.8 kernel]
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
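Note: in this patch pte_map_lock() is only a wrapper that always
succeeds; a variant that can actually fail only makes sense once the
speculative infrastructure arrives later in the series. As a rough
sketch only, assuming the per-VMA sequence count introduced by the
later "VMA sequence count" patch (the vm_sequence field and the
fe->sequence snapshot below are placeholder names, not the final
interface), a failing version could look like this:

static bool pte_map_lock(struct fault_env *fe)
{
	if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
		fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd,
					      fe->address, &fe->ptl);
		return true;
	}

	/*
	 * Speculative path: the VMA may have been modified (or even
	 * unmapped) while we walked the page tables without mmap_sem.
	 * Bail out if its sequence count moved since the snapshot taken
	 * at the start of the fault, and re-check once the PTL is held.
	 */
	if (read_seqcount_retry(&fe->vma->vm_sequence, fe->sequence))
		return false;

	fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd,
				      fe->address, &fe->ptl);

	if (read_seqcount_retry(&fe->vma->vm_sequence, fe->sequence)) {
		pte_unmap_unlock(fe->pte, fe->ptl);
		return false;
	}
	return true;
}

Callers then turn a false return into VM_FAULT_RETRY after undoing any
charge or page reference they took, which is exactly what the hunks
below prepare for.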
include/linux/mm.h | 1 +
mm/memory.c | 73 +++++++++++++++++++++++++++++++++++++++---------------
2 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef815b9cd426..e8e9e3dc4a0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -280,6 +280,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+#define FAULT_FLAG_SPECULATIVE 0x200 /* Speculative fault, not holding mmap_sem */
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
diff --git a/mm/memory.c b/mm/memory.c
index 53e0abb35c2e..08922b34575d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2095,6 +2095,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
return VM_FAULT_WRITE;
}
+static bool pte_map_lock(struct fault_env *fe)
+{
+ fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
+ return true;
+}
+
/*
* Handle the case of a page which we actually need to copy to a new page.
*
@@ -2122,6 +2128,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
const unsigned long mmun_start = fe->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+ int ret = VM_FAULT_OOM;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2148,7 +2155,11 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ mem_cgroup_cancel_charge(new_page, memcg, false);
+ ret = VM_FAULT_RETRY;
+ goto oom_free_new;
+ }
if (likely(pte_same(*fe->pte, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
@@ -2236,7 +2247,7 @@ oom_free_new:
oom:
if (old_page)
put_page(old_page);
- return VM_FAULT_OOM;
+ return ret;
}
/*
@@ -2261,8 +2272,12 @@ static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
if (ret & VM_FAULT_ERROR)
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+
+ if (!pte_map_lock(fe)) {
+ ret |= VM_FAULT_RETRY;
+ return ret;
+ }
+
/*
* We might have raced with another page fault while we
* released the pte_offset_map_lock.
@@ -2300,8 +2315,11 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ unlock_page(old_page);
+ put_page(old_page);
+ return VM_FAULT_RETRY;
+ }
if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2365,8 +2383,11 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
get_page(old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ unlock_page(old_page);
+ put_page(old_page);
+ return VM_FAULT_RETRY;
+ }
if (!pte_same(*fe->pte, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2522,8 +2543,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ return VM_FAULT_RETRY;
+ }
if (likely(pte_same(*fe->pte, orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2579,8 +2602,11 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ ret = VM_FAULT_RETRY;
+ mem_cgroup_cancel_charge(page, memcg, false);
+ goto out_page;
+ }
if (unlikely(!pte_same(*fe->pte, orig_pte)))
goto out_nomap;
@@ -2712,6 +2738,7 @@ static int do_anonymous_page(struct fault_env *fe)
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
+ int ret = 0;
/* File mapping without ->vm_ops ? */
if (vma->vm_flags & VM_SHARED)
@@ -2743,8 +2770,8 @@ static int do_anonymous_page(struct fault_env *fe)
!mm_forbids_zeropage(vma->vm_mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(fe))
+ return VM_FAULT_RETRY;
if (!pte_none(*fe->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
@@ -2776,8 +2803,12 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(fe)) {
+ /* XXX: should be factorized */
+ mem_cgroup_cancel_charge(page, memcg, false);
+ put_page(page);
+ return VM_FAULT_RETRY;
+ }
if (!pte_none(*fe->pte))
goto release;
@@ -2800,7 +2831,7 @@ setpte:
update_mmu_cache(vma, fe->address, fe->pte);
unlock:
pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
+ return ret;
release:
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
@@ -2842,7 +2873,7 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
put_page(vmf.page);
- return VM_FAULT_HWPOISON;
+ return ret | VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -2889,8 +2920,9 @@ map_pte:
if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ if (!pte_map_lock(fe))
+ return VM_FAULT_RETRY;
+
return 0;
}
@@ -3152,6 +3184,7 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
+ /* XXX: is a call to pte_map_lock(fe) required here ? */
ret = do_fault_around(fe, pgoff);
if (ret)
return ret;
--
2.7.4