From: Peter Zijlstra <peterz@infradead.org>
To: torvalds@linux-foundation.org, paulmck@linux.vnet.ibm.com,
	tglx@linutronix.de, akpm@linux-foundation.org, riel@redhat.com,
	mgorman@suse.de, oleg@redhat.com, mingo@redhat.com,
	minchan@kernel.org, kamezawa.hiroyu@jp.fujitsu.com,
	viro@zeniv.linux.org.uk, laijs@cn.fujitsu.com, dave@stgolabs.net
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Peter Zijlstra <peterz@infradead.org>
Subject: [RFC][PATCH 1/6] mm: Don't assume page-table invariance during faults
Date: Mon, 20 Oct 2014 23:56:34 +0200
Message-ID: <20141020222841.244195829@infradead.org>
In-Reply-To: <20141020215633.717315139@infradead.org>

[-- Attachment #1: peterz-mm-kill-pte-pointer.patch --]
[-- Type: text/plain, Size: 6982 bytes --]

One of the side effects of speculating on faults (without holding
mmap_sem) is that we can race with free_pgtables() and therefore we
cannot assume the page-tables will stick around.

Remove the reliance on the pte pointer.
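
For illustration only (this sketch is not part of the diff below, and the
helper name handler_sketch() is made up), the pattern the individual fault
handlers end up following is roughly this, using the same
pte_offset_map_lock()/pte_same()/pte_unmap_unlock() interfaces the patch
itself uses:

	/*
	 * Illustrative only: orig_pte is the value the caller sampled before
	 * dropping the pte mapping; re-map and re-check it under the pte lock
	 * before trusting it.
	 */
	static int handler_sketch(struct mm_struct *mm, unsigned long address,
				  pmd_t *pmd, pte_t orig_pte)
	{
		spinlock_t *ptl;
		pte_t *pte;

		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
		if (unlikely(!pte_same(*pte, orig_pte))) {
			/* Raced with another fault or a zap; bail out. */
			pte_unmap_unlock(pte, ptl);
			return 0;
		}
		/* ... do the real work against the re-validated pte ... */
		pte_unmap_unlock(pte, ptl);
		return 0;
	}

The caller side (__handle_mm_fault() in the last hunk) is the mirror image:
it samples the entry with ACCESS_ONCE(*pte), does pte_unmap() right away,
and passes the value down instead of the pointer.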

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 mm/memory.c |   76 ++++++++++++++++--------------------------------------------
 1 file changed, 21 insertions(+), 55 deletions(-)

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1933,31 +1933,6 @@ int apply_to_page_range(struct mm_struct
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	debug_dma_assert_idle(src);
@@ -2407,21 +2382,18 @@ EXPORT_SYMBOL(unmap_mapping_range);
  * as does filemap_fault().
  */
 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
 	struct page *page, *swapcache;
 	struct mem_cgroup *memcg;
 	swp_entry_t entry;
-	pte_t pte;
+	pte_t *page_table, pte;
 	int locked;
 	int exclusive = 0;
 	int ret = 0;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
@@ -2624,15 +2596,13 @@ static inline int check_stack_guard_page
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags)
 {
 	struct mem_cgroup *memcg;
 	struct page *page;
 	spinlock_t *ptl;
-	pte_t entry;
-
-	pte_unmap(page_table);
+	pte_t entry, *page_table;
 
 	/* Check if we need to add a guard page to the stack */
 	if (check_stack_guard_page(vma, address) < 0)
@@ -3031,13 +3001,12 @@ static int do_shared_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff = (((address & PAGE_MASK)
 			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
-	pte_unmap(page_table);
 	if (!(flags & FAULT_FLAG_WRITE))
 		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
 				orig_pte);
@@ -3059,16 +3028,13 @@ static int do_linear_fault(struct mm_str
  * return value.  See filemap_fault() and __lock_page_or_retry().
  */
 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		unsigned long address, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
 	pgoff_t pgoff;
 
 	flags |= FAULT_FLAG_NONLINEAR;
 
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
@@ -3103,7 +3069,7 @@ static int numa_migrate_prep(struct page
 }
 
 static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+		   unsigned long addr, pte_t pte, pmd_t *pmd)
 {
 	struct page *page = NULL;
 	spinlock_t *ptl;
@@ -3112,6 +3078,7 @@ static int do_numa_page(struct mm_struct
 	int target_nid;
 	bool migrated = false;
 	int flags = 0;
+	pte_t *ptep;
 
 	/*
 	* The "pte" at this point cannot be used safely without
@@ -3122,8 +3089,7 @@ static int do_numa_page(struct mm_struct
 	* the _PAGE_NUMA bit and it is not really expected that there
 	* would be concurrent hardware modifications to the PTE.
 	*/
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	if (unlikely(!pte_same(*ptep, pte))) {
 		pte_unmap_unlock(ptep, ptl);
 		goto out;
@@ -3195,34 +3161,32 @@ static int do_numa_page(struct mm_struct
  */
 static int handle_pte_fault(struct mm_struct *mm,
 		     struct vm_area_struct *vma, unsigned long address,
-		     pte_t *pte, pmd_t *pmd, unsigned int flags)
+		     pte_t entry, pmd_t *pmd, unsigned int flags)
 {
-	pte_t entry;
 	spinlock_t *ptl;
+	pte_t *pte;
 
-	entry = ACCESS_ONCE(*pte);
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
 				if (likely(vma->vm_ops->fault))
 					return do_linear_fault(mm, vma, address,
-						pte, pmd, flags, entry);
+						pmd, flags, entry);
 			}
 			return do_anonymous_page(mm, vma, address,
-						 pte, pmd, flags);
+						 pmd, flags);
 		}
 		if (pte_file(entry))
 			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
-					pte, pmd, flags, entry);
+					pmd, flags, entry);
 	}
 
 	if (pte_numa(entry))
-		return do_numa_page(mm, vma, address, entry, pte, pmd);
+		return do_numa_page(mm, vma, address, entry, pmd);
 
-	ptl = pte_lockptr(mm, pmd);
-	spin_lock(ptl);
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (flags & FAULT_FLAG_WRITE) {
@@ -3261,7 +3225,7 @@ static int __handle_mm_fault(struct mm_s
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	pte_t *pte;
+	pte_t *pte, entry;
 
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, flags);
@@ -3331,8 +3295,10 @@ static int __handle_mm_fault(struct mm_s
 	 * safe to run pte_offset_map().
 	 */
 	pte = pte_offset_map(pmd, address);
+	entry = ACCESS_ONCE(*pte);
+	pte_unmap(pte);
 
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	return handle_pte_fault(mm, vma, address, entry, pmd, flags);
 }
 
 /*



Thread overview: 44+ messages
2014-10-20 21:56 [RFC][PATCH 0/6] Another go at speculative page faults Peter Zijlstra
2014-10-20 21:56 ` Peter Zijlstra [this message]
2014-10-20 21:56 ` [RFC][PATCH 2/6] mm: Prepare for FAULT_FLAG_SPECULATIVE Peter Zijlstra
2014-10-20 21:56 ` [RFC][PATCH 3/6] mm: VMA sequence count Peter Zijlstra
2014-10-22 11:26   ` Kirill A. Shutemov
2014-10-22 11:39     ` Peter Zijlstra
2014-10-22 11:53       ` Kirill A. Shutemov
2014-10-22 12:15         ` Peter Zijlstra
2014-10-22 13:44           ` Peter Zijlstra
2014-10-23 12:36             ` Kirill A. Shutemov
2014-10-23 14:22               ` Peter Zijlstra
2014-10-23 15:05                 ` Kirill A. Shutemov
2014-10-20 21:56 ` [RFC][PATCH 4/6] SRCU free VMAs Peter Zijlstra
2014-10-20 23:41   ` Linus Torvalds
2014-10-21  8:07     ` Peter Zijlstra
2014-10-24 15:16       ` Christoph Lameter
2014-10-24 15:51         ` Peter Zijlstra
2014-10-24 17:08           ` Christoph Lameter
2014-10-21  8:22     ` Peter Zijlstra
2014-10-23 10:14   ` Lai Jiangshan
2014-10-23 11:03     ` Peter Zijlstra
2014-10-24  3:33       ` Lai Jiangshan
2014-10-24  7:26         ` Peter Zijlstra
2014-10-20 21:56 ` [RFC][PATCH 5/6] mm: Provide speculative fault infrastructure Peter Zijlstra
2014-10-21  8:35   ` Kirill A. Shutemov
2014-10-21 10:41     ` Peter Zijlstra
2014-10-21 19:00   ` Peter Zijlstra
2014-10-20 21:56 ` [RFC][PATCH 6/6] mm,x86: Add speculative pagefault handling Peter Zijlstra
2014-10-21  0:07 ` [RFC][PATCH 0/6] Another go at speculative page faults Andy Lutomirski
2014-10-21  8:11   ` Peter Zijlstra
2014-10-21 16:23 ` Ingo Molnar
2014-10-21 17:09   ` Kirill A. Shutemov
2014-10-21 17:56     ` Peter Zijlstra
2014-10-23 10:40       ` Lai Jiangshan
2014-10-23 11:04         ` Peter Zijlstra
2014-10-24  7:54           ` Ingo Molnar
2014-10-24 13:14             ` Peter Zijlstra
2014-10-28  5:32               ` Namhyung Kim
2014-10-21 17:25   ` Peter Zijlstra
2014-10-22 12:35     ` Ingo Molnar
2014-10-22  7:34 ` Davidlohr Bueso
2014-10-22 11:29   ` Kirill A. Shutemov
2014-10-22 11:45     ` Peter Zijlstra
2014-10-22 11:55       ` Kirill A. Shutemov
