[PATCH 05/19] mm, mpol: Create special PROT_NONE infrastructure

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Peter Zijlstra <a.p.zijlstra@chello.nl>
To: mingo@kernel.org, riel@redhat.com, oleg@redhat.com,
	pjt@google.com, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, tglx@linutronix.de,
	Lee.Schermerhorn@hp.com
Cc: linux-kernel@vger.kernel.org, Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 05/19] mm, mpol: Create special PROT_NONE infrastructure
Date: Tue, 31 Jul 2012 21:12:09 +0200	[thread overview]
Message-ID: <20120731192808.647625186@chello.nl> (raw)
In-Reply-To: 20120731191204.540691987@chello.nl

[-- Attachment #1: numa-prot-none.patch --]
[-- Type: text/plain, Size: 9878 bytes --]

In order to facilitate a lazy -- fault driven -- migration of pages,
create a special transient PROT_NONE variant, we can then use the
'spurious' protection faults to drive our migrations from.

Pages that already had an effective PROT_NONE mapping will not
be detected to generate these 'spuriuos' faults for the simple reason
that we cannot distinguish them on their protection bits, see
pte_prot_none.

This isn't a problem since PROT_NONE (and possible PROT_WRITE with
dirty tracking) aren't used or are rare enough for us to not care
about their placement.

Suggested-by: Rik van Riel <riel@redhat.com>
Cc: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/huge_mm.h   |    3 +
 include/linux/mempolicy.h |    4 +-
 include/linux/mm.h        |   12 ++++++
 mm/huge_memory.c          |   21 +++++++++++
 mm/memory.c               |   86 ++++++++++++++++++++++++++++++++++++++++++----
 mm/mempolicy.c            |   24 ++++++++++++
 mm/mprotect.c             |   24 +++++++++---
 7 files changed, 159 insertions(+), 15 deletions(-)
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,6 +11,9 @@ extern int copy_huge_pmd(struct mm_struc
 extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			       unsigned long address, pmd_t *pmd,
 			       pmd_t orig_pmd);
+extern void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+				  unsigned long address, pmd_t *pmd,
+				  unsigned int flags, pmd_t orig_pmd);
 extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm);
 extern struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 					  unsigned long addr,
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -254,7 +254,9 @@ static inline int vma_migratable(struct 
 	return 1;
 }
 
-#else
+extern void lazy_migrate_process(struct mm_struct *mm);
+
+#else /* CONFIG_NUMA */
 
 struct mempolicy {};
 
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1046,6 +1046,9 @@ extern unsigned long move_page_tables(st
 extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long old_len, unsigned long new_len,
 			       unsigned long flags, unsigned long new_addr);
+extern void change_protection(struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end, pgprot_t newprot,
+			      int dirty_accountable);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
@@ -1495,6 +1498,15 @@ static inline pgprot_t vm_get_page_prot(
 }
 #endif
 
+static inline pgprot_t vma_prot_none(struct vm_area_struct *vma)
+{
+	/*
+	 * obtain PROT_NONE by removing READ|WRITE|EXEC privs
+	 */
+	vm_flags_t vmflags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+	return pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vmflags));
+}
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -750,6 +750,27 @@ int do_huge_pmd_anonymous_page(struct mm
 	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
 }
 
+void do_huge_pmd_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmd,
+			   unsigned int flags, pmd_t entry)
+{
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(*pmd, entry)))
+		goto out_unlock;
+
+	/* do fancy stuff */
+
+	/* change back to regular protection */
+	entry = pmd_modify(entry, vma->vm_page_prot);
+	if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+		update_mmu_cache(vma, address, entry);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+}
+
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *vma)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3415,6 +3415,71 @@ static int do_nonlinear_fault(struct mm_
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
+static bool pte_prot_none(struct vm_area_struct *vma, pte_t pte)
+{
+	/*
+	 * If we have the normal vma->vm_page_prot protections we're not a
+	 * 'special' PROT_NONE page.
+	 *
+	 * This means we cannot get 'special' PROT_NONE faults from genuine
+	 * PROT_NONE maps, nor from PROT_WRITE file maps that do dirty
+	 * tracking.
+	 *
+	 * Neither case is really interesting for our current use though so we
+	 * don't care.
+	 */
+	if (pte_same(pte, pte_modify(pte, vma->vm_page_prot)))
+		return false;
+
+	return pte_same(pte, pte_modify(pte, vma_prot_none(vma)));
+}
+
+static bool pmd_prot_none(struct vm_area_struct *vma, pmd_t pmd)
+{
+	/*
+	 * See pte_prot_none().
+	 */
+	if (pmd_same(pmd, pmd_modify(pmd, vma->vm_page_prot)))
+		return false;
+
+	return pmd_same(pmd, pmd_modify(pmd, vma_prot_none(vma)));
+}
+
+static int do_prot_none(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pmd_t *pmd,
+			unsigned int flags, pte_t entry)
+{
+	spinlock_t *ptl;
+	int ret = 0;
+
+	if (!pte_unmap_same(mm, pmd, ptep, entry))
+		goto out;
+
+	/*
+	 * Do fancy stuff...
+	 */
+
+	/*
+	 * OK, nothing to do,.. change the protection back to what it
+	 * ought to be.
+	 */
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (unlikely(!pte_same(*ptep, entry)))
+		goto unlock;
+
+	flush_cache_page(vma, address, pte_pfn(entry));
+
+	ptep_modify_prot_start(mm, address, ptep);
+	entry = pte_modify(entry, vma->vm_page_prot);
+	ptep_modify_prot_commit(mm, address, ptep, entry);
+
+	update_mmu_cache(vma, address, ptep);
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return ret;
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3453,6 +3518,9 @@ int handle_pte_fault(struct mm_struct *m
 					pte, pmd, flags, entry);
 	}
 
+	if (pte_prot_none(vma, entry))
+		return do_prot_none(mm, vma, address, pte, pmd, flags, entry);
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
@@ -3517,13 +3585,16 @@ int handle_mm_fault(struct mm_struct *mm
 							  pmd, flags);
 	} else {
 		pmd_t orig_pmd = *pmd;
-		int ret;
+		int ret = 0;
 
 		barrier();
-		if (pmd_trans_huge(orig_pmd)) {
-			if (flags & FAULT_FLAG_WRITE &&
-			    !pmd_write(orig_pmd) &&
-			    !pmd_trans_splitting(orig_pmd)) {
+		if (pmd_trans_huge(orig_pmd) && !pmd_trans_splitting(orig_pmd)) {
+			if (pmd_prot_none(vma, orig_pmd)) {
+				do_huge_pmd_prot_none(mm, vma, address, pmd,
+						      flags, orig_pmd);
+			}
+
+			if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
 				ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
 							  orig_pmd);
 				/*
@@ -3533,12 +3604,13 @@ int handle_mm_fault(struct mm_struct *mm
 				 */
 				if (unlikely(ret & VM_FAULT_OOM))
 					goto retry;
-				return ret;
 			}
-			return 0;
+
+			return ret;
 		}
 	}
 
+
 	/*
 	 * Use __pte_alloc instead of pte_alloc_map, because we can't
 	 * run pte_offset_map on the pmd, if an huge pmd could
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -565,6 +565,12 @@ static inline int check_pgd_range(struct
 	return 0;
 }
 
+static void
+change_prot_none(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+	change_protection(vma, start, end, vma_prot_none(vma), 0);
+}
+
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -1197,6 +1203,24 @@ static long do_mbind(unsigned long start
 	return err;
 }
 
+static void lazy_migrate_vma(struct vm_area_struct *vma)
+{
+	if (!vma_migratable(vma))
+		return;
+
+	change_prot_none(vma, vma->vm_start, vma->vm_end);
+}
+
+void lazy_migrate_process(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	down_read(&mm->mmap_sem);
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		lazy_migrate_vma(vma);
+	up_read(&mm->mmap_sem);
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -119,7 +119,7 @@ static inline void change_pud_range(stru
 	} while (pud++, addr = next, addr != end);
 }
 
-static void change_protection(struct vm_area_struct *vma,
+static void change_protection_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, pgprot_t newprot,
 		int dirty_accountable)
 {
@@ -141,6 +141,20 @@ static void change_protection(struct vm_
 	flush_tlb_range(vma, start, end);
 }
 
+void change_protection(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end, pgprot_t newprot,
+		       int dirty_accountable)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	mmu_notifier_invalidate_range_start(mm, start, end);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection_range(vma, start, end, newprot, dirty_accountable);
+	mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
 int
 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	unsigned long start, unsigned long end, unsigned long newflags)
@@ -213,12 +227,8 @@ mprotect_fixup(struct vm_area_struct *vm
 		dirty_accountable = 1;
 	}
 
-	mmu_notifier_invalidate_range_start(mm, start, end);
-	if (is_vm_hugetlb_page(vma))
-		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
-	else
-		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
-	mmu_notifier_invalidate_range_end(mm, start, end);
+	change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	perf_event_mmap(vma);

next prev parent reply	other threads:[~2012-07-31 19:47 UTC|newest]

Thread overview: 53+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-07-31 19:12 [PATCH 00/19] sched-numa rewrite Peter Zijlstra
2012-07-31 19:12 ` [PATCH 01/19] task_work: Remove dependency on sched.h Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 02/19] mm/mpol: Remove NUMA_INTERLEAVE_HIT Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-08-09 21:41   ` Andrea Arcangeli
2012-08-10  0:50     ` Andi Kleen
2012-07-31 19:12 ` [PATCH 03/19] mm/mpol: Make MPOL_LOCAL a real policy Peter Zijlstra
2012-07-31 20:52   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 04/19] mm, thp: Preserve pgprot across huge page split Peter Zijlstra
2012-07-31 20:53   ` Rik van Riel
2012-08-09 21:42   ` Andrea Arcangeli
2012-07-31 19:12 ` Peter Zijlstra [this message]
2012-07-31 20:55   ` [PATCH 05/19] mm, mpol: Create special PROT_NONE infrastructure Rik van Riel
2012-08-09 21:43   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 06/19] mm/mpol: Add MPOL_MF_LAZY Peter Zijlstra
2012-07-31 21:04   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 07/19] mm/mpol: Add MPOL_MF_NOOP Peter Zijlstra
2012-07-31 21:06   ` Rik van Riel
2012-08-09 21:44   ` Andrea Arcangeli
2012-10-01  9:36   ` Michael Kerrisk
2012-10-01  9:45     ` Ingo Molnar
2012-07-31 19:12 ` [PATCH 08/19] mm/mpol: Check for misplaced page Peter Zijlstra
2012-07-31 21:13   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 09/19] mm, migrate: Introduce migrate_misplaced_page() Peter Zijlstra
2012-07-31 21:16   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 10/19] mm, mpol: Use special PROT_NONE to migrate pages Peter Zijlstra
2012-07-31 21:24   ` Rik van Riel
2012-08-09 21:44   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 11/19] sched, mm: Introduce tsk_home_node() Peter Zijlstra
2012-07-31 21:30   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 12/19] mm/mpol: Make mempolicy home-node aware Peter Zijlstra
2012-07-31 21:33   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 13/19] sched: Introduce sched_feat_numa() Peter Zijlstra
2012-07-31 21:34   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 14/19] sched: Make find_busiest_queue() a method Peter Zijlstra
2012-07-31 21:34   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 15/19] sched: Implement home-node awareness Peter Zijlstra
2012-07-31 21:52   ` Rik van Riel
2012-08-09 21:51   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 16/19] sched, numa: NUMA home-node selection code Peter Zijlstra
2012-07-31 21:52   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 17/19] sched, numa: Detect big processes Peter Zijlstra
2012-07-31 21:53   ` Rik van Riel
2012-07-31 19:12 ` [PATCH 18/19] sched, numa: Per task memory placement for " Peter Zijlstra
2012-07-31 21:56   ` Rik van Riel
2012-08-08 21:35   ` Peter Zijlstra
2012-08-09 21:57   ` Andrea Arcangeli
2012-07-31 19:12 ` [PATCH 19/19] mm, numa: retry failed page migrations Peter Zijlstra
2012-08-02 20:40   ` Christoph Lameter
2012-08-08 17:17 ` [PATCH 00/19] sched-numa rewrite Andrea Arcangeli
2012-08-08 18:43   ` Rik van Riel
2012-08-17 18:08     ` Andrea Arcangeli

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120731192808.647625186@chello.nl \
    --to=a.p.zijlstra@chello.nl \
    --cc=Lee.Schermerhorn@hp.com \
    --cc=akpm@linux-foundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=pjt@google.com \
    --cc=riel@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.