* mincore and transparent huge pages
@ 2010-03-23 14:34 Johannes Weiner
2010-03-23 14:34 ` [patch 1/5] mincore: cleanups Johannes Weiner
` (5 more replies)
0 siblings, 6 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:34 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
Hi,
I wanted to make mincore() handle huge pmds natively over the weekend
but I chose to beef up the code a bit first (1-4).
Andrew, 1-4 may have merit without transparent huge pages, so they
could go in independently. They are based on Andrea's patches but the
only huge-page-specific thing in them is the split_huge_page_vma() call, so it
would be easy to rebase (I can do that).
Below is also an ugly hack I used to test transparent huge pages on my
32bit netbook. The VM_ flags, oh, the VM_ flags!
Hannes
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982..98391db 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
* split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b016..cc62f48 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d26f1cf..59c4fdb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -95,6 +95,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
static inline int pte_write(pte_t pte)
{
return pte_flags(pte) & _PAGE_RW;
@@ -143,6 +148,18 @@ static inline int pmd_large(pmd_t pte)
(_PAGE_PSE | _PAGE_PRESENT);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_PSE;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
@@ -217,6 +234,50 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -525,6 +586,14 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
return res;
}
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
+}
+
static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , pte_t pte)
{
@@ -612,6 +681,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
pte_update(mm, addr, ptep);
}
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+ pmd_update(mm, addr, pmdp);
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+ pmd_update(mm, addr, pmdp);
+}
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index b8b801d..5962bac 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -182,110 +182,6 @@ extern void cleanup_highmap(void);
#define __HAVE_ARCH_PTE_SAME
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
-static inline int pmd_trans_huge(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_PSE;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
-
-#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
-extern int pmdp_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp,
- pmd_t entry, int dirty);
-
-#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
-extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
-
-
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMD_WRITE
-static inline int pmd_write(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_RW;
-}
-
-#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
-static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmdp)
-{
- pmd_t pmd = native_pmdp_get_and_clear(pmdp);
- pmd_update(mm, addr, pmdp);
- return pmd;
-}
-
-#define __HAVE_ARCH_PMDP_SET_WRPROTECT
-static inline void pmdp_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pmd_t *pmdp)
-{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd);
- pmd_update(mm, addr, pmdp);
-}
-
-static inline int pmd_young(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_ACCESSED;
-}
-
-static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
-{
- pmdval_t v = native_pmd_val(pmd);
-
- return native_make_pmd(v | set);
-}
-
-static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
-{
- pmdval_t v = native_pmd_val(pmd);
-
- return native_make_pmd(v & ~clear);
-}
-
-static inline pmd_t pmd_mkold(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_ACCESSED);
-}
-
-static inline pmd_t pmd_wrprotect(pmd_t pmd)
-{
- return pmd_clear_flags(pmd, _PAGE_RW);
-}
-
-static inline pmd_t pmd_mkdirty(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_DIRTY);
-}
-
-static inline pmd_t pmd_mkhuge(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_PSE);
-}
-
-static inline pmd_t pmd_mkyoung(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_ACCESSED);
-}
-
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_RW);
-}
-
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d360616..d4470f6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -351,7 +351,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
if (pmd_young(*pmdp))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
- (unsigned long *) &pmdp->pmd);
+ (unsigned long *)pmdp);
if (ret)
pmd_update(vma->vm_mm, addr, pmdp);
@@ -393,7 +393,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
int set;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)&pmdp->pmd);
+ (unsigned long *)pmdp);
if (set) {
pmd_update(vma->vm_mm, address, pmdp);
/* need tlb flush only to serialize against gup-fast */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 85fa92a..b6aec57 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -106,10 +106,11 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
+#ifdef CONFIG_KSM
+#error no more VM_ flags
#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
-#if BITS_PER_LONG > 32
-#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */
#endif
+#define VM_HUGEPAGE 0x80000000 /* MADV_HUGEPAGE marked this vma */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff --git a/mm/Kconfig b/mm/Kconfig
index 2a771ef..89a7fe9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -290,7 +290,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage support" if EMBEDDED
- depends on X86_64
+ depends on X86
default y
help
Transparent Hugepages allows the kernel to use huge pages and
* [patch 1/5] mincore: cleanups
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
@ 2010-03-23 14:34 ` Johannes Weiner
2010-03-23 14:34 ` [patch 2/5] mincore: break do_mincore() into logical pieces Johannes Weiner
` (4 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:34 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
This fixes some minor issues that bugged me while going over the code:
o adjust argument order of do_mincore() to match the syscall
o simplify range length calculation
o drop superfluous shift in huge tlb calculation, address is page aligned
o drop dead nr_huge calculation
o check pte_none() before pte_present()
o comment and whitespace fixes
No semantic changes intended.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/mincore.c | 76 ++++++++++++++++++++-------------------------------------
1 files changed, 27 insertions(+), 49 deletions(-)
diff --git a/mm/mincore.c b/mm/mincore.c
index fe360ab..c35f8f0 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -54,7 +54,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* all the arguments, we hold the mmap semaphore: we should
* just return the amount of info we're asked for.
*/
-static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
+static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
{
pgd_t *pgd;
pud_t *pud;
@@ -64,35 +64,29 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
unsigned long nr;
int i;
pgoff_t pgoff;
- struct vm_area_struct *vma = find_vma(current->mm, addr);
+ struct vm_area_struct *vma;
- /*
- * find_vma() didn't find anything above us, or we're
- * in an unmapped hole in the address space: ENOMEM.
- */
+ vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
+ nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
+
#ifdef CONFIG_HUGETLB_PAGE
if (is_vm_hugetlb_page(vma)) {
struct hstate *h;
- unsigned long nr_huge;
- unsigned char present;
i = 0;
- nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
h = hstate_vma(vma);
- nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
- - (addr >> huge_page_shift(h)) + 1;
- nr_huge = min(nr_huge,
- (vma->vm_end - addr) >> huge_page_shift(h));
while (1) {
- /* hugepage always in RAM for now,
- * but generally it needs to be check */
+ unsigned char present;
+ /*
+ * Huge pages are always in RAM for now, but
+ * theoretically it needs to be checked.
+ */
ptep = huge_pte_offset(current->mm,
addr & huge_page_mask(h));
- present = !!(ptep &&
- !huge_pte_none(huge_ptep_get(ptep)));
+ present = ptep && !huge_pte_none(huge_ptep_get(ptep));
while (1) {
vec[i++] = present;
addr += PAGE_SIZE;
@@ -100,8 +94,7 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
if (i == nr)
return nr;
/* check hugepage border */
- if (!((addr & ~huge_page_mask(h))
- >> PAGE_SHIFT))
+ if (!(addr & ~huge_page_mask(h)))
break;
}
}
@@ -113,17 +106,7 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
* Calculate how many pages there are left in the last level of the
* PTE array for our address.
*/
- nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1));
-
- /*
- * Don't overrun this vma
- */
- nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT);
-
- /*
- * Don't return more than the caller asked for
- */
- nr = min(nr, pages);
+ nr = min(nr, PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)));
pgd = pgd_offset(vma->vm_mm, addr);
if (pgd_none_or_clear_bad(pgd))
@@ -138,43 +121,38 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
- unsigned char present;
pte_t pte = *ptep;
- if (pte_present(pte)) {
- present = 1;
-
- } else if (pte_none(pte)) {
+ if (pte_none(pte)) {
if (vma->vm_file) {
pgoff = linear_page_index(vma, addr);
- present = mincore_page(vma->vm_file->f_mapping,
- pgoff);
+ vec[i] = mincore_page(vma->vm_file->f_mapping,
+ pgoff);
} else
- present = 0;
-
- } else if (pte_file(pte)) {
+ vec[i] = 0;
+ } else if (pte_present(pte))
+ vec[i] = 1;
+ else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
- present = mincore_page(vma->vm_file->f_mapping, pgoff);
-
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
+
if (is_migration_entry(entry)) {
/* migration entries are always uptodate */
- present = 1;
+ vec[i] = 1;
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
- present = mincore_page(&swapper_space, pgoff);
+ vec[i] = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
- present = 1;
+ vec[i] = 1;
#endif
}
}
-
- vec[i] = present;
}
- pte_unmap_unlock(ptep-1, ptl);
+ pte_unmap_unlock(ptep - 1, ptl);
return nr;
@@ -248,7 +226,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* the temporary buffer size.
*/
down_read(&current->mm->mmap_sem);
- retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
+ retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
up_read(&current->mm->mmap_sem);
if (retval <= 0)
--
1.7.0.2
* [patch 2/5] mincore: break do_mincore() into logical pieces
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
2010-03-23 14:34 ` [patch 1/5] mincore: cleanups Johannes Weiner
@ 2010-03-23 14:34 ` Johannes Weiner
2010-03-23 14:35 ` [patch 3/5] mincore: pass ranges as start,end address pairs Johannes Weiner
` (3 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:34 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
Split out functions to handle hugetlb ranges, pte ranges and unmapped
ranges, to improve readability but also to prepare the file structure
for nested page table walks.
No semantic changes intended.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/mincore.c | 171 +++++++++++++++++++++++++++++++++-------------------------
1 files changed, 97 insertions(+), 74 deletions(-)
diff --git a/mm/mincore.c b/mm/mincore.c
index c35f8f0..ba80bb8 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,6 +19,42 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long nr,
+ unsigned char *vec)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ struct hstate *h;
+ int i;
+
+ i = 0;
+ h = hstate_vma(vma);
+ while (1) {
+ unsigned char present;
+ pte_t *ptep;
+ /*
+ * Huge pages are always in RAM for now, but
+ * theoretically it needs to be checked.
+ */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = ptep && !huge_pte_none(huge_ptep_get(ptep));
+ while (1) {
+ vec[i++] = present;
+ addr += PAGE_SIZE;
+ /* reach buffer limit */
+ if (i == nr)
+ return;
+ /* check hugepage border */
+ if (!(addr & ~huge_page_mask(h)))
+ break;
+ }
+ }
+#else
+ BUG();
+#endif
+}
+
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
@@ -49,6 +85,64 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
+static void mincore_unmapped_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long nr,
+ unsigned char *vec)
+{
+ int i;
+
+ if (vma->vm_file) {
+ pgoff_t pgoff;
+
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+}
+
+static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long nr,
+ unsigned char *vec)
+{
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int i;
+
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
+ pte_t pte = *ptep;
+ pgoff_t pgoff;
+
+ if (pte_none(pte))
+ mincore_unmapped_range(vma, addr, 1, vec);
+ else if (pte_present(pte))
+ vec[i] = 1;
+ else if (pte_file(pte)) {
+ pgoff = pte_to_pgoff(pte);
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else { /* pte is a swap entry */
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (is_migration_entry(entry)) {
+ /* migration entries are always uptodate */
+ vec[i] = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ pgoff = entry.val;
+ vec[i] = mincore_page(&swapper_space, pgoff);
+#else
+ WARN_ON(1);
+ vec[i] = 1;
+#endif
+ }
+ }
+ }
+ pte_unmap_unlock(ptep - 1, ptl);
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -59,11 +153,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- pte_t *ptep;
- spinlock_t *ptl;
unsigned long nr;
- int i;
- pgoff_t pgoff;
struct vm_area_struct *vma;
vma = find_vma(current->mm, addr);
@@ -72,35 +162,10 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
-#ifdef CONFIG_HUGETLB_PAGE
if (is_vm_hugetlb_page(vma)) {
- struct hstate *h;
-
- i = 0;
- h = hstate_vma(vma);
- while (1) {
- unsigned char present;
- /*
- * Huge pages are always in RAM for now, but
- * theoretically it needs to be checked.
- */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = ptep && !huge_pte_none(huge_ptep_get(ptep));
- while (1) {
- vec[i++] = present;
- addr += PAGE_SIZE;
- /* reach buffer limit */
- if (i == nr)
- return nr;
- /* check hugepage border */
- if (!(addr & ~huge_page_mask(h)))
- break;
- }
- }
+ mincore_hugetlb_page_range(vma, addr, nr, vec);
return nr;
}
-#endif
/*
* Calculate how many pages there are left in the last level of the
@@ -119,53 +184,11 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
if (pmd_none_or_clear_bad(pmd))
goto none_mapped;
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
- pte_t pte = *ptep;
-
- if (pte_none(pte)) {
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- vec[i] = mincore_page(vma->vm_file->f_mapping,
- pgoff);
- } else
- vec[i] = 0;
- } else if (pte_present(pte))
- vec[i] = 1;
- else if (pte_file(pte)) {
- pgoff = pte_to_pgoff(pte);
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else { /* pte is a swap entry */
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (is_migration_entry(entry)) {
- /* migration entries are always uptodate */
- vec[i] = 1;
- } else {
-#ifdef CONFIG_SWAP
- pgoff = entry.val;
- vec[i] = mincore_page(&swapper_space, pgoff);
-#else
- WARN_ON(1);
- vec[i] = 1;
-#endif
- }
- }
- }
- pte_unmap_unlock(ptep - 1, ptl);
-
+ mincore_pte_range(vma, pmd, addr, nr, vec);
return nr;
none_mapped:
- if (vma->vm_file) {
- pgoff = linear_page_index(vma, addr);
- for (i = 0; i < nr; i++, pgoff++)
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else {
- for (i = 0; i < nr; i++)
- vec[i] = 0;
- }
-
+ mincore_unmapped_range(vma, addr, nr, vec);
return nr;
}
--
1.7.0.2
* [patch 3/5] mincore: pass ranges as start,end address pairs
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
2010-03-23 14:34 ` [patch 1/5] mincore: cleanups Johannes Weiner
2010-03-23 14:34 ` [patch 2/5] mincore: break do_mincore() into logical pieces Johannes Weiner
@ 2010-03-23 14:35 ` Johannes Weiner
2010-03-23 14:35 ` [patch 4/5] mincore: do nested page table walks Johannes Weiner
` (2 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:35 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
Instead of passing a start address and a number of pages into the
helper functions, convert them to use a start and an end address.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/mincore.c | 57 +++++++++++++++++++++++++++------------------------------
1 files changed, 27 insertions(+), 30 deletions(-)
diff --git a/mm/mincore.c b/mm/mincore.c
index ba80bb8..eb50daa 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -20,14 +20,12 @@
#include <asm/pgtable.h>
static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long nr,
+ unsigned long addr, unsigned long end,
unsigned char *vec)
{
#ifdef CONFIG_HUGETLB_PAGE
struct hstate *h;
- int i;
- i = 0;
h = hstate_vma(vma);
while (1) {
unsigned char present;
@@ -40,10 +38,10 @@ static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
addr & huge_page_mask(h));
present = ptep && !huge_pte_none(huge_ptep_get(ptep));
while (1) {
- vec[i++] = present;
+ *vec = present;
+ vec++;
addr += PAGE_SIZE;
- /* reach buffer limit */
- if (i == nr)
+ if (addr == end)
return;
/* check hugepage border */
if (!(addr & ~huge_page_mask(h)))
@@ -86,9 +84,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
}
static void mincore_unmapped_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long nr,
+ unsigned long addr, unsigned long end,
unsigned char *vec)
{
+ unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
if (vma->vm_file) {
@@ -104,42 +103,44 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
}
static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long nr,
+ unsigned long addr, unsigned long end,
unsigned char *vec)
{
+ unsigned long next;
spinlock_t *ptl;
pte_t *ptep;
- int i;
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) {
+ do {
pte_t pte = *ptep;
pgoff_t pgoff;
+ next = addr + PAGE_SIZE;
if (pte_none(pte))
- mincore_unmapped_range(vma, addr, 1, vec);
+ mincore_unmapped_range(vma, addr, next, vec);
else if (pte_present(pte))
- vec[i] = 1;
+ *vec = 1;
else if (pte_file(pte)) {
pgoff = pte_to_pgoff(pte);
- vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
} else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (is_migration_entry(entry)) {
/* migration entries are always uptodate */
- vec[i] = 1;
+ *vec = 1;
} else {
#ifdef CONFIG_SWAP
pgoff = entry.val;
- vec[i] = mincore_page(&swapper_space, pgoff);
+ *vec = mincore_page(&swapper_space, pgoff);
#else
WARN_ON(1);
- vec[i] = 1;
+ *vec = 1;
#endif
}
}
- }
+ vec++;
+ } while (ptep++, addr = next, addr != end);
pte_unmap_unlock(ptep - 1, ptl);
}
@@ -153,25 +154,21 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- unsigned long nr;
struct vm_area_struct *vma;
+ unsigned long end;
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
+ end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
if (is_vm_hugetlb_page(vma)) {
- mincore_hugetlb_page_range(vma, addr, nr, vec);
- return nr;
+ mincore_hugetlb_page_range(vma, addr, end, vec);
+ return (end - addr) >> PAGE_SHIFT;
}
- /*
- * Calculate how many pages there are left in the last level of the
- * PTE array for our address.
- */
- nr = min(nr, PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)));
+ end = pmd_addr_end(addr, end);
pgd = pgd_offset(vma->vm_mm, addr);
if (pgd_none_or_clear_bad(pgd))
@@ -184,12 +181,12 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
if (pmd_none_or_clear_bad(pmd))
goto none_mapped;
- mincore_pte_range(vma, pmd, addr, nr, vec);
- return nr;
+ mincore_pte_range(vma, pmd, addr, end, vec);
+ return (end - addr) >> PAGE_SHIFT;
none_mapped:
- mincore_unmapped_range(vma, addr, nr, vec);
- return nr;
+ mincore_unmapped_range(vma, addr, end, vec);
+ return (end - addr) >> PAGE_SHIFT;
}
/*
--
1.7.0.2
* [patch 4/5] mincore: do nested page table walks
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
` (2 preceding siblings ...)
2010-03-23 14:35 ` [patch 3/5] mincore: pass ranges as start,end address pairs Johannes Weiner
@ 2010-03-23 14:35 ` Johannes Weiner
2010-03-23 14:35 ` [rfc 5/5] mincore: transparent huge page support Johannes Weiner
2010-03-24 22:32 ` mincore and transparent huge pages Andrea Arcangeli
5 siblings, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:35 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
Do page table walks with the well-known nested loops we use in several
other places already.
This avoids doing full page table walks after every pte range and also
allows handling unmapped areas bigger than one pte range in one go.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/mincore.c | 82 +++++++++++++++++++++++++++++++++++++++++-----------------
1 files changed, 58 insertions(+), 24 deletions(-)
diff --git a/mm/mincore.c b/mm/mincore.c
index eb50daa..28cab9d 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -144,6 +144,61 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap_unlock(ptep - 1, ptl);
}
+static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pmd_t *pmd;
+
+ pmd = pmd_offset(pud, addr);
+ split_huge_page_vma(vma, pmd);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pte_range(vma, pmd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pud_t *pud;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pmd_range(vma, pud, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pud++, addr = next, addr != end);
+}
+
+static void mincore_page_range(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ unsigned long next;
+ pgd_t *pgd;
+
+ pgd = pgd_offset(vma->vm_mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ mincore_unmapped_range(vma, addr, next, vec);
+ else
+ mincore_pud_range(vma, pgd, addr, next, vec);
+ vec += (next - addr) >> PAGE_SHIFT;
+ } while (pgd++, addr = next, addr != end);
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -151,9 +206,6 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
struct vm_area_struct *vma;
unsigned long end;
@@ -163,29 +215,11 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
- if (is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma))
mincore_hugetlb_page_range(vma, addr, end, vec);
- return (end - addr) >> PAGE_SHIFT;
- }
-
- end = pmd_addr_end(addr, end);
-
- pgd = pgd_offset(vma->vm_mm, addr);
- if (pgd_none_or_clear_bad(pgd))
- goto none_mapped;
- pud = pud_offset(pgd, addr);
- if (pud_none_or_clear_bad(pud))
- goto none_mapped;
- pmd = pmd_offset(pud, addr);
- split_huge_page_vma(vma, pmd);
- if (pmd_none_or_clear_bad(pmd))
- goto none_mapped;
-
- mincore_pte_range(vma, pmd, addr, end, vec);
- return (end - addr) >> PAGE_SHIFT;
+ else
+ mincore_page_range(vma, addr, end, vec);
-none_mapped:
- mincore_unmapped_range(vma, addr, end, vec);
return (end - addr) >> PAGE_SHIFT;
}
--
1.7.0.2
* [rfc 5/5] mincore: transparent huge page support
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
` (3 preceding siblings ...)
2010-03-23 14:35 ` [patch 4/5] mincore: do nested page table walks Johannes Weiner
@ 2010-03-23 14:35 ` Johannes Weiner
2010-03-24 22:48 ` Andrea Arcangeli
2010-03-24 22:32 ` mincore and transparent huge pages Andrea Arcangeli
5 siblings, 1 reply; 11+ messages in thread
From: Johannes Weiner @ 2010-03-23 14:35 UTC (permalink / raw)
To: Andrew Morton, Andrea Arcangeli; +Cc: Naoya Horiguchi, linux-mm, linux-kernel
Handle transparent huge page pmd entries natively instead of splitting
them into subpages.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/mincore.c | 37 ++++++++++++++++++++++++++++++++++---
1 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/mm/mincore.c b/mm/mincore.c
index 28cab9d..d4cddc1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,6 +15,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
+#include <linux/rmap.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -144,6 +145,35 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap_unlock(ptep - 1, ptl);
}
+static int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ int huge = 0;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ huge = !pmd_trans_splitting(*pmd);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ /*
+ * If we have an intact huge pmd entry, all pages in
+ * the range are present in the mincore() sense of
+ * things.
+ *
+ * But if the entry is currently being split into
+ * normal page mappings, wait for it to finish and
+ * signal the fallback to ptes.
+ */
+ if (huge)
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ else
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+#endif
+ return huge;
+}
+
static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned char *vec)
@@ -152,12 +182,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd_t *pmd;
pmd = pmd_offset(pud, addr);
- split_huge_page_vma(vma, pmd);
do {
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd))
+ /* XXX: pmd_none_or_clear_bad() triggers on _PAGE_PSE */
+ if (pmd_none(*pmd))
mincore_unmapped_range(vma, addr, next, vec);
- else
+ else if (!pmd_trans_huge(*pmd) ||
+ !mincore_huge_pmd(vma, pmd, addr, next, vec))
mincore_pte_range(vma, pmd, addr, next, vec);
vec += (next - addr) >> PAGE_SHIFT;
} while (pmd++, addr = next, addr != end);
--
1.7.0.2
* Re: mincore and transparent huge pages
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
` (4 preceding siblings ...)
2010-03-23 14:35 ` [rfc 5/5] mincore: transparent huge page support Johannes Weiner
@ 2010-03-24 22:32 ` Andrea Arcangeli
5 siblings, 0 replies; 11+ messages in thread
From: Andrea Arcangeli @ 2010-03-24 22:32 UTC (permalink / raw)
To: Johannes Weiner; +Cc: Andrew Morton, Naoya Horiguchi, linux-mm, linux-kernel
Hi Johannes,
On Tue, Mar 23, 2010 at 03:34:57PM +0100, Johannes Weiner wrote:
>
> Hi,
>
> I wanted to make mincore() handle huge pmds natively over the weekend
> but I chose do beef up the code a bit first (1-4).
>
> Andrew, 1-4 may have merit without transparent huge pages, so they
> could go in independently. They are based on Andrea's patches but the
> only thing huge page in them is the split_huge_page_vma() call, so it
> would be easy to rebase (I can do that).
>
> Below is also an ugly hack I used to test transparent huge pages on my
> 32bit netbook. The VM_ flags, oh, the VM_ flags!
Thanks a lot for this effort.
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 85fa92a..b6aec57 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -106,10 +106,11 @@ extern unsigned int kobjsize(const void *objp);
> #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
> #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
> #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
> +#ifdef CONFIG_KSM
> +#error no more VM_ flags
> #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
> -#if BITS_PER_LONG > 32
> -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */
> #endif
> +#define VM_HUGEPAGE 0x80000000 /* MADV_HUGEPAGE marked this vma */
>
> #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
> #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
The moment we say we need 32-bit archs, I suggest taking over VM_SAO:
it's more likely that you need KSM on 32-bit x86 than transparent
hugepage on ppc32. I also doubt VM_RESERVED is still relevant these
days, but I won't take the tangent to go after it (if somebody does,
that's welcome; otherwise later, after transparent hugepage is in,
after KSM works on transparent hugepages, after memory compaction is
in, and after slab gets its front huge-allocator so that it still
allocates fine at 4k granularity but eats from a hugepage first if one
is available). Nuking VM_RESERVED is not a big priority compared to
all the rest...
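For illustration only (not a patch, and the exact config guard would
need checking), something like:

/* reuse the powerpc-only VM_SAO bit for VM_HUGEPAGE on 32bit x86 */
#ifdef CONFIG_PPC
#define VM_SAO		0x20000000	/* Strong Access Ordering (powerpc) */
#else
#define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
#endif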
* Re: [rfc 5/5] mincore: transparent huge page support
2010-03-23 14:35 ` [rfc 5/5] mincore: transparent huge page support Johannes Weiner
@ 2010-03-24 22:48 ` Andrea Arcangeli
2010-03-25 0:07 ` Johannes Weiner
2010-03-25 1:23 ` Johannes Weiner
0 siblings, 2 replies; 11+ messages in thread
From: Andrea Arcangeli @ 2010-03-24 22:48 UTC (permalink / raw)
To: Johannes Weiner; +Cc: Andrew Morton, Naoya Horiguchi, linux-mm, linux-kernel
On Tue, Mar 23, 2010 at 03:35:02PM +0100, Johannes Weiner wrote:
> +static int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> + unsigned long addr, unsigned long end,
> + unsigned char *vec)
> +{
> + int huge = 0;
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + spin_lock(&vma->vm_mm->page_table_lock);
> + if (likely(pmd_trans_huge(*pmd))) {
> + huge = !pmd_trans_splitting(*pmd);
Under mmap_sem (read or write) a hugepage can't materialize under
us. So here the pmd_trans_huge() check can be lockless and run
_before_ taking the page_table_lock. That's the invariant I used to
keep identical performance for all fast paths.
And if that weren't the case, it wouldn't be safe to return huge = 0,
as the page_table_lock is released at that point.
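The shape I mean, roughly (just a sketch of the pattern, same as in
follow_page()):

	if (pmd_trans_huge(*pmd)) {		/* lockless check, no lock yet */
		spin_lock(&vma->vm_mm->page_table_lock);
		if (likely(pmd_trans_huge(*pmd))) {	/* re-check under the lock */
			if (unlikely(pmd_trans_splitting(*pmd))) {
				spin_unlock(&vma->vm_mm->page_table_lock);
				wait_split_huge_page(vma->anon_vma, pmd);
			} else {
				/* stable huge pmd: the whole range is resident */
				spin_unlock(&vma->vm_mm->page_table_lock);
				memset(vec, 1, (end - addr) >> PAGE_SHIFT);
				return 1;
			}
		} else
			spin_unlock(&vma->vm_mm->page_table_lock);
	}
	return 0;	/* fall back to the pte walk */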
> + spin_unlock(&vma->vm_mm->page_table_lock);
> + /*
> + * If we have an intact huge pmd entry, all pages in
> + * the range are present in the mincore() sense of
> + * things.
> + *
> + * But if the entry is currently being split into
> + * normal page mappings, wait for it to finish and
> + * signal the fallback to ptes.
> + */
> + if (huge)
> + memset(vec, 1, (end - addr) >> PAGE_SHIFT);
> + else
> + wait_split_huge_page(vma->anon_vma, pmd);
> + } else
> + spin_unlock(&vma->vm_mm->page_table_lock);
> +#endif
> + return huge;
> +}
> +
It's probably cleaner to move the block into huge_memory.c and create
a dummy for the #ifndef version like I did for all the rest.
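Something like this, next to the other transhuge declarations (a
sketch; the prototype simply mirrors your mincore_huge_pmd()):

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
			    unsigned long addr, unsigned long end,
			    unsigned char *vec);
#else
static inline int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
				   unsigned long addr, unsigned long end,
				   unsigned char *vec)
{
	/* never a huge pmd without THP: caller falls through to the ptes */
	return 0;
}
#endif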
I'll incorporate and take care of those changes myself if you don't
mind, as I'm going to do a new submission for -mm. I greatly appreciate
you taking the time to port this to transhuge, it helps a lot! ;)
Thanks,
Andrea
* Re: [rfc 5/5] mincore: transparent huge page support
2010-03-24 22:48 ` Andrea Arcangeli
@ 2010-03-25 0:07 ` Johannes Weiner
2010-03-25 0:42 ` Andrea Arcangeli
2010-03-25 1:23 ` Johannes Weiner
1 sibling, 1 reply; 11+ messages in thread
From: Johannes Weiner @ 2010-03-25 0:07 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Andrew Morton, Naoya Horiguchi, linux-mm, linux-kernel
On Wed, Mar 24, 2010 at 11:48:58PM +0100, Andrea Arcangeli wrote:
> On Tue, Mar 23, 2010 at 03:35:02PM +0100, Johannes Weiner wrote:
> > +static int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + unsigned char *vec)
> > +{
> > + int huge = 0;
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > + spin_lock(&vma->vm_mm->page_table_lock);
> > + if (likely(pmd_trans_huge(*pmd))) {
> > + huge = !pmd_trans_splitting(*pmd);
>
> Under mmap_sem (read or write) a hugepage can't materialize under
> us. So here the pmd_trans_huge can be lockless and run _before_ taking
> the page_table_lock. That's the invariant I used to keep identical
> performance for all fast paths.
Stupid me. I knew that, I just hadn't internalized it enough to do it
right :)
Btw, unless I'm missing something else, isn't it the same in follow_page()?
diff --git a/mm/memory.c b/mm/memory.c
index 22ee158..6c26042 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1301,18 +1301,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
}
if (pmd_trans_huge(*pmd)) {
spin_lock(&mm->page_table_lock);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(&mm->page_table_lock);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(mm, address,
- pmd, flags);
- spin_unlock(&mm->page_table_lock);
- goto out;
- }
- } else
+ if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(mm, address, pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
/* fall through */
}
if (unlikely(pmd_bad(*pmd)))
> And if it wasn't the case it wouldn't be safe to return huge = 0 as
> the page_table_lock is released at that point.
True.
> > + spin_unlock(&vma->vm_mm->page_table_lock);
> > + /*
> > + * If we have an intact huge pmd entry, all pages in
> > + * the range are present in the mincore() sense of
> > + * things.
> > + *
> > + * But if the entry is currently being split into
> > + * normal page mappings, wait for it to finish and
> > + * signal the fallback to ptes.
> > + */
> > + if (huge)
> > + memset(vec, 1, (end - addr) >> PAGE_SHIFT);
> > + else
> > + wait_split_huge_page(vma->anon_vma, pmd);
> > + } else
> > + spin_unlock(&vma->vm_mm->page_table_lock);
> > +#endif
> > + return huge;
> > +}
> > +
>
> It's probably cleaner to move the block into huge_memory.c and create
> a dummy for the #ifndef version like I did for all the rest.
Agreed.
> I'll incorporate and take care of those changes myself if you don't
> mind, as I'm going to do a new submit for -mm.
Knock yourself out :-)
Hannes
* Re: [rfc 5/5] mincore: transparent huge page support
2010-03-25 0:07 ` Johannes Weiner
@ 2010-03-25 0:42 ` Andrea Arcangeli
0 siblings, 0 replies; 11+ messages in thread
From: Andrea Arcangeli @ 2010-03-25 0:42 UTC (permalink / raw)
To: Johannes Weiner; +Cc: Andrew Morton, Naoya Horiguchi, linux-mm, linux-kernel
On Thu, Mar 25, 2010 at 01:07:49AM +0100, Johannes Weiner wrote:
> Stupid me. I knew that, I just hadn't internalized it enough to do it
> right :)
Never mind ;)
> Btw, unless I miss something else, this is the same in follow_page()?
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 22ee158..6c26042 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1301,18 +1301,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
> }
> if (pmd_trans_huge(*pmd)) {
> spin_lock(&mm->page_table_lock);
follow_page already checked pmd_trans_huge first outside the lock.
> - if (likely(pmd_trans_huge(*pmd))) {
And then again inside. We have to re-check it inside to be safe,
otherwise we have to /* fall through */.
> - if (unlikely(pmd_trans_splitting(*pmd))) {
> - spin_unlock(&mm->page_table_lock);
> - wait_split_huge_page(vma->anon_vma, pmd);
> - } else {
> - page = follow_trans_huge_pmd(mm, address,
> - pmd, flags);
What I want to do in mincore is to call something like
mincore_trans_huge_pmd() to remove the #ifdef to make everyone happy.
> - spin_unlock(&mm->page_table_lock);
> - goto out;
> - }
> - } else
> + if (unlikely(pmd_trans_splitting(*pmd))) {
> spin_unlock(&mm->page_table_lock);
> + wait_split_huge_page(vma->anon_vma, pmd);
> + } else {
> + page = follow_trans_huge_pmd(mm, address, pmd, flags);
> + spin_unlock(&mm->page_table_lock);
> + goto out;
> + }
So it would miss one needed check inside the lock and it could lead to
call follow_trans_huge_pmd with a not huge pmd which is invalid.
Also note, touching with C language stuff that can change under you
like we do in the first check outside the lock (and isn't marked
volatile) may have unpredictable results in theory. But in practice we
do stuff like this all the time. Specifically in this case it's just
one bitflag check so it should be safe enough considering that
test_bit is implemented in C on x86 and does the same thing,
spin_is_locked likely is also implemented in C etc... In other cases
like "switch" parameters, gcc can implement it with a jump table, and
if the value changes under gcc it may jump beyond the end of the
table. It's a tradeoff between writing perfect C and having to deal
with asm all the time (or worse slowdown marking pmd volatile). Some
ages ago I guess I proposed to fix this including not assuming that
writes to the ptes are atomic with one instruction if done in C, but
nobody cared and in effect it works reliably also this way and it's a
bit simpler... ;).
> Knock yourself out :-)
As you wish! ;)
* Re: [rfc 5/5] mincore: transparent huge page support
2010-03-24 22:48 ` Andrea Arcangeli
2010-03-25 0:07 ` Johannes Weiner
@ 2010-03-25 1:23 ` Johannes Weiner
1 sibling, 0 replies; 11+ messages in thread
From: Johannes Weiner @ 2010-03-25 1:23 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: Andrew Morton, Naoya Horiguchi, linux-mm, linux-kernel
On Wed, Mar 24, 2010 at 11:48:58PM +0100, Andrea Arcangeli wrote:
> On Tue, Mar 23, 2010 at 03:35:02PM +0100, Johannes Weiner wrote:
> > +static int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > + unsigned long addr, unsigned long end,
> > + unsigned char *vec)
> > +{
> > + int huge = 0;
> > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> > + spin_lock(&vma->vm_mm->page_table_lock);
> > + if (likely(pmd_trans_huge(*pmd))) {
> > + huge = !pmd_trans_splitting(*pmd);
>
> Under mmap_sem (read or write) a hugepage can't materialize under
> us. So here the pmd_trans_huge can be lockless and run _before_ taking
> the page_table_lock. That's the invariant I used to keep identical
> performance for all fast paths.
Wait, there _is_ an unlocked fast-path pmd_trans_huge()
in mincore_pmd_range(), maybe you missed it?
This function is never called if the pmd is not huge.
So the above is the _second check_ under lock to get a stable
read on the entry that could be splitting or already have been
split while we checked locklessly.
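That is, after patches 4 and 5 the loop in mincore_pmd_range() reads:

	do {
		next = pmd_addr_end(addr, end);
		/* XXX: pmd_none_or_clear_bad() triggers on _PAGE_PSE */
		if (pmd_none(*pmd))
			mincore_unmapped_range(vma, addr, next, vec);
		else if (!pmd_trans_huge(*pmd) ||	/* unlocked fast path */
			 !mincore_huge_pmd(vma, pmd, addr, next, vec))
			mincore_pte_range(vma, pmd, addr, next, vec);
		vec += (next - addr) >> PAGE_SHIFT;
	} while (pmd++, addr = next, addr != end);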
Thread overview: 11+ messages (end of thread, newest: 2010-03-25 1:23 UTC):
2010-03-23 14:34 mincore and transparent huge pages Johannes Weiner
2010-03-23 14:34 ` [patch 1/5] mincore: cleanups Johannes Weiner
2010-03-23 14:34 ` [patch 2/5] mincore: break do_mincore() into logical pieces Johannes Weiner
2010-03-23 14:35 ` [patch 3/5] mincore: pass ranges as start,end address pairs Johannes Weiner
2010-03-23 14:35 ` [patch 4/5] mincore: do nested page table walks Johannes Weiner
2010-03-23 14:35 ` [rfc 5/5] mincore: transparent huge page support Johannes Weiner
2010-03-24 22:48 ` Andrea Arcangeli
2010-03-25 0:07 ` Johannes Weiner
2010-03-25 0:42 ` Andrea Arcangeli
2010-03-25 1:23 ` Johannes Weiner
2010-03-24 22:32 ` mincore and transparent huge pages Andrea Arcangeli