Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 1/8] arm64: hugetlb: Refactor find_num_contig
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Steve Capper, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, mark.rutland, linux-arch,
	David Woods, Punit Agrawal
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

As we regularly check for contiguous ptes in the huge accessors, remove
this extra check from find_num_contig.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
[ Resolved rebase conflicts due to patch re-ordering ]
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 69b8200b1cfd..710bf935a473 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -42,15 +42,13 @@ int pud_huge(pud_t pud)
 }
 
 static int find_num_contig(struct mm_struct *mm, unsigned long addr,
-			   pte_t *ptep, pte_t pte, size_t *pgsize)
+			   pte_t *ptep, size_t *pgsize)
 {
 	pgd_t *pgd = pgd_offset(mm, addr);
 	pud_t *pud;
 	pmd_t *pmd;
 
 	*pgsize = PAGE_SIZE;
-	if (!pte_cont(pte))
-		return 1;
 	pud = pud_offset(pgd, addr);
 	pmd = pmd_offset(pud, addr);
 	if ((pte_t *)pmd == ptep) {
@@ -65,15 +63,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 {
 	size_t pgsize;
 	int i;
-	int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize);
+	int ncontig;
 	unsigned long pfn;
 	pgprot_t hugeprot;
 
-	if (ncontig == 1) {
+	if (!pte_cont(pte)) {
 		set_pte_at(mm, addr, ptep, pte);
 		return;
 	}
 
+	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
 	pfn = pte_pfn(pte);
 	hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
 	for (i = 0; i < ncontig; i++) {
@@ -188,7 +187,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 		bool is_dirty = false;
 
 		cpte = huge_pte_offset(mm, addr);
-		ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
+		ncontig = find_num_contig(mm, addr, cpte, &pgsize);
 		/* save the 1st pte to return */
 		pte = ptep_get_and_clear(mm, addr, cpte);
 		for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) {
@@ -228,7 +227,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 		cpte = huge_pte_offset(vma->vm_mm, addr);
 		pfn = pte_pfn(*cpte);
 		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
-					  *cpte, &pgsize);
+					  &pgsize);
 		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) {
 			changed |= ptep_set_access_flags(vma, addr, cpte,
 							pfn_pte(pfn,
@@ -251,7 +250,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 		size_t pgsize = 0;
 
 		cpte = huge_pte_offset(mm, addr);
-		ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize);
+		ncontig = find_num_contig(mm, addr, cpte, &pgsize);
 		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
 			ptep_set_wrprotect(mm, addr, cpte);
 	} else {
@@ -269,7 +268,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
 
 		cpte = huge_pte_offset(vma->vm_mm, addr);
 		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
-					  *cpte, &pgsize);
+					  &pgsize);
 		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
 			ptep_clear_flush(vma, addr, cpte);
 	} else {
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 2/8] arm64: hugetlb: Remove spurious calls to huge_ptep_offset
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Steve Capper, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, mark.rutland, linux-arch,
	David Woods, Punit Agrawal
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

We don't need to call huge_pte_offset as our accessors are already
supplied with the pte_t *. This patch removes those spurious calls.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
[ Resolved rebase conflicts due to patch re-ordering ]
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 710bf935a473..f89aa8fa5855 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -183,21 +183,19 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 	if (pte_cont(*ptep)) {
 		int ncontig, i;
 		size_t pgsize;
-		pte_t *cpte;
 		bool is_dirty = false;
 
-		cpte = huge_pte_offset(mm, addr);
-		ncontig = find_num_contig(mm, addr, cpte, &pgsize);
+		ncontig = find_num_contig(mm, addr, ptep, &pgsize);
 		/* save the 1st pte to return */
-		pte = ptep_get_and_clear(mm, addr, cpte);
+		pte = ptep_get_and_clear(mm, addr, ptep);
 		for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) {
 			/*
 			 * If HW_AFDBM is enabled, then the HW could
 			 * turn on the dirty bit for any of the page
 			 * in the set, so check them all.
 			 */
-			++cpte;
-			if (pte_dirty(ptep_get_and_clear(mm, addr, cpte)))
+			++ptep;
+			if (pte_dirty(ptep_get_and_clear(mm, addr, ptep)))
 				is_dirty = true;
 		}
 		if (is_dirty)
@@ -213,8 +211,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 			       unsigned long addr, pte_t *ptep,
 			       pte_t pte, int dirty)
 {
-	pte_t *cpte;
-
 	if (pte_cont(pte)) {
 		int ncontig, i, changed = 0;
 		size_t pgsize = 0;
@@ -224,12 +220,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 			__pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
 				 pte_val(pte));
 
-		cpte = huge_pte_offset(vma->vm_mm, addr);
-		pfn = pte_pfn(*cpte);
-		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
+		pfn = pte_pfn(pte);
+		ncontig = find_num_contig(vma->vm_mm, addr, ptep,
 					  &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) {
-			changed |= ptep_set_access_flags(vma, addr, cpte,
+		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) {
+			changed |= ptep_set_access_flags(vma, addr, ptep,
 							pfn_pte(pfn,
 								hugeprot),
 							dirty);
@@ -246,13 +241,11 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 {
 	if (pte_cont(*ptep)) {
 		int ncontig, i;
-		pte_t *cpte;
 		size_t pgsize = 0;
 
-		cpte = huge_pte_offset(mm, addr);
-		ncontig = find_num_contig(mm, addr, cpte, &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
-			ptep_set_wrprotect(mm, addr, cpte);
+		ncontig = find_num_contig(mm, addr, ptep, &pgsize);
+		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize)
+			ptep_set_wrprotect(mm, addr, ptep);
 	} else {
 		ptep_set_wrprotect(mm, addr, ptep);
 	}
@@ -263,14 +256,12 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma,
 {
 	if (pte_cont(*ptep)) {
 		int ncontig, i;
-		pte_t *cpte;
 		size_t pgsize = 0;
 
-		cpte = huge_pte_offset(vma->vm_mm, addr);
-		ncontig = find_num_contig(vma->vm_mm, addr, cpte,
+		ncontig = find_num_contig(vma->vm_mm, addr, ptep,
 					  &pgsize);
-		for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize)
-			ptep_clear_flush(vma, addr, cpte);
+		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize)
+			ptep_clear_flush(vma, addr, ptep);
 	} else {
 		ptep_clear_flush(vma, addr, ptep);
 	}
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 3/8] mm, gup: Remove broken VM_BUG_ON_PAGE compound check for hugepages
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Will Deacon, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, n-horiguchi, kirill.shutemov, mike.kravetz,
	steve.capper, mark.rutland, linux-arch, aneesh.kumar,
	Punit Agrawal
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

From: Will Deacon <will.deacon@arm.com>

When operating on hugepages with DEBUG_VM enabled, the GUP code checks the
compound head for each tail page prior to calling page_cache_add_speculative.
This is broken, because on the fast-GUP path (where we don't hold any page
table locks) we can be racing with a concurrent invocation of
split_huge_page_to_list.

split_huge_page_to_list deals with this race by using page_ref_freeze to
freeze the page and force concurrent GUPs to fail whilst the component
pages are modified. This modification includes clearing the compound_head
field for the tail pages, so checking this prior to a successful call
to page_cache_add_speculative can lead to false positives: In fact,
page_cache_add_speculative *already* has this check once the page refcount
has been successfully updated, so we can simply remove the broken calls
to VM_BUG_ON_PAGE.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 mm/gup.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index d9e6fddcc51f..ccf8cb38234f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1361,7 +1361,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 	head = pmd_page(orig);
 	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	do {
-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 		pages[*nr] = page;
 		(*nr)++;
 		page++;
@@ -1400,7 +1399,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 	head = pud_page(orig);
 	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	do {
-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 		pages[*nr] = page;
 		(*nr)++;
 		page++;
@@ -1438,7 +1436,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 	head = pgd_page(orig);
 	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
 	do {
-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 		pages[*nr] = page;
 		(*nr)++;
 		page++;
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 4/8] mm, gup: Ensure real head page is ref-counted when using hugepages
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Punit Agrawal, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, n-horiguchi, kirill.shutemov,
	mike.kravetz, steve.capper, mark.rutland, linux-arch,
	aneesh.kumar, Michal Hocko
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

When speculatively taking references to a hugepage using
page_cache_add_speculative() in gup_huge_pmd(), it is assumed that the
page returned by pmd_page() is the head page. Although normally true,
this assumption doesn't hold when the hugepage consists of successive
page table entries such as when using contiguous bit on arm64 at PTE or
PMD levels.

This can be addressed by ensuring that the page passed to
page_cache_add_speculative() is the real head or by de-referencing the
head page within the function.

We take the first approach to keep the usage pattern aligned with
page_cache_get_speculative() where users already pass the appropriate
page, i.e., the de-referenced head.

Apply the same logic to fix gup_huge_[pud|pgd]() as well.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 mm/gup.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index ccf8cb38234f..be67996513be 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1358,8 +1358,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return __gup_device_huge_pmd(orig, addr, end, pages, nr);
 
 	refs = 0;
-	head = pmd_page(orig);
-	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
 	do {
 		pages[*nr] = page;
 		(*nr)++;
@@ -1367,6 +1366,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
+	head = compound_head(page);
 	if (!page_cache_add_speculative(head, refs)) {
 		*nr -= refs;
 		return 0;
@@ -1396,8 +1396,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return __gup_device_huge_pud(orig, addr, end, pages, nr);
 
 	refs = 0;
-	head = pud_page(orig);
-	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
 	do {
 		pages[*nr] = page;
 		(*nr)++;
@@ -1405,6 +1404,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
+	head = compound_head(page);
 	if (!page_cache_add_speculative(head, refs)) {
 		*nr -= refs;
 		return 0;
@@ -1433,8 +1433,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 
 	BUILD_BUG_ON(pgd_devmap(orig));
 	refs = 0;
-	head = pgd_page(orig);
-	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+	page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
 	do {
 		pages[*nr] = page;
 		(*nr)++;
@@ -1442,6 +1441,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
+	head = compound_head(page);
 	if (!page_cache_add_speculative(head, refs)) {
 		*nr -= refs;
 		return 0;
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 5/8] mm/hugetlb: add size parameter to huge_pte_offset()
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Punit Agrawal, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, n-horiguchi, kirill.shutemov,
	mike.kravetz, steve.capper, mark.rutland, linux-arch,
	aneesh.kumar, Tony Luck, Fenghua Yu, James Hogan, Ralf Baechle,
	James E.J. Bottomley, Helge Deller, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Martin Schwidefsky,
	Heiko Carstens, Yoshinori Sato, Rich Felker, David S. Miller,
	Chris Metcalf, Thomas Gleixner, Ingo Molnar, H. Peter Anvin,
	Alexander Viro, Michal Hocko, Hillf Danton
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

A poisoned or migrated hugepage is stored as a swap entry in the page
tables. On architectures that support hugepages consisting of contiguous
page table entries (such as on arm64) this leads to ambiguity in
determining the page table entry to return in huge_pte_offset() when a
poisoned entry is encountered.

Let's remove the ambiguity by adding a size parameter to convey
additional information about the requested address. Also fix up the
definition/usage of huge_pte_offset() throughout the tree.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: James Hogan <james.hogan@imgtec.com> (odd fixer:METAG ARCHITECTURE)
Cc: Ralf Baechle <ralf@linux-mips.org> (supporter:MIPS)
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
---
 arch/arm64/mm/hugetlbpage.c   |  3 ++-
 arch/ia64/mm/hugetlbpage.c    |  4 ++--
 arch/metag/mm/hugetlbpage.c   |  3 ++-
 arch/mips/mm/hugetlbpage.c    |  3 ++-
 arch/parisc/mm/hugetlbpage.c  |  3 ++-
 arch/powerpc/mm/hugetlbpage.c |  2 +-
 arch/s390/mm/hugetlbpage.c    |  3 ++-
 arch/sh/mm/hugetlbpage.c      |  3 ++-
 arch/sparc/mm/hugetlbpage.c   |  3 ++-
 arch/tile/mm/hugetlbpage.c    |  3 ++-
 arch/x86/mm/hugetlbpage.c     |  2 +-
 fs/userfaultfd.c              |  7 +++++--
 include/linux/hugetlb.h       |  5 +++--
 mm/hugetlb.c                  | 23 ++++++++++++++---------
 mm/page_vma_mapped.c          |  3 ++-
 mm/pagewalk.c                 |  3 ++-
 16 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f89aa8fa5855..656e0ece2289 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -131,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 85de86d36fdf..ae35140332f7 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 }
 
 pte_t *
-huge_pte_offset (struct mm_struct *mm, unsigned long addr)
+huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	unsigned long taddr = htlbpage_to_page(addr);
 	pgd_t *pgd;
@@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ
 	if (REGION_NUMBER(addr) != RGN_HPAGE)
 		return ERR_PTR(-EINVAL);
 
-	ptep = huge_pte_offset(mm, addr);
+	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
 	if (!ptep || pte_none(*ptep))
 		return NULL;
 	page = pte_page(*ptep);
diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c
index db1b7da91e4f..67fd53e2935a 100644
--- a/arch/metag/mm/hugetlbpage.c
+++ b/arch/metag/mm/hugetlbpage.c
@@ -74,7 +74,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 74aa6f62468f..cef152234312 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
+		       unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index aa50ac090e9b..5eb8f633b282 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a4f33de4008e..e46744d3b4ae 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -55,7 +55,7 @@ static unsigned nr_gpages;
 
 #define hugepd_none(hpd)	(hpd_val(hpd) == 0)
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
 	/* Only called for hugetlbfs pages, hence can ignore THP */
 	return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 9b4050caa4e9..ae23afc18493 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -176,7 +176,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return (pte_t *) pmdp;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index cc948db74878..d2412d2d6462 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 7c29d38e6b99..8989c5e155b3 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c
index cb10153b5c9f..1f0993945521 100644
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level)
 	return ptep;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	pud_t *pud;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 302f43fd9c28..ccf509063dfd 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 	if (!vma || !is_vm_hugetlb_page(vma))
 		return ERR_PTR(-EINVAL);
 
-	pte = huge_pte_offset(mm, address);
+	pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
 
 	/* hugetlb should be locked, and hence, prefaulted */
 	WARN_ON(!pte || pte_none(*pte));
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f7555fc25877..7b9c94837895 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
  * hugepmd ranges.
  */
 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					 struct vm_area_struct *vma,
 					 unsigned long address,
 					 unsigned long flags,
 					 unsigned long reason)
@@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 
 	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
-	pte = huge_pte_offset(mm, address);
+	pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));
 	if (!pte)
 		goto out;
 
@@ -243,6 +244,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 }
 #else
 static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					 struct vm_area_struct *vma,
 					 unsigned long address,
 					 unsigned long flags,
 					 unsigned long reason)
@@ -435,7 +437,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
 						  reason);
 	else
-		must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
+						       vmf->address,
 						       vmf->flags, reason);
 	up_read(&mm->mmap_sem);
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b857fc8cc2ec..23010a3b2047 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -113,7 +113,8 @@ extern struct list_head huge_boot_pages;
 
 pte_t *huge_pte_alloc(struct mm_struct *mm,
 			unsigned long addr, unsigned long sz);
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz);
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 			      int write);
@@ -157,7 +158,7 @@ static inline void hugetlb_show_meminfo(void)
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
 				src_addr, pagep)	({ BUG(); 0; })
-#define huge_pte_offset(mm, address)	0
+#define huge_pte_offset(mm, address, sz)	0
 static inline int dequeue_hwpoisoned_huge_page(struct page *page)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e5828875f7bb..0e4d1fb3122f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3233,7 +3233,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		spinlock_t *src_ptl, *dst_ptl;
-		src_pte = huge_pte_offset(src, addr);
+		src_pte = huge_pte_offset(src, addr, sz);
 		if (!src_pte)
 			continue;
 		dst_pte = huge_pte_alloc(dst, addr, sz);
@@ -3317,7 +3317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	address = start;
 	for (; address < end; address += sz) {
-		ptep = huge_pte_offset(mm, address);
+		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
 			continue;
 
@@ -3535,7 +3535,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unmap_ref_private(mm, vma, old_page, address);
 			BUG_ON(huge_pte_none(pte));
 			spin_lock(ptl);
-			ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+			ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+					       huge_page_size(h));
 			if (likely(ptep &&
 				   pte_same(huge_ptep_get(ptep), pte)))
 				goto retry_avoidcopy;
@@ -3574,7 +3575,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * before the page tables are altered
 	 */
 	spin_lock(ptl);
-	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
+	ptep = huge_pte_offset(mm, address & huge_page_mask(h),
+			       huge_page_size(h));
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		ClearPagePrivate(new_page);
 
@@ -3861,7 +3863,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	address &= huge_page_mask(h);
 
-	ptep = huge_pte_offset(mm, address);
+	ptep = huge_pte_offset(mm, address, huge_page_size(h));
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
@@ -4118,7 +4120,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 *
 		 * Note that page table lock is not held when pte is null.
 		 */
-		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
+				      huge_page_size(h));
 		if (pte)
 			ptl = huge_pte_lock(h, mm, pte);
 		absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -4252,7 +4255,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (; address < end; address += huge_page_size(h)) {
 		spinlock_t *ptl;
-		ptep = huge_pte_offset(mm, address);
+		ptep = huge_pte_offset(mm, address, huge_page_size(h));
 		if (!ptep)
 			continue;
 		ptl = huge_pte_lock(h, mm, ptep);
@@ -4516,7 +4519,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 
 		saddr = page_table_shareable(svma, vma, addr, idx);
 		if (saddr) {
-			spte = huge_pte_offset(svma->vm_mm, saddr);
+			spte = huge_pte_offset(svma->vm_mm, saddr,
+					       vma_mmu_pagesize(svma));
 			if (spte) {
 				get_page(virt_to_page(spte));
 				break;
@@ -4612,7 +4616,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
 	return pte;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr, unsigned long sz)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index de9c40d7304a..8ec6ba230bb9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 
 	if (unlikely(PageHuge(pvmw->page))) {
 		/* when pud is not present, pte will be NULL */
-		pvmw->pte = huge_pte_offset(mm, pvmw->address);
+		pvmw->pte = huge_pte_offset(mm, pvmw->address,
+					    PAGE_SIZE << compound_order(page));
 		if (!pvmw->pte)
 			return false;
 
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 60f7856e508f..1a4197965415 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 	struct hstate *h = hstate_vma(vma);
 	unsigned long next;
 	unsigned long hmask = huge_page_mask(h);
+	unsigned long sz = huge_page_size(h);
 	pte_t *pte;
 	int err = 0;
 
 	do {
 		next = hugetlb_entry_end(h, addr, end);
-		pte = huge_pte_offset(walk->mm, addr & hmask);
+		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
 		if (pte && walk->hugetlb_entry)
 			err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
 		if (err)
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 6/8] mm/hugetlb: Allow architectures to override huge_pte_clear()
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Punit Agrawal, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, n-horiguchi, kirill.shutemov,
	mike.kravetz, steve.capper, mark.rutland, linux-arch,
	aneesh.kumar, Heiko Carstens
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

When unmapping a hugepage range, huge_pte_clear() is used to clear the
page table entries that are marked as not present. huge_pte_clear()
internally just ends up calling pte_clear() which does not correctly
deal with hugepages consisting of contiguous page table entries.

Add a size argument to address this issue and allow architectures to
override huge_pte_clear() by wrapping it in a #ifndef block.

Update the s390 implementation with the size parameter as well.

Note that the change only affects huge_pte_clear() - the other generic
hugetlb functions don't need any change.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
---
 arch/s390/include/asm/hugetlb.h | 2 +-
 include/asm-generic/hugetlb.h   | 4 +++-
 mm/hugetlb.c                    | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index cd546a245c68..c0443500baec 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -39,7 +39,7 @@ static inline int prepare_hugepage_range(struct file *file,
 #define arch_clear_hugepage_flags(page)		do { } while (0)
 
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
-				  pte_t *ptep)
+				  pte_t *ptep, unsigned long sz)
 {
 	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
 		pte_val(*ptep) = _REGION3_ENTRY_EMPTY;
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 99b490b4d05a..540354f94f83 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -31,10 +31,12 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
 	return pte_modify(pte, newprot);
 }
 
+#ifndef huge_pte_clear
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
-				  pte_t *ptep)
+		    pte_t *ptep, unsigned long sz)
 {
 	pte_clear(mm, addr, ptep);
 }
+#endif
 
 #endif /* _ASM_GENERIC_HUGETLB_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0e4d1fb3122f..ddfed20cd637 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3338,7 +3338,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 * unmapped and its refcount is dropped, so just clear pte here.
 		 */
 		if (unlikely(!pte_present(pte))) {
-			huge_pte_clear(mm, address, ptep);
+			huge_pte_clear(mm, address, ptep, sz);
 			spin_unlock(ptl);
 			continue;
 		}
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 7/8] mm/hugetlb: Introduce set_huge_swap_pte_at() helper
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Punit Agrawal, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, n-horiguchi, kirill.shutemov,
	mike.kravetz, steve.capper, mark.rutland, linux-arch,
	aneesh.kumar
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

set_huge_pte_at(), an architecture callback to populate hugepage ptes,
does not provide the range of virtual memory that is targeted. This
leads to ambiguity when dealing with swap entries on architectures that
support hugepages consisting of contiguous ptes.

Fix the problem by introducing an overridable helper for architectures
needing this support. The helper is called when populating the page
tables with swap entries. The size of the targeted region is provided to
the helper to help determine the number of entries to be updated.

Provide a default implementation that maintains the current behaviour.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
---
 include/linux/hugetlb.h | 8 ++++++++
 mm/hugetlb.c            | 8 +++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 23010a3b2047..879eb063fb95 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -435,6 +435,14 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 }
 #endif
 
+#ifndef set_huge_swap_pte_at
+static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
+					pte_t *ptep, pte_t pte, unsigned long sz)
+{
+	set_huge_pte_at(mm, addr, ptep, pte);
+}
+#endif
+
 static inline struct hstate *page_hstate(struct page *page)
 {
 	VM_BUG_ON_PAGE(!PageHuge(page), page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ddfed20cd637..e3052c16d29a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3263,9 +3263,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 				 */
 				make_migration_entry_read(&swp_entry);
 				entry = swp_entry_to_pte(swp_entry);
-				set_huge_pte_at(src, addr, src_pte, entry);
+				set_huge_swap_pte_at(src, addr, src_pte,
+						     entry, sz);
 			}
-			set_huge_pte_at(dst, addr, dst_pte, entry);
+			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 		} else {
 			if (cow) {
 				huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -4277,7 +4278,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 
 				make_migration_entry_read(&entry);
 				newpte = swp_entry_to_pte(entry);
-				set_huge_pte_at(mm, address, ptep, newpte);
+				set_huge_swap_pte_at(mm, address, ptep,
+						     newpte, huge_page_size(h));
 				pages++;
 			}
 			spin_unlock(ptl);
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 8/8] mm: rmap: Use correct helper when poisoning hugepages
From: Punit Agrawal @ 2017-05-24 11:54 UTC (permalink / raw)
  To: akpm
  Cc: Punit Agrawal, linux-mm, linux-kernel, linux-arm-kernel,
	catalin.marinas, will.deacon, n-horiguchi, kirill.shutemov,
	mike.kravetz, steve.capper, mark.rutland, linux-arch,
	aneesh.kumar
In-Reply-To: <20170524115409.31309-1-punit.agrawal@arm.com>

Using set_pte_at() does not do the right thing when putting down
HWPOISON swap entries for hugepages on architectures that support
contiguous ptes.

Fix this problem by using set_huge_swap_pte_at() which was introduced to
fix exactly this problem.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Steve Capper <steve.capper@arm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
---
 mm/rmap.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index d405f0e0ee96..feb2352aa95f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1379,15 +1379,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		update_hiwater_rss(mm);
 
 		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
 			if (PageHuge(page)) {
 				int nr = 1 << compound_order(page);
 				hugetlb_count_sub(nr, mm);
+				set_huge_swap_pte_at(mm, address,
+						     pvmw.pte, pteval,
+						     vma_mmu_pagesize(vma));
 			} else {
 				dec_mm_counter(mm, mm_counter(page));
+				set_pte_at(mm, address, pvmw.pte, pteval);
 			}
 
-			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
-			set_pte_at(mm, address, pvmw.pte, pteval);
 		} else if (pte_unused(pteval)) {
 			/*
 			 * The guest indicated that the page content is of no
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [PATCH 0/6] refine and rename slub sysfs
From: Michal Hocko @ 2017-05-24 12:03 UTC (permalink / raw)
  To: Wei Yang; +Cc: cl, penberg, rientjes, akpm, linux-mm, linux-kernel
In-Reply-To: <20170524095450.GA7706@WeideMBP.lan>

On Wed 24-05-17 17:54:50, Wei Yang wrote:
> On Tue, May 23, 2017 at 08:39:11AM +0200, Michal Hocko wrote:
[...]
> >Is this worth risking breakage of the userspace which consume this data
> >now? Do you have any user space code which will greatly benefit from the
> >new data and which couldn't do the same with the current format/output?
> >
> >If yes this all should be in the changelog.
> 
> The answer is no.
> 
> I have the same concern as yours. So this patch set could be divided into two
> parts: 1. add some new entry with current name convention, 2. change the name
> convention.

Who is going to use those new entries and for what purpose? Why do we
want to expose even more details of the slab allocator to the userspace?
Is the missing information something fundamental that some user space
cannot work without? Seriously, these are essential questions you
should have answers for _before_ posting the patch, and all those
reasons should be mentioned in the changelog.
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] mm/vmalloc: a slight change of compare target in __insert_vmap_area()
From: Michal Hocko @ 2017-05-24 12:11 UTC (permalink / raw)
  To: Wei Yang; +Cc: akpm, linux-mm, linux-kernel
In-Reply-To: <20170524100347.8131-1-richard.weiyang@gmail.com>

On Wed 24-05-17 18:03:47, Wei Yang wrote:
> The vmap RB tree store the elements in order and no overlap between any of
> them. The comparison in __insert_vmap_area() is to decide which direction
> the search should follow and make sure the new vmap_area is not overlap
> with any other.
> 
> Current implementation fails to do the overlap check.
> 
> When first "if" is not true, it means
> 
>     va->va_start >= tmp_va->va_end
> 
> And with the truth
> 
>     xxx->va_end > xxx->va_start
> 
> The deduction is
> 
>     va->va_end > tmp_va->va_start
> 
> which is the condition in second "if".
> 
> This patch changes a little of the comparison in __insert_vmap_area() to
> make sure it forbids the overlapped vmap_area.

Why do we care about overlapping vmap areas at this level? This is an
internal function and all the sanity checks should have been done by
that time AFAIR. Could you describe the problem which you are trying to
fix/address?

> Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
> ---
>  mm/vmalloc.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 0b057628a7ba..8087451cb332 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -360,9 +360,9 @@ static void __insert_vmap_area(struct vmap_area *va)
>  
>  		parent = *p;
>  		tmp_va = rb_entry(parent, struct vmap_area, rb_node);
> -		if (va->va_start < tmp_va->va_end)
> +		if (va->va_end <= tmp_va->va_start)
>  			p = &(*p)->rb_left;
> -		else if (va->va_end > tmp_va->va_start)
> +		else if (va->va_start >= tmp_va->va_end)
>  			p = &(*p)->rb_right;
>  		else
>  			BUG();
> -- 
> 2.11.0
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [Question] Mlocked count will not be decreased
From: Xishi Qiu @ 2017-05-24 12:10 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: Yisheng Xie, Kefeng Wang, linux-mm, linux-kernel, zhongjiang
In-Reply-To: <d354b321-0d11-4308-0b0e-aacef5a5e34b@suse.cz>

On 2017/5/24 19:52, Vlastimil Babka wrote:

> On 05/24/2017 01:38 PM, Xishi Qiu wrote:
>>>
>>> Race condition with what? Who else would isolate our pages?
>>>
>>
>> Hi Vlastimil,
>>
>> I find the root cause, if the page was not cached on the current cpu,
>> lru_add_drain() will not push it to LRU. So we should handle fail
>> case in mlock_vma_page().
> 
> Yeah that would explain it.
> 
>> follow_page_pte()
>> 		...
>> 		if (page->mapping && trylock_page(page)) {
>> 			lru_add_drain();  /* push cached pages to LRU */
>> 			/*
>> 			 * Because we lock page here, and migration is
>> 			 * blocked by the pte's page reference, and we
>> 			 * know the page is still mapped, we don't even
>> 			 * need to check for file-cache page truncation.
>> 			 */
>> 			mlock_vma_page(page);
>> 			unlock_page(page);
>> 		}
>> 		...
>>
>> I think we should add yisheng's patch, also we should add the following change.
>> I think it is better than use lru_add_drain_all().
> 
> I agree about yisheng's fix (but v2 didn't address my comments). I don't
> think we should add the hunk below, as that deviates from the rest of
> the design.

Hi Vlastimil,

The rest of the design is that mlock should always succeed here, right?

If we don't handle the failure case, the page will end up on the anon/file
LRU list later when __pagevec_lru_add() is called, but NR_MLOCK has already
been increased. This is wrong, right?

Thanks,
Xishi Qiu

> 
> Thanks,
> Vlastimil
> 
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index 3d3ee6c..ca2aeb9 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -88,6 +88,11 @@ void mlock_vma_page(struct page *page)
>>  		count_vm_event(UNEVICTABLE_PGMLOCKED);
>>  		if (!isolate_lru_page(page))
>>  			putback_lru_page(page);
>> +		else {
>> +			ClearPageMlocked(page);
>> +			mod_zone_page_state(page_zone(page), NR_MLOCK,
>> +					-hpage_nr_pages(page));
>> +		}
>>  	}
>>  }
>>
>> Thanks,
>> Xishi Qiu
>>
> 
> 
> .
> 



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [RFC PATCH 0/2] remove CONFIG_MOVABLE_NODE
From: Michal Hocko @ 2017-05-24 12:24 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Mel Gorman, Vlastimil Babka, Andrea Arcangeli,
	Jerome Glisse, Reza Arbab, Yasuaki Ishimatsu, qiuxishi,
	Kani Toshimitsu, slaoub, Joonsoo Kim, Andi Kleen, David Rientjes,
	Daniel Kiper, Igor Mammedov, Vitaly Kuznetsov, LKML

Hi,
I am continuing to clean up the memory hotplug code and
CONFIG_MOVABLE_NODE seems dubious at best. The following two patches
simply remove the flag and make it de-facto always enabled.

The current semantic of the config option is twofold: 1) it automatically
binds hotpluggable nodes to have memory in zone_movable by default when
movable_node is enabled, and 2) it forbids memory hotplug to online all
the memory as movable when !CONFIG_MOVABLE_NODE.

The latter restriction is quite dubious because there is no clear cut of
how much normal memory we need for a reasonable system operation. A
single memory block which is sufficient to allow further movable
onlines is far from sufficient (e.g. a node with >2GB and memblocks
128MB will fill up this zone with struct pages leaving nothing for
other allocations). Removing the config option will not only reduce the
configuration space, it also removes quite some code.

The semantic of the movable_node command line parameter is preserved.

The first patch removes the restriction mentioned above and the second
one simply removes all the CONFIG_MOVABLE_NODE related stuff.

Shortlog
Michal Hocko (2):
      mm, memory_hotplug: drop artificial restriction on online/offline
      mm, memory_hotplug: drop CONFIG_MOVABLE_NODE

Diffstat:
 Documentation/admin-guide/kernel-parameters.txt |  7 ++-
 drivers/base/node.c                             |  4 --
 include/linux/memblock.h                        | 18 -------
 include/linux/nodemask.h                        |  4 --
 mm/Kconfig                                      | 26 -----------
 mm/memblock.c                                   |  2 -
 mm/memory_hotplug.c                             | 62 -------------------------
 mm/page_alloc.c                                 |  2 -
 8 files changed, 5 insertions(+), 120 deletions(-)


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [RFC PATCH 1/2] mm, memory_hotplug: drop artificial restriction on online/offline
From: Michal Hocko @ 2017-05-24 12:24 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Mel Gorman, Vlastimil Babka, Andrea Arcangeli,
	Jerome Glisse, Reza Arbab, Yasuaki Ishimatsu, qiuxishi,
	Kani Toshimitsu, slaoub, Joonsoo Kim, Andi Kleen, David Rientjes,
	Daniel Kiper, Igor Mammedov, Vitaly Kuznetsov, LKML, Michal Hocko
In-Reply-To: <20170524122411.25212-1-mhocko@kernel.org>

From: Michal Hocko <mhocko@suse.com>

74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
memory") has added can_offline_normal which checks the amount of
memory in !movable zones as long as CONFIG_MOVABLE_NODE is disabled.
It disallows offlining memory if there is nothing left with a
justification that "memory-management acts bad when we have nodes which
is online but don't have any normal memory".

74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
memory") has introduced a restriction that every numa node has to have
at least some memory in !movable zones before a first movable memory
can be onlined if !CONFIG_MOVABLE_NODE with the same justification.

While it is true that not having _any_ memory for kernel allocations on
a NUMA node is far from great and such a node would be quite suboptimal
because all kernel allocations will have to fall back to another NUMA
node, there is no reason to disallow such a configuration in
principle.

Besides that, there is not really a big difference between having one
memblock for ZONE_NORMAL available or none. With 128MB size memblocks the
system might thrash on the kernel allocation requests anyway. It is really
hard to draw a line on how much normal memory is really sufficient, so
we have to rely on the administrator to configure the system sanely.
Therefore drop the artificial restriction and remove can_offline_normal
and can_online_high_movable altogether.

Signed-off-by: Michal Hocko <mhocko@suse.com>

mm, memory_hotplug: drop can_online_high_movable

 because "memory-management acts
bad when we have nodes which is online but don't have any normal memory.

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/memory_hotplug.c | 58 -----------------------------------------------------
 1 file changed, 58 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 599c675ad538..10052c2fd400 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -763,23 +763,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
 	return 0;
 }
 
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
- * normal memory.
- */
-static bool can_online_high_movable(int nid)
-{
-	return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure every online node has NORMAL memory */
-static bool can_online_high_movable(int nid)
-{
-	return node_state(nid, N_NORMAL_MEMORY);
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
 /* check which state of node_states will be changed when online memory */
 static void node_states_check_changes_online(unsigned long nr_pages,
 	struct zone *zone, struct memory_notify *arg)
@@ -979,9 +962,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
 		return -EINVAL;
 
-	if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid))
-		return -EINVAL;
-
 	/* associate pfn range with the zone */
 	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
 
@@ -1579,41 +1559,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 	return offlined;
 }
 
-#ifdef CONFIG_MOVABLE_NODE
-/*
- * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
- * normal memory.
- */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
-	return true;
-}
-#else /* CONFIG_MOVABLE_NODE */
-/* ensure the node has NORMAL memory if it is still online */
-static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
-{
-	struct pglist_data *pgdat = zone->zone_pgdat;
-	unsigned long present_pages = 0;
-	enum zone_type zt;
-
-	for (zt = 0; zt <= ZONE_NORMAL; zt++)
-		present_pages += pgdat->node_zones[zt].present_pages;
-
-	if (present_pages > nr_pages)
-		return true;
-
-	present_pages = 0;
-	for (; zt <= ZONE_MOVABLE; zt++)
-		present_pages += pgdat->node_zones[zt].present_pages;
-
-	/*
-	 * we can't offline the last normal memory until all
-	 * higher memory is offlined.
-	 */
-	return present_pages == 0;
-}
-#endif /* CONFIG_MOVABLE_NODE */
-
 static int __init cmdline_parse_movable_node(char *p)
 {
 #ifdef CONFIG_MOVABLE_NODE
@@ -1741,9 +1686,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
 	node = zone_to_nid(zone);
 	nr_pages = end_pfn - start_pfn;
 
-	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
-		return -EINVAL;
-
 	/* set above range as isolated */
 	ret = start_isolate_page_range(start_pfn, end_pfn,
 				       MIGRATE_MOVABLE, true);
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 2/2] mm, memory_hotplug: drop CONFIG_MOVABLE_NODE
From: Michal Hocko @ 2017-05-24 12:24 UTC (permalink / raw)
  To: linux-mm
  Cc: Andrew Morton, Mel Gorman, Vlastimil Babka, Andrea Arcangeli,
	Jerome Glisse, Reza Arbab, Yasuaki Ishimatsu, qiuxishi,
	Kani Toshimitsu, slaoub, Joonsoo Kim, Andi Kleen, David Rientjes,
	Daniel Kiper, Igor Mammedov, Vitaly Kuznetsov, LKML, Michal Hocko
In-Reply-To: <20170524122411.25212-1-mhocko@kernel.org>

From: Michal Hocko <mhocko@suse.com>

20b2f52b73fe ("numa: add CONFIG_MOVABLE_NODE for movable-dedicated
node") has introduced CONFIG_MOVABLE_NODE without a good explanation on
why it is actually useful. It makes a lot of sense to make movable node
semantic opt-in but we already have that because the feature has to be
explicitly enabled on the kernel command line. A config option on top
only makes the configuration space larger without a good reason. It also
adds additional ifdefery that pollutes the code. Just drop the config
option and make it de-facto always enabled. This shouldn't introduce any
change to the semantics.

Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  7 +++++--
 drivers/base/node.c                             |  4 ----
 include/linux/memblock.h                        | 18 -----------------
 include/linux/nodemask.h                        |  4 ----
 mm/Kconfig                                      | 26 -------------------------
 mm/memblock.c                                   |  2 --
 mm/memory_hotplug.c                             |  4 ----
 mm/page_alloc.c                                 |  2 --
 8 files changed, 5 insertions(+), 62 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index facc20a3f962..ec7d6ae01c96 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2246,8 +2246,11 @@
 			that the amount of memory usable for all allocations
 			is not too small.
 
-	movable_node	[KNL] Boot-time switch to enable the effects
-			of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
+	movable_node	[KNL] Boot-time switch to make hotpluggable nodes
+			movable. This means that the memory of such nodes
+			will be usable only for movable allocations which
+			rules out almost all kernel allocations. Use with
+			caution!
 
 	MTD_Partition=	[MTD]
 			Format: <name>,<region-number>,<size>,<offset>
diff --git a/drivers/base/node.c b/drivers/base/node.c
index dff5b53f7905..26f4b9c02f2c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -639,9 +639,7 @@ static struct node_attr node_state_attr[] = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
-#ifdef CONFIG_MOVABLE_NODE
 	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
-#endif
 	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 };
 
@@ -652,9 +650,7 @@ static struct attribute *node_state_attrs[] = {
 #ifdef CONFIG_HIGHMEM
 	&node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
-#ifdef CONFIG_MOVABLE_NODE
 	&node_state_attr[N_MEMORY].attr.attr,
-#endif
 	&node_state_attr[N_CPU].attr.attr,
 	NULL
 };
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index bdfc65af4152..9622fb8c101b 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -57,10 +57,8 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
-#ifdef CONFIG_MOVABLE_NODE
 /* If movable_node boot option specified */
 extern bool movable_node_enabled;
-#endif /* CONFIG_MOVABLE_NODE */
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
@@ -168,7 +166,6 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
 	     i != (u64)ULLONG_MAX;					\
 	     __next_reserved_mem_region(&i, p_start, p_end))
 
-#ifdef CONFIG_MOVABLE_NODE
 static inline bool memblock_is_hotpluggable(struct memblock_region *m)
 {
 	return m->flags & MEMBLOCK_HOTPLUG;
@@ -178,16 +175,6 @@ static inline bool __init_memblock movable_node_is_enabled(void)
 {
 	return movable_node_enabled;
 }
-#else
-static inline bool memblock_is_hotpluggable(struct memblock_region *m)
-{
-	return false;
-}
-static inline bool movable_node_is_enabled(void)
-{
-	return false;
-}
-#endif
 
 static inline bool memblock_is_mirror(struct memblock_region *m)
 {
@@ -295,7 +282,6 @@ phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
 
 phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align);
 
-#ifdef CONFIG_MOVABLE_NODE
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
@@ -313,10 +299,6 @@ static inline bool memblock_bottom_up(void)
 {
 	return memblock.bottom_up;
 }
-#else
-static inline void __init memblock_set_bottom_up(bool enable) {}
-static inline bool memblock_bottom_up(void) { return false; }
-#endif
 
 /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */
 #define MEMBLOCK_ALLOC_ANYWHERE	(~(phys_addr_t)0)
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index f746e44d4046..cf0b91c3ec12 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -387,11 +387,7 @@ enum node_states {
 #else
 	N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
-#ifdef CONFIG_MOVABLE_NODE
 	N_MEMORY,		/* The node has memory(regular, high, movable) */
-#else
-	N_MEMORY = N_HIGH_MEMORY,
-#endif
 	N_CPU,		/* The node has one or more cpus */
 	NR_NODE_STATES
 };
diff --git a/mm/Kconfig b/mm/Kconfig
index 0354a4be5a55..99645f42dc62 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -149,32 +149,6 @@ config NO_BOOTMEM
 config MEMORY_ISOLATION
 	bool
 
-config MOVABLE_NODE
-	bool "Enable to assign a node which has only movable memory"
-	depends on HAVE_MEMBLOCK
-	depends on NO_BOOTMEM
-	depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
-	depends on NUMA
-	default n
-	help
-	  Allow a node to have only movable memory.  Pages used by the kernel,
-	  such as direct mapping pages cannot be migrated.  So the corresponding
-	  memory device cannot be hotplugged.  This option allows the following
-	  two things:
-	  - When the system is booting, node full of hotpluggable memory can
-	  be arranged to have only movable memory so that the whole node can
-	  be hot-removed. (need movable_node boot option specified).
-	  - After the system is up, the option allows users to online all the
-	  memory of a node as movable memory so that the whole node can be
-	  hot-removed.
-
-	  Users who don't use the memory hotplug feature are fine with this
-	  option on since they don't specify movable_node boot option or they
-	  don't online memory as movable.
-
-	  Say Y here if you want to hotplug a whole node.
-	  Say N here if you want kernel to use memory on all nodes evenly.
-
 #
 # Only be set on architectures that have completely implemented memory hotplug
 # feature. If you are not sure, don't touch it.
diff --git a/mm/memblock.c b/mm/memblock.c
index 696f06d17c4e..4895f5a6cf7e 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,9 +54,7 @@ struct memblock memblock __initdata_memblock = {
 };
 
 int memblock_debug __initdata_memblock;
-#ifdef CONFIG_MOVABLE_NODE
 bool movable_node_enabled __initdata_memblock = false;
-#endif
 static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
 static int memblock_memory_in_slab __initdata_memblock = 0;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 10052c2fd400..2a14f8c18a22 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1561,11 +1561,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
 
 static int __init cmdline_parse_movable_node(char *p)
 {
-#ifdef CONFIG_MOVABLE_NODE
 	movable_node_enabled = true;
-#else
-	pr_warn("movable_node option not supported\n");
-#endif
 	return 0;
 }
 early_param("movable_node", cmdline_parse_movable_node);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a26e19c3e1ff..02f5757bf253 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -112,9 +112,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
-#ifdef CONFIG_MOVABLE_NODE
 	[N_MEMORY] = { { [0] = 1UL } },
-#endif
 	[N_CPU] = { { [0] = 1UL } },
 #endif	/* NUMA */
 };
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* Re: [RFC PATCH 1/2] mm, memory_hotplug: drop artificial restriction on online/offline
From: Vlastimil Babka @ 2017-05-24 12:44 UTC (permalink / raw)
  To: Michal Hocko, linux-mm
  Cc: Andrew Morton, Mel Gorman, Andrea Arcangeli, Jerome Glisse,
	Reza Arbab, Yasuaki Ishimatsu, qiuxishi, Kani Toshimitsu, slaoub,
	Joonsoo Kim, Andi Kleen, David Rientjes, Daniel Kiper,
	Igor Mammedov, Vitaly Kuznetsov, LKML, Michal Hocko
In-Reply-To: <20170524122411.25212-2-mhocko@kernel.org>

On 05/24/2017 02:24 PM, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
> memory") has added can_offline_normal which checks the amount of
> memory in !movable zones as long as CONFIG_MOVABLE_NODE is disabled.
> It disallows offlining memory if there is nothing left with a
> justification that "memory-management acts bad when we have nodes which
> is online but don't have any normal memory".
> 
> 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
> memory")

That's the same commit as above... one of them should be different?

> has introduced a restriction that every numa node has to have
> at least some memory in !movable zones before a first movable memory
> can be onlined if !CONFIG_MOVABLE_NODE with the same justification
> 
> While it is true that not having _any_ memory for kernel allocations on
> a NUMA node is far from great and such a node would be quite suboptimal
> because all kernel allocations will have to fall back to another NUMA
> node, there is no reason to disallow such a configuration in
> principle.
> 
> Besides that, there is not really a big difference between having one
> memblock for ZONE_NORMAL available or none. With 128MB memblocks the
> system might thrash on kernel allocation requests anyway. It is really
> hard to draw a line on how much normal memory is really sufficient, so
> we have to rely on the administrator to configure the system sanely.
> Therefore drop the artificial restriction and remove can_offline_normal
> and can_online_high_movable altogether.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>

-

> mm, memory_hotplug: drop can_online_high_movable
> 
>  because "memory-management acts
> bad when we have nodes which is online but don't have any normal memory.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>

-
Some editing issue?

Otherwise makes sense to me.
Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>  mm/memory_hotplug.c | 58 -----------------------------------------------------
>  1 file changed, 58 deletions(-)
> 
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 599c675ad538..10052c2fd400 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -763,23 +763,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>  	return 0;
>  }
>  
> -#ifdef CONFIG_MOVABLE_NODE
> -/*
> - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
> - * normal memory.
> - */
> -static bool can_online_high_movable(int nid)
> -{
> -	return true;
> -}
> -#else /* CONFIG_MOVABLE_NODE */
> -/* ensure every online node has NORMAL memory */
> -static bool can_online_high_movable(int nid)
> -{
> -	return node_state(nid, N_NORMAL_MEMORY);
> -}
> -#endif /* CONFIG_MOVABLE_NODE */
> -
>  /* check which state of node_states will be changed when online memory */
>  static void node_states_check_changes_online(unsigned long nr_pages,
>  	struct zone *zone, struct memory_notify *arg)
> @@ -979,9 +962,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
>  	if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type))
>  		return -EINVAL;
>  
> -	if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid))
> -		return -EINVAL;
> -
>  	/* associate pfn range with the zone */
>  	zone = move_pfn_range(online_type, nid, pfn, nr_pages);
>  
> @@ -1579,41 +1559,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
>  	return offlined;
>  }
>  
> -#ifdef CONFIG_MOVABLE_NODE
> -/*
> - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
> - * normal memory.
> - */
> -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
> -{
> -	return true;
> -}
> -#else /* CONFIG_MOVABLE_NODE */
> -/* ensure the node has NORMAL memory if it is still online */
> -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
> -{
> -	struct pglist_data *pgdat = zone->zone_pgdat;
> -	unsigned long present_pages = 0;
> -	enum zone_type zt;
> -
> -	for (zt = 0; zt <= ZONE_NORMAL; zt++)
> -		present_pages += pgdat->node_zones[zt].present_pages;
> -
> -	if (present_pages > nr_pages)
> -		return true;
> -
> -	present_pages = 0;
> -	for (; zt <= ZONE_MOVABLE; zt++)
> -		present_pages += pgdat->node_zones[zt].present_pages;
> -
> -	/*
> -	 * we can't offline the last normal memory until all
> -	 * higher memory is offlined.
> -	 */
> -	return present_pages == 0;
> -}
> -#endif /* CONFIG_MOVABLE_NODE */
> -
>  static int __init cmdline_parse_movable_node(char *p)
>  {
>  #ifdef CONFIG_MOVABLE_NODE
> @@ -1741,9 +1686,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
>  	node = zone_to_nid(zone);
>  	nr_pages = end_pfn - start_pfn;
>  
> -	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
> -		return -EINVAL;
> -
>  	/* set above range as isolated */
>  	ret = start_isolate_page_range(start_pfn, end_pfn,
>  				       MIGRATE_MOVABLE, true);
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH 2/2] mm, memory_hotplug: drop CONFIG_MOVABLE_NODE
From: Vlastimil Babka @ 2017-05-24 12:53 UTC (permalink / raw)
  To: Michal Hocko, linux-mm
  Cc: Andrew Morton, Mel Gorman, Andrea Arcangeli, Jerome Glisse,
	Reza Arbab, Yasuaki Ishimatsu, qiuxishi, Kani Toshimitsu, slaoub,
	Joonsoo Kim, Andi Kleen, David Rientjes, Daniel Kiper,
	Igor Mammedov, Vitaly Kuznetsov, LKML, Michal Hocko
In-Reply-To: <20170524122411.25212-3-mhocko@kernel.org>

On 05/24/2017 02:24 PM, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> 20b2f52b73fe ("numa: add CONFIG_MOVABLE_NODE for movable-dedicated
> node") has introduced CONFIG_MOVABLE_NODE without a good explanation on
> why it is actually useful. It makes a lot of sense to make movable node
> semantic opt in but we already have that because the feature has to be
> explicitly enabled on the kernel command line. A config option on top
> only makes the configuration space larger without a good reason. It also
> adds an additional ifdefery that pollutes the code. Just drop the config
> option and make it de-facto always enabled. This shouldn't introduce any
> change to the semantic.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>

I agree with the intention.

> ---
>  Documentation/admin-guide/kernel-parameters.txt |  7 +++++--
>  drivers/base/node.c                             |  4 ----
>  include/linux/memblock.h                        | 18 -----------------
>  include/linux/nodemask.h                        |  4 ----
>  mm/Kconfig                                      | 26 -------------------------
>  mm/memblock.c                                   |  2 --
>  mm/memory_hotplug.c                             |  4 ----
>  mm/page_alloc.c                                 |  2 --
>  8 files changed, 5 insertions(+), 62 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index facc20a3f962..ec7d6ae01c96 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2246,8 +2246,11 @@
>  			that the amount of memory usable for all allocations
>  			is not too small.
>  
> -	movable_node	[KNL] Boot-time switch to enable the effects
> -			of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
> +	movable_node	[KNL] Boot-time switch to make hotplugable to be

			hotplugable what, memory? nodes?

> +			movable. This means that the memory of such nodes
> +			will be usable only for movable allocations which
> +			rules out almost all kernel allocations. Use with
> +			caution!
>  
>  	MTD_Partition=	[MTD]
>  			Format: <name>,<region-number>,<size>,<offset>

...

> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -149,32 +149,6 @@ config NO_BOOTMEM
>  config MEMORY_ISOLATION
>  	bool
>  
> -config MOVABLE_NODE
> -	bool "Enable to assign a node which has only movable memory"
> -	depends on HAVE_MEMBLOCK
> -	depends on NO_BOOTMEM
> -	depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
> -	depends on NUMA

That's a lot of depends. What happens if some of them are not met and
the movable_node bootparam is used?

> -	default n
> -	help
> -	  Allow a node to have only movable memory.  Pages used by the kernel,
> -	  such as direct mapping pages cannot be migrated.  So the corresponding
> -	  memory device cannot be hotplugged.  This option allows the following
> -	  two things:
> -	  - When the system is booting, node full of hotpluggable memory can
> -	  be arranged to have only movable memory so that the whole node can
> -	  be hot-removed. (need movable_node boot option specified).

> -	  - After the system is up, the option allows users to online all the
> -	  memory of a node as movable memory so that the whole node can be
> -	  hot-removed.

Strictly speaking this part is already gone with patch 1/2. Only matters
in case this one is rejected for some reason.

> -	  Users who don't use the memory hotplug feature are fine with this
> -	  option on since they don't specify movable_node boot option or they
> -	  don't online memory as movable.
> -
> -	  Say Y here if you want to hotplug a whole node.
> -	  Say N here if you want kernel to use memory on all nodes evenly.
> -
>  #
>  # Only be set on architectures that have completely implemented memory hotplug
>  # feature. If you are not sure, don't touch it.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [RFC PATCH 1/2] mm, memory_hotplug: drop artificial restriction on online/offline
From: Michal Hocko @ 2017-05-24 12:55 UTC (permalink / raw)
  To: Vlastimil Babka
  Cc: linux-mm, Andrew Morton, Mel Gorman, Andrea Arcangeli,
	Jerome Glisse, Reza Arbab, Yasuaki Ishimatsu, qiuxishi,
	Kani Toshimitsu, slaoub, Joonsoo Kim, Andi Kleen, David Rientjes,
	Daniel Kiper, Igor Mammedov, Vitaly Kuznetsov, LKML
In-Reply-To: <467b4bcb-cc7e-a001-b35c-29d0ce29efee@suse.cz>

On Wed 24-05-17 14:44:34, Vlastimil Babka wrote:
> On 05/24/2017 02:24 PM, Michal Hocko wrote:
> > From: Michal Hocko <mhocko@suse.com>
> > 
> > 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
> > memory") has added can_offline_normal which checks the amount of
> > memory in !movable zones as long as CONFIG_MOVABLE_NODE is disabled.
> > It disallows offlining memory if there is nothing left with a
> > justification that "memory-management acts bad when we have nodes which
> > is online but don't have any normal memory".
> > 
> > 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL
> > memory")
> 
> That's the same commit as above... one of them should be different?

This used to be two different patches which I decided to fold together
and I didn't realize that both online and offline paths were introduced
by the same patch.
[...]
> Some editing issue?

yes result of merging two commits.

> Otherwise makes sense to me.
> Acked-by: Vlastimil Babka <vbabka@suse.cz>

Thanks! Updated version follows
---

^ permalink raw reply

* [PATCH] mm: hwpoison: Use compound_head() flags for huge pages
From: James Morse @ 2017-05-24 13:02 UTC (permalink / raw)
  To: linux-mm; +Cc: Naoya Horiguchi, James Morse, Punit Agrawal

memory_failure() chooses a recovery action function based on the page
flags. For huge pages it uses the tail page flags which don't have
anything interesting set, resulting in:
> Memory failure: 0x9be3b4: Unknown page state
> Memory failure: 0x9be3b4: recovery action for unknown page: Failed

Instead, save a copy of the head page's flags if this is a huge page.
This means that if there are no relevant flags for this tail page, we use
the head page's flags instead. This results in the me_huge_page()
recovery action being called:
> Memory failure: 0x9b7969: recovery action for huge page: Delayed

For hugepages that have not yet been allocated, this allows the hugepage
to be dequeued.

CC: Punit Agrawal <punit.agrawal@arm.com>
Signed-off-by: James Morse <james.morse@arm.com>
---
This is intended as a fix, but I can't find the patch that introduced this
behaviour. (not recent, and there is a lot of history down there!)

This doesn't apply to stable trees before v3.10...
Cc: stable@vger.kernel.org # 3.10.105

 mm/memory-failure.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2527dfeddb00..44a6a33af219 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1184,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
 	 * correctly, we save a copy of the page flags at this time.
 	 */
-	page_flags = p->flags;
+	if (PageHuge(p))
+		page_flags = hpage->flags;
+	else
+		page_flags = p->flags;
 
 	/*
 	 * unpoison always clear PG_hwpoison inside page lock
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 0/9] arm64: Enable contiguous pte hugepage support
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Punit Agrawal, linux-arm-kernel, steve.capper, mark.rutland,
	linux-mm

Hi,

This patchset addresses all the known issues with contiguous hugetlb
pages. Support for contiguous hugepages is useful on systems where the
PMD hugepage size is too large (512MB hugepage when using 64k page
granule) and contiguous hugepages can be used to provide reasonable
hugepage sizes to the user.

The patches can be split as follows:

* Patches 1-3, 9 cleanups and improvements

* Patch 4 addresses the break-before-make requirement of the
  architecture for contiguous hugepages. This patch depends on
  enabling memory failure handling on arm64[2].

* Patches 5-7 add support for handling swap entries for contiguous pte
  hugepages. These patches depend on fixes to core code required to
  support contiguous hugepages[3].

* Patch 8 enables contiguous hugepage support for arm64

The patches are based on v4.12-rc2. Previous postings can be found at
[0], [1].

All feedback welcome.

Thanks,
Punit

[0] https://www.spinics.net/lists/arm-kernel/msg570422.html
[1] http://lists.infradead.org/pipermail/linux-arm-kernel/2017-March/497027.html
[2] https://www.spinics.net/lists/arm-kernel/msg581657.html
[3] https://www.spinics.net/lists/arm-kernel/msg583342.html

Changes v3 -> v4
* Moved Patches 2 and 4 to [3] due to dependencies

Changes v2 -> v3
* Rebased on v4.12-rc2
* Included swap related fixes in this series
* Enable contiguous pte hugepages

Changes v1 -> v2:
* Marked patch 2 for stable
* Fixed comment issues in patch 7
* Added tags

Punit Agrawal (4):
  arm64: hugetlbpages: Handle swap entries in huge_pte_offset() for
    contiguous hugepages
  arm64: hugetlb: Override huge_pte_clear() to support contiguous
    hugepages
  arm64: hugetlb: Override set_huge_swap_pte_at() to support contiguous
    hugepages
  arm64: Re-enable support for contiguous hugepages

Steve Capper (5):
  arm64: hugetlb: set_huge_pte_at Add WARN_ON on !pte_present
  arm64: hugetlb: Introduce pte_pgprot helper
  arm64: hugetlb: Spring clean huge pte accessors
  arm64: hugetlb: Add break-before-make logic for contiguous entries
  arm64: hugetlb: Cleanup setup_hugepagesz

 arch/arm64/include/asm/hugetlb.h |   9 +-
 arch/arm64/mm/hugetlbpage.c      | 287 ++++++++++++++++++++++++++++-----------
 2 files changed, 213 insertions(+), 83 deletions(-)

-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* [PATCH v4 1/9] arm64: hugetlb: set_huge_pte_at Add WARN_ON on !pte_present
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Steve Capper, linux-arm-kernel, mark.rutland, linux-mm,
	David Woods, Punit Agrawal
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

This patch adds a WARN_ON to set_huge_pte_at as the accessor assumes
that entries to be written down are all present. (There are separate
accessors to clear huge ptes).

We will need to handle the !pte_present case where memory offlining
is used on hugetlb pages. swap and migration entries will be supplied
to set_huge_pte_at in this case.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 656e0ece2289..7b61e4833432 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -67,6 +67,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	unsigned long pfn;
 	pgprot_t hugeprot;
 
+	/*
+	 * Code needs to be expanded to handle huge swap and migration
+	 * entries. Needed for HUGETLB and MEMORY_FAILURE.
+	 */
+	WARN_ON(!pte_present(pte));
+
 	if (!pte_cont(pte)) {
 		set_pte_at(mm, addr, ptep, pte);
 		return;
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 2/9] arm64: hugetlb: Introduce pte_pgprot helper
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Steve Capper, linux-arm-kernel, mark.rutland, linux-mm,
	David Woods, Punit Agrawal
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

Rather than xor pte bits in various places, use this helper function.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 7b61e4833432..cb84ca33bc6b 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -41,6 +41,16 @@ int pud_huge(pud_t pud)
 #endif
 }
 
+/*
+ * Select all bits except the pfn
+ */
+static inline pgprot_t pte_pgprot(pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
+
+	return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+}
+
 static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, size_t *pgsize)
 {
@@ -80,7 +90,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 
 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
 	pfn = pte_pfn(pte);
-	hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+	hugeprot = pte_pgprot(pte);
 	for (i = 0; i < ncontig; i++) {
 		pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
 			 pte_val(pfn_pte(pfn, hugeprot)));
@@ -223,9 +233,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 		size_t pgsize = 0;
 		unsigned long pfn = pte_pfn(pte);
 		/* Select all bits except the pfn */
-		pgprot_t hugeprot =
-			__pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
-				 pte_val(pte));
+		pgprot_t hugeprot = pte_pgprot(pte);
 
 		pfn = pte_pfn(pte);
 		ncontig = find_num_contig(vma->vm_mm, addr, ptep,
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 3/9] arm64: hugetlb: Spring clean huge pte accessors
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Steve Capper, linux-arm-kernel, mark.rutland, linux-mm,
	David Woods, Punit Agrawal
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

This patch aims to re-structure the huge pte accessors without affecting
their functionality. Control flow is changed to reduce indentation, and
greater use is made of the for loop's increment expression to update
loop variables.

It is then much easier to add break-before-make semantics in a subsequent
patch.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 119 ++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 65 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index cb84ca33bc6b..08deed7c71f0 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -74,7 +74,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	size_t pgsize;
 	int i;
 	int ncontig;
-	unsigned long pfn;
+	unsigned long pfn, dpfn;
 	pgprot_t hugeprot;
 
 	/*
@@ -90,14 +90,13 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 
 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
 	pfn = pte_pfn(pte);
+	dpfn = pgsize >> PAGE_SHIFT;
 	hugeprot = pte_pgprot(pte);
-	for (i = 0; i < ncontig; i++) {
+
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) {
 		pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
 			 pte_val(pfn_pte(pfn, hugeprot)));
 		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
-		ptep++;
-		pfn += pgsize >> PAGE_SHIFT;
-		addr += pgsize;
 	}
 }
 
@@ -195,91 +194,81 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 			      unsigned long addr, pte_t *ptep)
 {
-	pte_t pte;
-
-	if (pte_cont(*ptep)) {
-		int ncontig, i;
-		size_t pgsize;
-		bool is_dirty = false;
-
-		ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-		/* save the 1st pte to return */
-		pte = ptep_get_and_clear(mm, addr, ptep);
-		for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) {
-			/*
-			 * If HW_AFDBM is enabled, then the HW could
-			 * turn on the dirty bit for any of the page
-			 * in the set, so check them all.
-			 */
-			++ptep;
-			if (pte_dirty(ptep_get_and_clear(mm, addr, ptep)))
-				is_dirty = true;
-		}
-		if (is_dirty)
-			return pte_mkdirty(pte);
-		else
-			return pte;
-	} else {
+	int ncontig, i;
+	size_t pgsize;
+	pte_t orig_pte = huge_ptep_get(ptep);
+
+	if (!pte_cont(orig_pte))
 		return ptep_get_and_clear(mm, addr, ptep);
+
+	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
+	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
+		/*
+		 * If HW_AFDBM is enabled, then the HW could
+		 * turn on the dirty bit for any of the page
+		 * in the set, so check them all.
+		 */
+		if (pte_dirty(ptep_get_and_clear(mm, addr, ptep)))
+			orig_pte = pte_mkdirty(orig_pte);
 	}
+
+	return orig_pte;
 }
 
 int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 			       unsigned long addr, pte_t *ptep,
 			       pte_t pte, int dirty)
 {
-	if (pte_cont(pte)) {
-		int ncontig, i, changed = 0;
-		size_t pgsize = 0;
-		unsigned long pfn = pte_pfn(pte);
-		/* Select all bits except the pfn */
-		pgprot_t hugeprot = pte_pgprot(pte);
-
-		pfn = pte_pfn(pte);
-		ncontig = find_num_contig(vma->vm_mm, addr, ptep,
-					  &pgsize);
-		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) {
-			changed |= ptep_set_access_flags(vma, addr, ptep,
-							pfn_pte(pfn,
-								hugeprot),
-							dirty);
-			pfn += pgsize >> PAGE_SHIFT;
-		}
-		return changed;
-	} else {
+	int ncontig, i, changed = 0;
+	size_t pgsize = 0;
+	unsigned long pfn = pte_pfn(pte), dpfn;
+	pgprot_t hugeprot;
+
+	if (!pte_cont(pte))
 		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
+	dpfn = pgsize >> PAGE_SHIFT;
+	hugeprot = pte_pgprot(pte);
+
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) {
+		changed |= ptep_set_access_flags(vma, addr, ptep,
+				pfn_pte(pfn, hugeprot), dirty);
 	}
+
+	return changed;
 }
 
 void huge_ptep_set_wrprotect(struct mm_struct *mm,
 			     unsigned long addr, pte_t *ptep)
 {
-	if (pte_cont(*ptep)) {
-		int ncontig, i;
-		size_t pgsize = 0;
+	int ncontig, i;
+	size_t pgsize;
 
-		ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize)
-			ptep_set_wrprotect(mm, addr, ptep);
-	} else {
+	if (!pte_cont(*ptep)) {
 		ptep_set_wrprotect(mm, addr, ptep);
+		return;
 	}
+
+	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+		ptep_set_wrprotect(mm, addr, ptep);
 }
 
 void huge_ptep_clear_flush(struct vm_area_struct *vma,
 			   unsigned long addr, pte_t *ptep)
 {
-	if (pte_cont(*ptep)) {
-		int ncontig, i;
-		size_t pgsize = 0;
-
-		ncontig = find_num_contig(vma->vm_mm, addr, ptep,
-					  &pgsize);
-		for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize)
-			ptep_clear_flush(vma, addr, ptep);
-	} else {
+	int ncontig, i;
+	size_t pgsize;
+
+	if (!pte_cont(*ptep)) {
 		ptep_clear_flush(vma, addr, ptep);
+		return;
 	}
+
+	ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+		ptep_clear_flush(vma, addr, ptep);
 }
 
 static __init int setup_hugepagesz(char *opt)
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 4/9] arm64: hugetlb: Add break-before-make logic for contiguous entries
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Steve Capper, linux-arm-kernel, mark.rutland, linux-mm,
	David Woods, Punit Agrawal
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

From: Steve Capper <steve.capper@arm.com>

It has become apparent that one has to take special care when modifying
attributes of memory mappings that employ the contiguous bit.

Both the requirement and the architecturally correct "Break-Before-Make"
technique of updating contiguous entries can be found described in:
ARM DDI 0487A.k_iss10775, "Misprogramming of the Contiguous bit",
page D4-1762.

The huge pte accessors currently replace the attributes of contiguous
pte entries in place thus can, on certain platforms, lead to TLB
conflict aborts or even erroneous results returned from TLB lookups.

This patch adds a helper function get_clear_flush(.) that clears a
contiguous entry and returns the head pte (whilst taking care to
retain dirty bit information that could have been modified by DBM).
A TLB invalidate is then performed to ensure that there is no
possibility of multiple TLB entries being present for the same
region.

Cc: David Woods <dwoods@mellanox.com>
Signed-off-by: Steve Capper <steve.capper@arm.com>
(Fixed indentation and some comments cleanup)
Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
---
 arch/arm64/mm/hugetlbpage.c | 81 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 08deed7c71f0..f2c976464f39 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -68,6 +68,47 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 	return CONT_PTES;
 }
 
+/*
+ * Changing some bits of contiguous entries requires us to follow a
+ * Break-Before-Make approach, breaking the whole contiguous set
+ * before we can change any entries. See ARM DDI 0487A.k_iss10775,
+ * "Misprogramming of the Contiguous bit", page D4-1762.
+ *
+ * This helper performs the break step.
+ */
+static pte_t get_clear_flush(struct mm_struct *mm,
+			     unsigned long addr,
+			     pte_t *ptep,
+			     unsigned long pgsize,
+			     unsigned long ncontig)
+{
+	unsigned long i, saddr = addr;
+	struct vm_area_struct vma = { .vm_mm = mm };
+	pte_t orig_pte = huge_ptep_get(ptep);
+
+	/*
+	 * If we already have a faulting entry then we don't need
+	 * to break before make (there won't be a tlb entry cached).
+	 */
+	if (!pte_present(orig_pte))
+		return orig_pte;
+
+	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
+		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
+
+		/*
+		 * If HW_AFDBM is enabled, then the HW could turn on
+		 * the dirty bit for any page in the set, so check
+		 * them all.  All hugetlb entries are already young.
+		 */
+		if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && pte_dirty(pte))
+			orig_pte = pte_mkdirty(orig_pte);
+	}
+
+	flush_tlb_range(&vma, saddr, addr);
+	return orig_pte;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 			    pte_t *ptep, pte_t pte)
 {
@@ -93,6 +134,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	dpfn = pgsize >> PAGE_SHIFT;
 	hugeprot = pte_pgprot(pte);
 
+	get_clear_flush(mm, addr, ptep, pgsize, ncontig);
+
 	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) {
 		pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep,
 			 pte_val(pfn_pte(pfn, hugeprot)));
@@ -194,7 +237,7 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 			      unsigned long addr, pte_t *ptep)
 {
-	int ncontig, i;
+	int ncontig;
 	size_t pgsize;
 	pte_t orig_pte = huge_ptep_get(ptep);
 
@@ -202,17 +245,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 		return ptep_get_and_clear(mm, addr, ptep);
 
 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
-		/*
-		 * If HW_AFDBM is enabled, then the HW could
-		 * turn on the dirty bit for any of the page
-		 * in the set, so check them all.
-		 */
-		if (pte_dirty(ptep_get_and_clear(mm, addr, ptep)))
-			orig_pte = pte_mkdirty(orig_pte);
-	}
 
-	return orig_pte;
+	return get_clear_flush(mm, addr, ptep, pgsize, ncontig);
 }
 
 int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -222,6 +256,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	int ncontig, i, changed = 0;
 	size_t pgsize = 0;
 	unsigned long pfn = pte_pfn(pte), dpfn;
+	pte_t orig_pte;
 	pgprot_t hugeprot;
 
 	if (!pte_cont(pte))
@@ -231,10 +266,12 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	dpfn = pgsize >> PAGE_SHIFT;
 	hugeprot = pte_pgprot(pte);
 
-	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) {
-		changed |= ptep_set_access_flags(vma, addr, ptep,
-				pfn_pte(pfn, hugeprot), dirty);
-	}
+	orig_pte = get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
+	if (!pte_same(orig_pte, pte))
+		changed = 1;
+
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
+		set_pte_at(vma->vm_mm, addr, ptep, pfn_pte(pfn, hugeprot));
 
 	return changed;
 }
@@ -244,6 +281,9 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 {
 	int ncontig, i;
 	size_t pgsize;
+	pte_t pte = pte_wrprotect(huge_ptep_get(ptep)), orig_pte;
+	unsigned long pfn = pte_pfn(pte), dpfn;
+	pgprot_t hugeprot;
 
 	if (!pte_cont(*ptep)) {
 		ptep_set_wrprotect(mm, addr, ptep);
@@ -251,8 +291,15 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
 	}
 
 	ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
-		ptep_set_wrprotect(mm, addr, ptep);
+	dpfn = pgsize >> PAGE_SHIFT;
+
+	orig_pte = get_clear_flush(mm, addr, ptep, pgsize, ncontig);
+	if (pte_dirty(orig_pte))
+		pte = pte_mkdirty(pte);
+
+	hugeprot = pte_pgprot(pte);
+	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
+		set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
 }
 
 void huge_ptep_clear_flush(struct vm_area_struct *vma,
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 5/9] arm64: hugetlbpages: Handle swap entries in huge_pte_offset() for contiguous hugepages
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Punit Agrawal, linux-arm-kernel, steve.capper, mark.rutland,
	linux-mm, David Woods
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

huge_pte_offset() was updated to correctly handle swap entries for
hugepages. With the addition of the size parameter, it is now possible
to disambiguate whether the request is for a regular hugepage or a
contiguous hugepage.

Fix huge_pte_offset() for contiguous hugepages by using the size to find
the correct page table entry.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Cc: David Woods <dwoods@mellanox.com>
---
 arch/arm64/mm/hugetlbpage.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f2c976464f39..e9061704aec3 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -195,6 +195,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
+	pte_t *pte;
 
 	pgd = pgd_offset(mm, addr);
 	pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd);
@@ -202,19 +203,29 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 		return NULL;
 
 	pud = pud_offset(pgd, addr);
-	if (pud_none(*pud))
+	if (pud_none(*pud) && sz != PUD_SIZE)
 		return NULL;
 	/* swap or huge page */
 	if (!pud_present(*pud) || pud_huge(*pud))
 		return (pte_t *)pud;
 	/* table; check the next level */
 
+	if (sz == CONT_PMD_SIZE)
+		addr &= CONT_PMD_MASK;
+
 	pmd = pmd_offset(pud, addr);
-	if (pmd_none(*pmd))
+	if (pmd_none(*pmd) &&
+	    !(sz == PMD_SIZE || sz == CONT_PMD_SIZE))
 		return NULL;
 	if (!pmd_present(*pmd) || pmd_huge(*pmd))
 		return (pte_t *)pmd;
 
+	if (sz == CONT_PTE_SIZE) {
+		pte = pte_offset_kernel(
+			pmd, (addr & CONT_PTE_MASK));
+		return pte;
+	}
+
 	return NULL;
 }
 
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 6/9] arm64: hugetlb: Override huge_pte_clear() to support contiguous hugepages
From: Punit Agrawal @ 2017-05-24 13:11 UTC (permalink / raw)
  To: will.deacon, catalin.marinas
  Cc: Punit Agrawal, linux-arm-kernel, steve.capper, mark.rutland,
	linux-mm, David Woods
In-Reply-To: <20170524131122.5309-1-punit.agrawal@arm.com>

The default huge_pte_clear() implementation does not clear all of the
page table entries that back a contiguous hugepage, a hugepage type
that is supported on arm64.

Fix this by overriding the default implementation to clear all the
entries associated with contiguous hugepages.

Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
Cc: David Woods <dwoods@mellanox.com>
---
 arch/arm64/include/asm/hugetlb.h |  6 +++++-
 arch/arm64/mm/hugetlbpage.c      | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index bbc1e35aa601..bb86f0741863 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -18,7 +18,6 @@
 #ifndef __ASM_HUGETLB_H
 #define __ASM_HUGETLB_H
 
-#include <asm-generic/hugetlb.h>
 #include <asm/page.h>
 
 static inline pte_t huge_ptep_get(pte_t *ptep)
@@ -82,5 +81,10 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
 				    unsigned long addr, pte_t *ptep);
 extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
 				  unsigned long addr, pte_t *ptep);
+extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+			   pte_t *ptep, unsigned long sz);
+#define huge_pte_clear huge_pte_clear
+
+#include <asm-generic/hugetlb.h>
 
 #endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index e9061704aec3..240b2fd53266 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -68,6 +68,30 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 	return CONT_PTES;
 }
 
+static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
+{
+	int contig_ptes = 0;
+
+	*pgsize = size;
+
+	switch (size) {
+	case PUD_SIZE:
+	case PMD_SIZE:
+		contig_ptes = 1;
+		break;
+	case CONT_PMD_SIZE:
+		*pgsize = PMD_SIZE;
+		contig_ptes = CONT_PMDS;
+		break;
+	case CONT_PTE_SIZE:
+		*pgsize = PAGE_SIZE;
+		contig_ptes = CONT_PTES;
+		break;
+	}
+
+	return contig_ptes;
+}
+
 /*
  * Changing some bits of contiguous entries requires us to follow a
  * Break-Before-Make approach, breaking the whole contiguous set
@@ -245,6 +269,18 @@ pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
 	return entry;
 }
 
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, unsigned long sz)
+{
+	int i, ncontig;
+	size_t pgsize;
+
+	ncontig = num_contig_ptes(sz, &pgsize);
+
+	for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+		pte_clear(mm, addr, ptep);
+}
+
 pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
 			      unsigned long addr, pte_t *ptep)
 {
-- 
2.11.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related


This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.