LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 07/13] mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an existing pte entry
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

set_pte_at() should not be used to set a pte entry at locations that
already holds a valid pte entry. Architectures like ppc64 don't do TLB
invalidate in set_pte_at() and hence expect it to be used to set locations
that are not a valid PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index de83a20c1b30..f9f6358899a8 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -79,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	/*
+	 * Architectures optimize set_pte_at by avoiding TLB flush.
+	 * This requires set_pte_at to be not used to update an
+	 * existing pte entry. Clear pte before we do set_pte_at
+	 */
+
 	pr_debug("Validating PTE advanced\n");
 	pte = pfn_pte(pfn, prot);
 	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_set_wrprotect(mm, vaddr, ptep);
 	pte = ptep_get(ptep);
 	WARN_ON(pte_write(pte));
-
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_get_and_clear(mm, vaddr, ptep);
 	pte = ptep_get(ptep);
 	WARN_ON(!pte_none(pte));
@@ -101,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 	ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
 	pte = ptep_get(ptep);
 	WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_get_and_clear_full(mm, vaddr, ptep, 1);
 	pte = ptep_get(ptep);
 	WARN_ON(!pte_none(pte));
 
+	pte = pfn_pte(pfn, prot);
 	pte = pte_mkyoung(pte);
 	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -169,9 +170,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmdp_set_wrprotect(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_write(pmd));
-
-	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
@@ -185,13 +183,11 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
-	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
 
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	pmd = pmd_mkyoung(pmd);
 	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_test_and_clear_young(vma, vaddr, pmdp);
@@ -293,18 +289,10 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-
-	pud = pud_mkhuge(pfn_pud(pfn, prot));
-	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_huge_get_and_clear(mm, vaddr, pudp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!pud_none(pud));
 
-	pud = pud_mkhuge(pfn_pud(pfn, prot));
-	set_pud_at(mm, vaddr, pudp, pud);
-	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
-	pud = READ_ONCE(*pudp);
-	WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 	pud = pud_mkhuge(pfn_pud(pfn, prot));
@@ -317,6 +305,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
+#ifndef __PAGETABLE_PMD_FOLDED
+	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	pud = pud_mkyoung(pud);
 	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_test_and_clear_young(vma, vaddr, pudp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 08/13] mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

Architectures like ppc64 use deposited page table while updating the huge pte
entries.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index f9f6358899a8..0ce5c6a24c5b 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -154,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 static void __init pmd_advanced_tests(struct mm_struct *mm,
 				      struct vm_area_struct *vma, pmd_t *pmdp,
 				      unsigned long pfn, unsigned long vaddr,
-				      pgprot_t prot)
+				      pgprot_t prot, pgtable_t pgtable)
 {
 	pmd_t pmd;
 
@@ -165,6 +165,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	/* Align the address wrt HPAGE_PMD_SIZE */
 	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
 
+	pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
 	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_set_wrprotect(mm, vaddr, pmdp);
@@ -193,6 +195,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmdp_test_and_clear_young(vma, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_young(pmd));
+
+	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 }
 
 static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -373,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pmd_advanced_tests(struct mm_struct *mm,
 				      struct vm_area_struct *vma, pmd_t *pmdp,
 				      unsigned long pfn, unsigned long vaddr,
-				      pgprot_t prot)
+				      pgprot_t prot, pgtable_t pgtable)
 {
 }
 static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -1015,7 +1019,7 @@ static int __init debug_vm_pgtable(void)
 	pgd_clear_tests(mm, pgdp);
 
 	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
 	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
 	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 10/13] mm/debug_vm_pgtable/locks: Take correct page table lock
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

Make sure we call pte accessors with correct lock held.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 78c8af3445ac..0a6e771ebd13 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1039,33 +1039,39 @@ static int __init debug_vm_pgtable(void)
 	pmd_thp_tests(pmd_aligned, prot);
 	pud_thp_tests(pud_aligned, prot);
 
+	hugetlb_basic_tests(pte_aligned, prot);
+
 	/*
 	 * Page table modifying tests
 	 */
-	pte_clear_tests(mm, ptep, vaddr);
-	pmd_clear_tests(mm, pmdp);
-	pud_clear_tests(mm, pudp);
-	p4d_clear_tests(mm, p4dp);
-	pgd_clear_tests(mm, pgdp);
 
 	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+	pte_clear_tests(mm, ptep, vaddr);
 	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
-	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
+	pte_unmap_unlock(ptep, ptl);
 
+	ptl = pmd_lock(mm, pmdp);
+	pmd_clear_tests(mm, pmdp);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
 	pmd_huge_tests(pmdp, pmd_aligned, prot);
+	pmd_populate_tests(mm, pmdp, saved_ptep);
+	spin_unlock(ptl);
+
+	ptl = pud_lock(mm, pudp);
+	pud_clear_tests(mm, pudp);
+	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
 	pud_huge_tests(pudp, pud_aligned, prot);
+	pud_populate_tests(mm, pudp, saved_pmdp);
+	spin_unlock(ptl);
 
-	pte_unmap_unlock(ptep, ptl);
+	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 
-	pmd_populate_tests(mm, pmdp, saved_ptep);
-	pud_populate_tests(mm, pudp, saved_pmdp);
+	spin_lock(&mm->page_table_lock);
+	p4d_clear_tests(mm, p4dp);
+	pgd_clear_tests(mm, pgdp);
 	p4d_populate_tests(mm, p4dp, saved_pudp);
 	pgd_populate_tests(mm, pgdp, saved_p4dp);
-
-	hugetlb_basic_tests(pte_aligned, prot);
+	spin_unlock(&mm->page_table_lock);
 
 	p4d_free(mm, saved_p4dp);
 	pud_free(mm, saved_pudp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 09/13] mm/debug_vm_pgtable/locks: Move non page table modifying test together
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

This will help in adding proper locks in a later patch

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 52 ++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 0ce5c6a24c5b..78c8af3445ac 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -992,7 +992,7 @@ static int __init debug_vm_pgtable(void)
 	p4dp = p4d_alloc(mm, pgdp, vaddr);
 	pudp = pud_alloc(mm, p4dp, vaddr);
 	pmdp = pmd_alloc(mm, pudp, vaddr);
-	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+	ptep = pte_alloc_map(mm, pmdp, vaddr);
 
 	/*
 	 * Save all the page table page addresses as the page table
@@ -1012,33 +1012,12 @@ static int __init debug_vm_pgtable(void)
 	p4d_basic_tests(p4d_aligned, prot);
 	pgd_basic_tests(pgd_aligned, prot);
 
-	pte_clear_tests(mm, ptep, vaddr);
-	pmd_clear_tests(mm, pmdp);
-	pud_clear_tests(mm, pudp);
-	p4d_clear_tests(mm, p4dp);
-	pgd_clear_tests(mm, pgdp);
-
-	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
-	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
 	pmd_leaf_tests(pmd_aligned, prot);
 	pud_leaf_tests(pud_aligned, prot);
 
-	pmd_huge_tests(pmdp, pmd_aligned, prot);
-	pud_huge_tests(pudp, pud_aligned, prot);
-
 	pte_savedwrite_tests(pte_aligned, protnone);
 	pmd_savedwrite_tests(pmd_aligned, protnone);
 
-	pte_unmap_unlock(ptep, ptl);
-
-	pmd_populate_tests(mm, pmdp, saved_ptep);
-	pud_populate_tests(mm, pudp, saved_pmdp);
-	p4d_populate_tests(mm, p4dp, saved_pudp);
-	pgd_populate_tests(mm, pgdp, saved_p4dp);
-
 	pte_special_tests(pte_aligned, prot);
 	pte_protnone_tests(pte_aligned, protnone);
 	pmd_protnone_tests(pmd_aligned, protnone);
@@ -1056,11 +1035,38 @@ static int __init debug_vm_pgtable(void)
 	pmd_swap_tests(pmd_aligned, prot);
 
 	swap_migration_tests();
-	hugetlb_basic_tests(pte_aligned, prot);
 
 	pmd_thp_tests(pmd_aligned, prot);
 	pud_thp_tests(pud_aligned, prot);
 
+	/*
+	 * Page table modifying tests
+	 */
+	pte_clear_tests(mm, ptep, vaddr);
+	pmd_clear_tests(mm, pmdp);
+	pud_clear_tests(mm, pudp);
+	p4d_clear_tests(mm, p4dp);
+	pgd_clear_tests(mm, pgdp);
+
+	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+
+	pmd_huge_tests(pmdp, pmd_aligned, prot);
+	pud_huge_tests(pudp, pud_aligned, prot);
+
+	pte_unmap_unlock(ptep, ptl);
+
+	pmd_populate_tests(mm, pmdp, saved_ptep);
+	pud_populate_tests(mm, pudp, saved_pmdp);
+	p4d_populate_tests(mm, p4dp, saved_pudp);
+	pgd_populate_tests(mm, pgdp, saved_p4dp);
+
+	hugetlb_basic_tests(pte_aligned, prot);
+
 	p4d_free(mm, saved_p4dp);
 	pud_free(mm, saved_pudp);
 	pmd_free(mm, saved_pmdp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 11/13] mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

pmd_clear() should not be used to clear pmd level pte entries.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 0a6e771ebd13..a188b6e4e37e 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_young(pmd));
 
+	/*  Clear the pte entries  */
+	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
 	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 }
 
@@ -321,6 +323,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	pudp_test_and_clear_young(vma, vaddr, pudp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(pud_young(pud));
+
+	pudp_huge_get_and_clear(mm, vaddr, pudp);
 }
 
 static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -444,8 +448,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as pud_bad().
 	 */
-	pmd_clear(pmdp);
-	pud_clear(pudp);
 	pud_populate(mm, pudp, pmdp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(pud_bad(pud));
@@ -577,7 +579,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as pmd_bad().
 	 */
-	pmd_clear(pmdp);
 	pmd_populate(mm, pmdp, pgtable);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_bad(pmd));
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

The seems to be missing quite a lot of details w.r.t allocating
the correct pgtable_t page (huge_pte_alloc()), holding the right
lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.

ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these.
Hence disable the test on ppc64.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a188b6e4e37e..21329c7d672f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -813,6 +813,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static void __init hugetlb_advanced_tests(struct mm_struct *mm,
 					  struct vm_area_struct *vma,
 					  pte_t *ptep, unsigned long pfn,
@@ -855,6 +856,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct *mm,
 	pte = huge_ptep_get(ptep);
 	WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
 }
+#endif
 #else  /* !CONFIG_HUGETLB_PAGE */
 static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init hugetlb_advanced_tests(struct mm_struct *mm,
@@ -1065,7 +1067,9 @@ static int __init debug_vm_pgtable(void)
 	pud_populate_tests(mm, pudp, saved_pmdp);
 	spin_unlock(ptl);
 
+#ifndef CONFIG_PPC_BOOK3S_64
 	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+#endif
 
 	spin_lock(&mm->page_table_lock);
 	p4d_clear_tests(mm, p4dp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 13/13] mm/debug_vm_pgtable: populate a pte entry before fetching it
From: Aneesh Kumar K.V @ 2020-08-27  8:04 UTC (permalink / raw)
  To: linux-mm, akpm
  Cc: linux-arch, linux-s390, Anshuman Khandual, Aneesh Kumar K.V, x86,
	Mike Rapoport, Qian Cai, Gerald Schaefer, Christophe Leroy,
	Vineet Gupta, linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-1-aneesh.kumar@linux.ibm.com>

pte_clear_tests operate on an existing pte entry. Make sure that is not a none
pte entry.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 21329c7d672f..8527ebb75f2c 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -546,7 +546,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
 static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
 				   unsigned long vaddr)
 {
-	pte_t pte = ptep_get(ptep);
+	pte_t pte =  ptep_get_and_clear(mm, vaddr, ptep);
 
 	pr_debug("Validating PTE clear\n");
 	pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
@@ -944,7 +944,7 @@ static int __init debug_vm_pgtable(void)
 	p4d_t *p4dp, *saved_p4dp;
 	pud_t *pudp, *saved_pudp;
 	pmd_t *pmdp, *saved_pmdp, pmd;
-	pte_t *ptep;
+	pte_t *ptep, pte;
 	pgtable_t saved_ptep;
 	pgprot_t prot, protnone;
 	phys_addr_t paddr;
@@ -1049,6 +1049,8 @@ static int __init debug_vm_pgtable(void)
 	 */
 
 	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+	pte = pfn_pte(pte_aligned, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
 	pte_clear_tests(mm, ptep, vaddr);
 	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
 	pte_unmap_unlock(ptep, ptl);
-- 
2.26.2


^ permalink raw reply related

* Re: kernel since 5.6 do not boot anymore on Apple PowerBook
From: Giuseppe Sacco @ 2020-08-27  7:46 UTC (permalink / raw)
  To: linuxppc-dev, Christophe Leroy
In-Reply-To: <0a18fc199cef2643bd07591205a6234c2edf6c95.camel@sguazz.it>

Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto:
> Hello Christophe,
> 
> Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha
> scritto:
> [...]
> > If there is no warning, then the issue is something else, bad luck.
> > 
> > Could you increase the loglevel and try again both with and without 
> > VMAP_STACK ? Maybe we'll get more information on where it stops.
> 
> The problem is related to the CPU frequency changes. This is where the
> system stop: cpufreq get the CPU frequency limits and then start the
> default governor (performance) and then calls function
> cpufreq_gov_performance_limits() that never returns.
> 
> Rebuilding after enabling pr_debug for cpufreq.c, I've got some more
> lines of output:
> 
> cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz
> cpufreq: new min and max freqs are 667000 - 867000 kHz
> cpufreq: governor switch
> cpufreq: cpufreq_init_governor: for CPU 0
> cpufreq: cpufreq_start_governor: for CPU 0
> cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz
> cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000
> cpufreq: notification 0 of frequency transition to 867000 kHz
> cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 kHz
> 
> no more lines are printed. I think this output only refers to the
> notification sent prior to the frequency change.
> 
> I was thinking that selecting a governor that would run at 667mHz would
> probably skip the problem. I added cpufreq.default_governor=powersave
> to the command line parameters but it did not work: the selected
> governor was still performance and the system crashed.

I kept following the thread and found that CPU frequency is changed in
function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c.
As first thing, the function calls the macro preempt_disable() and this
is where it stops.

Bye,
Giuseppe


^ permalink raw reply

* Re: kernel since 5.6 do not boot anymore on Apple PowerBook
From: Giuseppe Sacco @ 2020-08-27  8:28 UTC (permalink / raw)
  To: linuxppc-dev, Christophe Leroy
In-Reply-To: <afd75c134e2c4a57f8cf1f064595455e67b17e41.camel@sguazz.it>

Il giorno gio, 27/08/2020 alle 09.46 +0200, Giuseppe Sacco ha scritto:
> Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto:
> > Hello Christophe,
> > 
> > Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha
> > scritto:
> > [...]
> > > If there is no warning, then the issue is something else, bad luck.
> > > 
> > > Could you increase the loglevel and try again both with and without 
> > > VMAP_STACK ? Maybe we'll get more information on where it stops.
> > 
> > The problem is related to the CPU frequency changes. This is where the
> > system stop: cpufreq get the CPU frequency limits and then start the
> > default governor (performance) and then calls function
> > cpufreq_gov_performance_limits() that never returns.
> > 
> > Rebuilding after enabling pr_debug for cpufreq.c, I've got some more
> > lines of output:
> > 
> > cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz
> > cpufreq: new min and max freqs are 667000 - 867000 kHz
> > cpufreq: governor switch
> > cpufreq: cpufreq_init_governor: for CPU 0
> > cpufreq: cpufreq_start_governor: for CPU 0
> > cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz
> > cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000
> > cpufreq: notification 0 of frequency transition to 867000 kHz
> > cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 kHz
> > 
> > no more lines are printed. I think this output only refers to the
> > notification sent prior to the frequency change.
> > 
> > I was thinking that selecting a governor that would run at 667mHz would
> > probably skip the problem. I added cpufreq.default_governor=powersave
> > to the command line parameters but it did not work: the selected
> > governor was still performance and the system crashed.
> 
> I kept following the thread and found that CPU frequency is changed in
> function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c.
> As first thing, the function calls the macro preempt_disable() and this
> is where it stops.

Sorry, I made a mistake. The real problem is down, on the same
function, when it calls low_sleep_handler(). This is where the problem
probably is.

Bye,
Giuseppe


^ permalink raw reply

* Re: [PATCHv5 1/2] powerpc/pseries: group lmb operation and memblock's
From: Laurent Dufour @ 2020-08-27  8:53 UTC (permalink / raw)
  To: Pingfan Liu, linuxppc-dev
  Cc: Nathan Lynch, kexec, Hari Bathini, Nathan Fontenot
In-Reply-To: <1597049570-19536-1-git-send-email-kernelfans@gmail.com>

Le 10/08/2020 à 10:52, Pingfan Liu a écrit :
> This patch prepares for the incoming patch which swaps the order of
> KOBJ_ADD/REMOVE uevent and dt's updating.
> 
> The dt updating should come after lmb operations, and before
> __remove_memory()/__add_memory().  Accordingly, grouping all lmb operations
> before the memblock's.

I can't find the link between this commit description and the code's changes below.

> 
> Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Hari Bathini <hbathini@linux.ibm.com>
> Cc: Nathan Lynch <nathanl@linux.ibm.com>
> Cc: Nathan Fontenot <nfont@linux.vnet.ibm.com>
> Cc: Laurent Dufour <ldufour@linux.ibm.com>
> To: linuxppc-dev@lists.ozlabs.org
> Cc: kexec@lists.infradead.org
> ---
> v4 -> v5: fix the miss of clearing DRCONF_MEM_ASSIGNED in a failure path
>   arch/powerpc/platforms/pseries/hotplug-memory.c | 28 +++++++++++++++++--------
>   1 file changed, 19 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 5d545b7..46cbcd1 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *);
>   static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>   {
>   	unsigned long block_sz;
> -	int rc;
> +	phys_addr_t base_addr;
> +	int rc, nid;
>   
>   	if (!lmb_is_removable(lmb))
>   		return -EINVAL;
> @@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>   	if (rc)
>   		return rc;
>   
> +	base_addr = lmb->base_addr;
> +	nid = lmb->nid;
>   	block_sz = pseries_memory_block_size();
>   
> -	__remove_memory(lmb->nid, lmb->base_addr, block_sz);
> -
> -	/* Update memory regions for memory remove */
> -	memblock_remove(lmb->base_addr, block_sz);
> -
>   	invalidate_lmb_associativity_index(lmb);
>   	lmb_clear_nid(lmb);
>   	lmb->flags &= ~DRCONF_MEM_ASSIGNED;
>   
> +	__remove_memory(nid, base_addr, block_sz);
> +
> +	/* Update memory regions for memory remove */
> +	memblock_remove(base_addr, block_sz);
> +
>   	return 0;
>   }
>   
> @@ -603,22 +606,29 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
>   	}
>   
>   	lmb_set_nid(lmb);
> +	lmb->flags |= DRCONF_MEM_ASSIGNED;
> +
>   	block_sz = memory_block_size_bytes();
>   
>   	/* Add the memory */
>   	rc = __add_memory(lmb->nid, lmb->base_addr, block_sz);
>   	if (rc) {
>   		invalidate_lmb_associativity_index(lmb);
> +		lmb_clear_nid(lmb);
> +		lmb->flags &= ~DRCONF_MEM_ASSIGNED;
>   		return rc;
>   	}
>   
>   	rc = dlpar_online_lmb(lmb);
>   	if (rc) {
> -		__remove_memory(lmb->nid, lmb->base_addr, block_sz);
> +		int nid = lmb->nid;
> +		phys_addr_t base_addr = lmb->base_addr;
> +
>   		invalidate_lmb_associativity_index(lmb);
>   		lmb_clear_nid(lmb);
> -	} else {
> -		lmb->flags |= DRCONF_MEM_ASSIGNED;
> +		lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> +
> +		__remove_memory(nid, base_addr, block_sz);
>   	}
>   
>   	return rc;
> 


^ permalink raw reply

* Re: [PATCH 09/11] x86: remove address space overrides using set_fs()
From: 'Christoph Hellwig' @ 2020-08-27  9:37 UTC (permalink / raw)
  To: David Laight
  Cc: linux-arch@vger.kernel.org, Kees Cook, x86@kernel.org,
	linux-kernel@vger.kernel.org, Al Viro,
	linux-fsdevel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	'Christoph Hellwig'
In-Reply-To: <935d551809894d14965e430e05d21057@AcuMS.aculab.com>

On Mon, Aug 17, 2020 at 08:23:11AM +0000, David Laight wrote:
> From: Christoph Hellwig
> > Sent: 17 August 2020 08:32
> >
> > Stop providing the possibility to override the address space using
> > set_fs() now that there is no need for that any more.  To properly
> > handle the TASK_SIZE_MAX checking for 4 vs 5-level page tables on
> > x86 a new alternative is introduced, which just like the one in
> > entry_64.S has to use the hardcoded virtual address bits to escape
> > the fact that TASK_SIZE_MAX isn't actually a constant when 5-level
> > page tables are enabled.
> ....
> > @@ -93,7 +69,7 @@ static inline bool pagefault_disabled(void);
> >  #define access_ok(addr, size)					\
> >  ({									\
> >  	WARN_ON_IN_IRQ();						\
> > -	likely(!__range_not_ok(addr, size, user_addr_max()));		\
> > +	likely(!__range_not_ok(addr, size, TASK_SIZE_MAX));		\
> >  })
> 
> Can't that always compare against a constant even when 5-levl
> page tables are enabled on x86-64?
> 
> On x86-64 it can (probably) reduce to (addr | (addr + size)) < 0.

I'll leave that to the x86 maintainers as a future cleanup if wanted.

^ permalink raw reply

* Re: [PATCH v3 3/6] Add LKDTM test to hijack a patch mapping (powerpc, x86_64)
From: kernel test robot @ 2020-08-27 10:11 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev; +Cc: kbuild-all, kernel-hardening
In-Reply-To: <20200827052659.24922-4-cmr@codefail.de>

[-- Attachment #1: Type: text/plain, Size: 4191 bytes --]

Hi "Christopher,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on char-misc/char-misc-testing tip/x86/core v5.9-rc2 next-20200827]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200827-161532
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   drivers/misc/lkdtm/perms.c: In function 'lkdtm_HIJACK_PATCH':
>> drivers/misc/lkdtm/perms.c:318:38: error: implicit declaration of function 'read_cpu_patching_addr' [-Werror=implicit-function-declaration]
     318 |  addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
         |                                      ^~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

# https://github.com/0day-ci/linux/commit/36a98d779ee4620e6e091cbe3b438b52faa108ad
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Christopher-M-Riedl/Use-per-CPU-temporary-mappings-for-patching/20200827-161532
git checkout 36a98d779ee4620e6e091cbe3b438b52faa108ad
vim +/read_cpu_patching_addr +318 drivers/misc/lkdtm/perms.c

   289	
   290	void lkdtm_HIJACK_PATCH(void)
   291	{
   292	#ifdef CONFIG_PPC
   293		struct ppc_inst original_insn = ppc_inst_read(READ_ONCE(patch_site));
   294	#endif
   295	#ifdef CONFIG_X86_64
   296		int original_insn = READ_ONCE(*patch_site);
   297	#endif
   298		struct task_struct *patching_kthrd;
   299		int patching_cpu, hijacker_cpu, attempts;
   300		unsigned long addr;
   301		bool hijacked;
   302		const int bad_data = 0xbad00bad;
   303	
   304		if (num_online_cpus() < 2) {
   305			pr_warn("need at least two cpus\n");
   306			return;
   307		}
   308	
   309		hijacker_cpu = smp_processor_id();
   310		patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu);
   311	
   312		patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL,
   313							cpu_to_node(patching_cpu),
   314							"lkdtm_patching_cpu");
   315		kthread_bind(patching_kthrd, patching_cpu);
   316		wake_up_process(patching_kthrd);
   317	
 > 318		addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
   319	
   320		pr_info("starting hijacker_cpu=%d\n", hijacker_cpu);
   321		for (attempts = 0; attempts < 100000; ++attempts) {
   322			/* Use __put_user to catch faults without an Oops */
   323			hijacked = !__put_user(bad_data, (int *)addr);
   324	
   325			if (hijacked) {
   326				if (kthread_stop(patching_kthrd))
   327					pr_err("error trying to stop patching thread\n");
   328				break;
   329			}
   330		}
   331		pr_info("hijack attempts: %d\n", attempts);
   332	
   333		if (hijacked) {
   334			if (lkdtm_verify_patch(bad_data))
   335				pr_err("overwrote kernel text\n");
   336			/*
   337			 * There are window conditions where the hijacker cpu manages to
   338			 * write to the patch site but the site gets overwritten again by
   339			 * the patching cpu. We still consider that a "successful" hijack
   340			 * since the hijacker cpu did not fault on the write.
   341			 */
   342			pr_err("FAIL: wrote to another cpu's patching area\n");
   343		} else {
   344			kthread_stop(patching_kthrd);
   345		}
   346	
   347		/* Restore the original insn for any future lkdtm tests */
   348	#ifdef CONFIG_PPC
   349		patch_instruction(patch_site, original_insn);
   350	#endif
   351	#ifdef CONFIG_X86_64
   352		lkdtm_do_patch(original_insn);
   353	#endif
   354	}
   355	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 76556 bytes --]

^ permalink raw reply

* Re: kernel since 5.6 do not boot anymore on Apple PowerBook
From: Christophe Leroy @ 2020-08-27 10:39 UTC (permalink / raw)
  To: Giuseppe Sacco, linuxppc-dev
In-Reply-To: <cab15033beeefa317aae40370664e108f57dc050.camel@sguazz.it>

Hi,

Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit :
> Il giorno gio, 27/08/2020 alle 09.46 +0200, Giuseppe Sacco ha scritto:
>> Il giorno gio, 27/08/2020 alle 00.28 +0200, Giuseppe Sacco ha scritto:
>>> Hello Christophe,
>>>
>>> Il giorno mer, 26/08/2020 alle 15.53 +0200, Christophe Leroy ha
>>> scritto:
>>> [...]
>>>> If there is no warning, then the issue is something else, bad luck.
>>>>
>>>> Could you increase the loglevel and try again both with and without
>>>> VMAP_STACK ? Maybe we'll get more information on where it stops.
>>>
>>> The problem is related to the CPU frequency changes. This is where the
>>> system stop: cpufreq get the CPU frequency limits and then start the
>>> default governor (performance) and then calls function
>>> cpufreq_gov_performance_limits() that never returns.
>>>
>>> Rebuilding after enabling pr_debug for cpufreq.c, I've got some more
>>> lines of output:
>>>
>>> cpufreq: setting new policy for CPU 0: 667000 - 867000 kHz
>>> cpufreq: new min and max freqs are 667000 - 867000 kHz
>>> cpufreq: governor switch
>>> cpufreq: cpufreq_init_governor: for CPU 0
>>> cpufreq: cpufreq_start_governor: for CPU 0
>>> cpufreq: target for CPU 0: 867000 kHz, relation 1, requested 867000 kHz
>>> cpufreq: __target_index: cpu: 0, oldfreq: 667000, new freq: 867000
>>> cpufreq: notification 0 of frequency transition to 867000 kHz
>>> cpufreq: saving 133328 as reference value for loops_per_jiffy; freq is 667000 kHz
>>>
>>> no more lines are printed. I think this output only refers to the
>>> notification sent prior to the frequency change.
>>>
>>> I was thinking that selecting a governor that would run at 667mHz would
>>> probably skip the problem. I added cpufreq.default_governor=powersave
>>> to the command line parameters but it did not work: the selected
>>> governor was still performance and the system crashed.
>>
>> I kept following the thread and found that CPU frequency is changed in
>> function pmu_set_cpu_speed() in file drivers/cpufreq/pmac32-cpufreq.c.
>> As first thing, the function calls the macro preempt_disable() and this
>> is where it stops.
> 
> Sorry, I made a mistake. The real problem is down, on the same
> function, when it calls low_sleep_handler(). This is where the problem
> probably is.
> 


Great, you spotted the problem.

I see what it is, it is in low_sleep_handler() in 
arch/powerpc/platforms/powermac/sleep.S

All critical registers are saved on the stack. At restore, they are 
restore BEFORE re-enabling MMU (because they are needed for that). But 
when we have VMAP_STACK, the stack can hardly be accessed without the 
MMU enabled. tophys() doesn't work for virtual stack addresses.

Therefore, the low_sleep_handler() has to be reworked for using an area 
in the linear mem instead of the stack.

Christophe

^ permalink raw reply

* Re: [PATCH 2/2] ASoC: fsl: imx-es8328: add missing put_device() call in imx_es8328_probe()
From: Marco Felsch @ 2020-08-27  9:17 UTC (permalink / raw)
  To: Yu Kuai
  Cc: linux-arm-kernel, alsa-devel, linuxppc-dev, timur, Xiubo.Lee,
	yi.zhang, festevam, s.hauer, tiwai, lgirdwood, perex,
	nicoleotsuka, broonie, linux-imx, kernel, yukuai, shawnguo, xobs,
	shengjiu.wang, linux-kernel
In-Reply-To: <20200825120531.1479304-3-yukuai3@huawei.com>

On 20-08-25 20:05, Yu Kuai wrote:
> if of_find_device_by_node() succeed, imx_es8328_probe() doesn't have
> a corresponding put_device().

Why do we need the ssi_pdev reference here at all?

Regards,
  Marco

^ permalink raw reply

* Re: [PATCH v3 13/13] mm/debug_vm_pgtable: populate a pte entry before fetching it
From: kernel test robot @ 2020-08-27 12:17 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm
  Cc: linux-arch, linux-s390, kbuild-all, Anshuman Khandual, x86,
	linux-snps-arc, linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200827080438.315345-14-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 3593 bytes --]

Hi "Aneesh,

I love your patch! Perhaps something to improve:

[auto build test WARNING on hnaz-linux-mm/master]
[also build test WARNING on powerpc/next v5.9-rc2 next-20200827]
[cannot apply to mmotm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Aneesh-Kumar-K-V/mm-debug_vm_pgtable-fixes/20200827-160758
base:   https://github.com/hnaz/linux-mm master
config: x86_64-randconfig-s022-20200827 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.2-191-g10164920-dirty
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


sparse warnings: (new ones prefixed by >>)

   mm/debug_vm_pgtable.c:509:9: sparse: sparse: incompatible types in conditional expression (different base types):
   mm/debug_vm_pgtable.c:509:9: sparse:    void
   mm/debug_vm_pgtable.c:509:9: sparse:    int
   mm/debug_vm_pgtable.c:528:9: sparse: sparse: incompatible types in conditional expression (different base types):
   mm/debug_vm_pgtable.c:528:9: sparse:    void
   mm/debug_vm_pgtable.c:528:9: sparse:    int
   mm/debug_vm_pgtable.c: note: in included file (through include/linux/pgtable.h, include/linux/mm.h, include/linux/highmem.h):
>> arch/x86/include/asm/pgtable.h:587:27: sparse: sparse: context imbalance in 'debug_vm_pgtable' - unexpected unlock

# https://github.com/0day-ci/linux/commit/9370726f47eaffdf772fdc273d180ec03b245cca
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Aneesh-Kumar-K-V/mm-debug_vm_pgtable-fixes/20200827-160758
git checkout 9370726f47eaffdf772fdc273d180ec03b245cca
vim +/debug_vm_pgtable +587 arch/x86/include/asm/pgtable.h

b534816b552d35 Jeremy Fitzhardinge 2009-02-04  586  
fb43d6cb91ef57 Dave Hansen         2018-04-06 @587  static inline pgprotval_t check_pgprot(pgprot_t pgprot)
fb43d6cb91ef57 Dave Hansen         2018-04-06  588  {
fb43d6cb91ef57 Dave Hansen         2018-04-06  589  	pgprotval_t massaged_val = massage_pgprot(pgprot);
fb43d6cb91ef57 Dave Hansen         2018-04-06  590  
fb43d6cb91ef57 Dave Hansen         2018-04-06  591  	/* mmdebug.h can not be included here because of dependencies */
fb43d6cb91ef57 Dave Hansen         2018-04-06  592  #ifdef CONFIG_DEBUG_VM
fb43d6cb91ef57 Dave Hansen         2018-04-06  593  	WARN_ONCE(pgprot_val(pgprot) != massaged_val,
fb43d6cb91ef57 Dave Hansen         2018-04-06  594  		  "attempted to set unsupported pgprot: %016llx "
fb43d6cb91ef57 Dave Hansen         2018-04-06  595  		  "bits: %016llx supported: %016llx\n",
fb43d6cb91ef57 Dave Hansen         2018-04-06  596  		  (u64)pgprot_val(pgprot),
fb43d6cb91ef57 Dave Hansen         2018-04-06  597  		  (u64)pgprot_val(pgprot) ^ massaged_val,
fb43d6cb91ef57 Dave Hansen         2018-04-06  598  		  (u64)__supported_pte_mask);
fb43d6cb91ef57 Dave Hansen         2018-04-06  599  #endif
fb43d6cb91ef57 Dave Hansen         2018-04-06  600  
fb43d6cb91ef57 Dave Hansen         2018-04-06  601  	return massaged_val;
fb43d6cb91ef57 Dave Hansen         2018-04-06  602  }
fb43d6cb91ef57 Dave Hansen         2018-04-06  603  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 30318 bytes --]

^ permalink raw reply

* Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
From: Michael Ellerman @ 2020-08-27 13:19 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <04a968f6-88c0-0603-43aa-202196a68df2@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> On 08/26/2020 02:58 PM, Michael Ellerman wrote:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
>>> index daef14a284a3..bbb69832fd46 100644
>>> --- a/arch/powerpc/kernel/vdso.c
>>> +++ b/arch/powerpc/kernel/vdso.c
>>> @@ -718,16 +710,14 @@ static int __init vdso_init(void)
>> ...
>>>   
>>> -
>>> -#ifdef CONFIG_VDSO32
>>>   	vdso32_kbase = &vdso32_start;
>>>   
>>>   	/*
>>> @@ -735,8 +725,6 @@ static int __init vdso_init(void)
>>>   	 */
>>>   	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
>>>   	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
>>> -#endif
>> 
>> This didn't build for ppc64le:
>> 
>>    /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end'
>>    /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start'
>>    make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1
>>    make: *** [Makefile:185: __sub-make] Error 2
>> 
>> So I just put that ifdef back.
>> 
>
> The problem is because is_32bit() can still return true even when 
> CONFIG_VDSO32 is not set.

Hmm, you're right. My config had CONFIG_COMPAT enabled.

But that seems like a bug, if someone enables COMPAT on ppc64le they are
almost certainly going to want VDSO32 as well.

So I think I'll do a lead up patch as below.

cheers

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index d4fd109f177e..cf2da1e401ef 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -501,13 +501,12 @@ endmenu
 
 config VDSO32
 	def_bool y
-	depends on PPC32 || CPU_BIG_ENDIAN
+	depends on PPC32 || COMPAT
 	help
 	  This symbol controls whether we build the 32-bit VDSO. We obviously
 	  want to do that if we're building a 32-bit kernel. If we're building
-	  a 64-bit kernel then we only want a 32-bit VDSO if we're building for
-	  big endian. That is because the only little endian configuration we
-	  support is ppc64le which is 64-bit only.
+	  a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling
+	  COMPAT.
 
 choice
 	prompt "Endianness selection"


^ permalink raw reply related

* Re: [PATCH v3 4/6] powerpc: Introduce temporary mm
From: Jann Horn @ 2020-08-27 14:15 UTC (permalink / raw)
  To: Christopher M. Riedl; +Cc: linuxppc-dev, Kernel Hardening
In-Reply-To: <20200827052659.24922-5-cmr@codefail.de>

On Thu, Aug 27, 2020 at 7:24 AM Christopher M. Riedl <cmr@codefail.de> wrote:
> x86 supports the notion of a temporary mm which restricts access to
> temporary PTEs to a single CPU. A temporary mm is useful for situations
> where a CPU needs to perform sensitive operations (such as patching a
> STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
> said mappings to other CPUs. A side benefit is that other CPU TLBs do
> not need to be flushed when the temporary mm is torn down.
>
> Mappings in the temporary mm can be set in the userspace portion of the
> address-space.
[...]
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
[...]
> @@ -44,6 +45,70 @@ int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
>  }
>
>  #ifdef CONFIG_STRICT_KERNEL_RWX
> +
> +struct temp_mm {
> +       struct mm_struct *temp;
> +       struct mm_struct *prev;
> +       bool is_kernel_thread;
> +       struct arch_hw_breakpoint brk[HBP_NUM_MAX];
> +};
> +
> +static inline void init_temp_mm(struct temp_mm *temp_mm, struct mm_struct *mm)
> +{
> +       temp_mm->temp = mm;
> +       temp_mm->prev = NULL;
> +       temp_mm->is_kernel_thread = false;
> +       memset(&temp_mm->brk, 0, sizeof(temp_mm->brk));
> +}
> +
> +static inline void use_temporary_mm(struct temp_mm *temp_mm)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       temp_mm->is_kernel_thread = current->mm == NULL;

(That's a somewhat misleading variable name - kernel threads can have
a non-NULL ->mm, too.)

> +       if (temp_mm->is_kernel_thread)
> +               temp_mm->prev = current->active_mm;
> +       else
> +               temp_mm->prev = current->mm;

Why the branch? Shouldn't current->active_mm work in both cases?


> +       /*
> +        * Hash requires a non-NULL current->mm to allocate a userspace address
> +        * when handling a page fault. Does not appear to hurt in Radix either.
> +        */
> +       current->mm = temp_mm->temp;

This looks dangerous to me. There are various places that attempt to
find all userspace tasks that use a given mm by iterating through all
tasks on the system and comparing each task's ->mm pointer to
current's. Things like current_is_single_threaded() as part of various
security checks, mm_update_next_owner(), zap_threads(), and so on. So
if this is reachable from userspace task context (which I think it
is?), I don't think we're allowed to switch out the ->mm pointer here.


> +       switch_mm_irqs_off(NULL, temp_mm->temp, current);

switch_mm_irqs_off() calls switch_mmu_context(), which in the nohash
implementation increments next->context.active and decrements
prev->context.active if prev is non-NULL, right? So this would
increase temp_mm->temp->context.active...

> +       if (ppc_breakpoint_available()) {
> +               struct arch_hw_breakpoint null_brk = {0};
> +               int i = 0;
> +
> +               for (; i < nr_wp_slots(); ++i) {
> +                       __get_breakpoint(i, &temp_mm->brk[i]);
> +                       if (temp_mm->brk[i].type != 0)
> +                               __set_breakpoint(i, &null_brk);
> +               }
> +       }
> +}
> +
> +static inline void unuse_temporary_mm(struct temp_mm *temp_mm)
> +{
> +       lockdep_assert_irqs_disabled();
> +
> +       if (temp_mm->is_kernel_thread)
> +               current->mm = NULL;
> +       else
> +               current->mm = temp_mm->prev;
> +       switch_mm_irqs_off(NULL, temp_mm->prev, current);

... whereas this would increase temp_mm->prev->context.active. As far
as I can tell, that'll mean that both the original mm and the patching
mm will have their .active counts permanently too high after
use_temporary_mm()+unuse_temporary_mm()?

> +       if (ppc_breakpoint_available()) {
> +               int i = 0;
> +
> +               for (; i < nr_wp_slots(); ++i)
> +                       if (temp_mm->brk[i].type != 0)
> +                               __set_breakpoint(i, &temp_mm->brk[i]);
> +       }
> +}

^ permalink raw reply

* Re: kernel since 5.6 do not boot anymore on Apple PowerBook
From: Giuseppe Sacco @ 2020-08-27 14:37 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <afae7efd-0d8a-5672-7b75-9394b0ff3d3c@csgroup.eu>

Il giorno gio, 27/08/2020 alle 12.39 +0200, Christophe Leroy ha
scritto:
> Hi,
> 
> Le 27/08/2020 à 10:28, Giuseppe Sacco a écrit :
[...]
> > Sorry, I made a mistake. The real problem is down, on the same
> > function, when it calls low_sleep_handler(). This is where the problem
> > probably is.
> 
> Great, you spotted the problem.
> 
> I see what it is, it is in low_sleep_handler() in 
> arch/powerpc/platforms/powermac/sleep.S
> 
> All critical registers are saved on the stack. At restore, they are 
> restore BEFORE re-enabling MMU (because they are needed for that). But 
> when we have VMAP_STACK, the stack can hardly be accessed without the 
> MMU enabled. tophys() doesn't work for virtual stack addresses.
> 
> Therefore, the low_sleep_handler() has to be reworked for using an area 
> in the linear mem instead of the stack.

I am sorry, but I don't know how to fix it. Should I open a bug for
tracking this problem?

Thank you,
Giuseppe


^ permalink raw reply

* [PATCH 01/10] fs: don't allow kernel reads and writes without iter ops
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

Don't allow calling ->read or ->write with set_fs as a preparation for
killing off set_fs.  All the instances that we use kernel_read/write on
are using the iter ops already.

If a file has both the regular ->read/->write methods and the iter
variants those could have different semantics for messed up enough
drivers.  Also fails the kernel access to them in that case.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 fs/read_write.c | 67 +++++++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 25 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c78d0dd..702c4301d9eb6b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -419,27 +419,41 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
 	return ret;
 }
 
+static int warn_unsupported(struct file *file, const char *op)
+{
+	pr_warn_ratelimited(
+		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+		op, file, current->pid, current->comm);
+	return -EINVAL;
+}
+
 ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
 {
-	mm_segment_t old_fs = get_fs();
+	struct kvec iov = {
+		.iov_base	= buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct kiocb kiocb;
+	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
 		return -EINVAL;
 	if (!(file->f_mode & FMODE_CAN_READ))
 		return -EINVAL;
+	/*
+	 * Also fail if ->read_iter and ->read are both wired up as that
+	 * implies very convoluted semantics.
+	 */
+	if (unlikely(!file->f_op->read_iter || file->f_op->read))
+		return warn_unsupported(file, "read");
 
-	if (count > MAX_RW_COUNT)
-		count =  MAX_RW_COUNT;
-	set_fs(KERNEL_DS);
-	if (file->f_op->read)
-		ret = file->f_op->read(file, (void __user *)buf, count, pos);
-	else if (file->f_op->read_iter)
-		ret = new_sync_read(file, (void __user *)buf, count, pos);
-	else
-		ret = -EINVAL;
-	set_fs(old_fs);
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *pos;
+	iov_iter_kvec(&iter, READ, &iov, 1, iov.iov_len);
+	ret = file->f_op->read_iter(&kiocb, &iter);
 	if (ret > 0) {
+		*pos = kiocb.ki_pos;
 		fsnotify_access(file);
 		add_rchar(current, ret);
 	}
@@ -510,28 +524,31 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 /* caller is responsible for file_start_write/file_end_write */
 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
 {
-	mm_segment_t old_fs;
-	const char __user *p;
+	struct kvec iov = {
+		.iov_base	= (void *)buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct kiocb kiocb;
+	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 	if (!(file->f_mode & FMODE_CAN_WRITE))
 		return -EINVAL;
+	/*
+	 * Also fail if ->write_iter and ->write are both wired up as that
+	 * implies very convoluted semantics.
+	 */
+	if (unlikely(!file->f_op->write_iter || file->f_op->write))
+		return warn_unsupported(file, "write");
 
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	p = (__force const char __user *)buf;
-	if (count > MAX_RW_COUNT)
-		count =  MAX_RW_COUNT;
-	if (file->f_op->write)
-		ret = file->f_op->write(file, p, count, pos);
-	else if (file->f_op->write_iter)
-		ret = new_sync_write(file, p, count, pos);
-	else
-		ret = -EINVAL;
-	set_fs(old_fs);
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *pos;
+	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+	ret = file->f_op->write_iter(&kiocb, &iter);
 	if (ret > 0) {
+		*pos = kiocb.ki_pos;
 		fsnotify_modify(file);
 		add_wchar(current, ret);
 	}
-- 
2.28.0


^ permalink raw reply related

* remove the last set_fs() in common code, and remove it for x86 and powerpc v2
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel

Hi all,

this series removes the last set_fs() used to force a kernel address
space for the uaccess code in the kernel read/write/splice code, and then
stops implementing the address space overrides entirely for x86 and
powerpc.

The file system part has been posted a few times, and the read/write side
has been pretty much unchanced.  For splice this series drops the
conversion of the seq_file and sysctl code to the iter ops, and thus loses
the splice support for them.  The reasons for that is that it caused a lot
of churn for not much use - splice for these small files really isn't much
of a win, even if existing userspace uses it.  All callers I found do the
proper fallback, but if this turns out to be an issue the conversion can
be resurrected.

Besides x86 and powerpc I plan to eventually convert all other
architectures, although this will be a slow process, starting with the
easier ones once the infrastructure is merged.  The process to convert
architectures is roughtly:

 (1) ensure there is no set_fs(KERNEL_DS) left in arch specific code
 (2) implement __get_kernel_nofault and __put_kernel_nofault
 (3) remove the arch specific address limitation functionality

Changes since v1:
 - drop the patch to remove the non-iter ops for /dev/zero and
   /dev/null as they caused a performance regression
 - don't enable user access in __get_kernel on powerpc
 - xfail the set_fs() based lkdtm tests

Diffstat:

^ permalink raw reply

* [PATCH 03/10] uaccess: add infrastructure for kernel builds with set_fs()
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

Add a CONFIG_SET_FS option that is selected by architecturess that
implement set_fs, which is all of them initially.  If the option is not
set stubs for routines related to overriding the address space are
provided so that architectures can start to opt out of providing set_fs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig            |  3 +++
 arch/alpha/Kconfig      |  1 +
 arch/arc/Kconfig        |  1 +
 arch/arm/Kconfig        |  1 +
 arch/arm64/Kconfig      |  1 +
 arch/c6x/Kconfig        |  1 +
 arch/csky/Kconfig       |  1 +
 arch/h8300/Kconfig      |  1 +
 arch/hexagon/Kconfig    |  1 +
 arch/ia64/Kconfig       |  1 +
 arch/m68k/Kconfig       |  1 +
 arch/microblaze/Kconfig |  1 +
 arch/mips/Kconfig       |  1 +
 arch/nds32/Kconfig      |  1 +
 arch/nios2/Kconfig      |  1 +
 arch/openrisc/Kconfig   |  1 +
 arch/parisc/Kconfig     |  1 +
 arch/powerpc/Kconfig    |  1 +
 arch/riscv/Kconfig      |  1 +
 arch/s390/Kconfig       |  1 +
 arch/sh/Kconfig         |  1 +
 arch/sparc/Kconfig      |  1 +
 arch/um/Kconfig         |  1 +
 arch/x86/Kconfig        |  1 +
 arch/xtensa/Kconfig     |  1 +
 include/linux/uaccess.h | 18 ++++++++++++++++++
 26 files changed, 45 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index af14a567b493fc..3fab619a6aa51a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -24,6 +24,9 @@ config KEXEC_ELF
 config HAVE_IMA_KEXEC
 	bool
 
+config SET_FS
+	bool
+
 config HOTPLUG_SMT
 	bool
 
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9c5f06e8eb9bc0..d6e9fc7a7b19e2 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -39,6 +39,7 @@ config ALPHA
 	select OLD_SIGSUSPEND
 	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	select MMU_GATHER_NO_RANGE
+	select SET_FS
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ba00c4e1e1c271..c49f5754a11e40 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -48,6 +48,7 @@ config ARC
 	select PCI_SYSCALL if PCI
 	select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
 	select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32
+	select SET_FS
 
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e00d94b1665876..87e1478a42dc4f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -118,6 +118,7 @@ config ARM
 	select PCI_SYSCALL if PCI
 	select PERF_USE_VMALLOC
 	select RTC_LIB
+	select SET_FS
 	select SYS_SUPPORTS_APM_EMULATION
 	# Above selects are sorted alphabetically; please add new ones
 	# according to that.  Thanks.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d232837cbeee8..fbd9e35bef096f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -192,6 +192,7 @@ config ARM64
 	select PCI_SYSCALL if PCI
 	select POWER_RESET
 	select POWER_SUPPLY
+	select SET_FS
 	select SPARSE_IRQ
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 6444ebfd06a665..48d66bf0465d68 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -22,6 +22,7 @@ config C6X
 	select GENERIC_CLOCKEVENTS
 	select MODULES_USE_ELF_RELA
 	select MMU_GATHER_NO_RANGE if MMU
+	select SET_FS
 
 config MMU
 	def_bool n
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 3d5afb5f568543..2836f6e76fdb2d 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -78,6 +78,7 @@ config CSKY
 	select PCI_DOMAINS_GENERIC if PCI
 	select PCI_SYSCALL if PCI
 	select PCI_MSI if PCI
+	select SET_FS
 
 config LOCKDEP_SUPPORT
 	def_bool y
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index d11666d538fea8..7945de067e9fcc 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -25,6 +25,7 @@ config H8300
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_HASH
 	select CPU_NO_EFFICIENT_FFS
+	select SET_FS
 	select UACCESS_MEMCPY
 
 config CPU_BIG_ENDIAN
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 667cfc511cf999..f2afabbadd430e 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,6 +31,7 @@ config HEXAGON
 	select GENERIC_CLOCKEVENTS_BROADCAST
 	select MODULES_USE_ELF_RELA
 	select GENERIC_CPU_DEVICES
+	select SET_FS
 	help
 	  Qualcomm Hexagon is a processor architecture designed for high
 	  performance and low power across a wide variety of applications.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 5b4ec80bf5863a..22a6853840e235 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -56,6 +56,7 @@ config IA64
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
 	select NUMA if !FLATMEM
+	select SET_FS
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 6f2f38d05772ab..dcf4ae8c9b215f 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -32,6 +32,7 @@ config M68K
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
 	select MMU_GATHER_NO_RANGE if MMU
+	select SET_FS
 
 config CPU_BIG_ENDIAN
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index d262ac0c8714bd..7e3d4583abf3e6 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -46,6 +46,7 @@ config MICROBLAZE
 	select CPU_NO_EFFICIENT_FFS
 	select MMU_GATHER_NO_RANGE if MMU
 	select SPARSE_IRQ
+	select SET_FS
 
 # Endianness selection
 choice
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index c95fa3a2484cf0..fbc26391b588f8 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -87,6 +87,7 @@ config MIPS
 	select MODULES_USE_ELF_RELA if MODULES && 64BIT
 	select PERF_USE_VMALLOC
 	select RTC_LIB
+	select SET_FS
 	select SYSCTL_EXCEPTION_TRACE
 	select VIRT_TO_BUS
 
diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig
index e30298e99e1bdf..e8e541fd2267d0 100644
--- a/arch/nds32/Kconfig
+++ b/arch/nds32/Kconfig
@@ -48,6 +48,7 @@ config NDS32
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
+	select SET_FS
 	help
 	  Andes(nds32) Linux support.
 
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index c6645141bb2a88..c7c6ba6bec9dfc 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -27,6 +27,7 @@ config NIOS2
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
 	select CPU_NO_EFFICIENT_FFS
 	select MMU_GATHER_NO_RANGE if MMU
+	select SET_FS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 7e94fe37cb2fdf..6233c62931803f 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -39,6 +39,7 @@ config OPENRISC
 	select ARCH_WANT_FRAME_POINTERS
 	select GENERIC_IRQ_MULTI_HANDLER
 	select MMU_GATHER_NO_RANGE if MMU
+	select SET_FS
 
 config CPU_BIG_ENDIAN
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 3b0f53dd70bc9b..be70af482b5a9a 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -63,6 +63,7 @@ config PARISC
 	select HAVE_FTRACE_MCOUNT_RECORD if HAVE_DYNAMIC_FTRACE
 	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
+	select SET_FS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f48bbfb3ce99d..3f09d6fdf89405 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -249,6 +249,7 @@ config PPC
 	select PCI_SYSCALL			if PCI
 	select PPC_DAWR				if PPC64
 	select RTC_LIB
+	select SET_FS
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index df18372861d8d2..ea0c1ad456d838 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -82,6 +82,7 @@ config RISCV
 	select PCI_MSI if PCI
 	select RISCV_INTC
 	select RISCV_TIMER if RISCV_SBI
+	select SET_FS
 	select SPARSEMEM_STATIC if 32BIT
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 3d86e12e8e3c21..fd81385a7787cb 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -185,6 +185,7 @@ config S390
 	select OLD_SIGSUSPEND3
 	select PCI_DOMAINS		if PCI
 	select PCI_MSI			if PCI
+	select SET_FS
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d20927128fce05..2bd1653f3b3fea 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -71,6 +71,7 @@ config SUPERH
 	select PERF_EVENTS
 	select PERF_USE_VMALLOC
 	select RTC_LIB
+	select SET_FS
 	select SPARSE_IRQ
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index efeff2c896a544..3e0cf0319a278a 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -49,6 +49,7 @@ config SPARC
 	select LOCKDEP_SMALL if LOCKDEP
 	select NEED_DMA_MAP_STATE
 	select NEED_SG_DMA_LENGTH
+	select SET_FS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index eb51fec759484a..3aefcd81566809 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -19,6 +19,7 @@ config UML
 	select GENERIC_CPU_DEVICES
 	select GENERIC_CLOCKEVENTS
 	select HAVE_GCC_PLUGINS
+	select SET_FS
 	select TTY # Needed for line.c
 
 config MMU
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7101ac64bb209d..f85c13355732fe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -237,6 +237,7 @@ config X86
 	select HAVE_ARCH_KCSAN			if X86_64
 	select X86_FEATURE_NAMES		if PROC_FS
 	select PROC_PID_ARCH_STATUS		if PROC_FS
+	select SET_FS
 	imply IMA_SECURE_AND_OR_TRUSTED_BOOT    if EFI
 
 config INSTRUCTION_DECODER
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index e997e0119c0251..94bad4d66b4bde 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -41,6 +41,7 @@ config XTENSA
 	select IRQ_DOMAIN
 	select MODULES_USE_ELF_RELA
 	select PERF_USE_VMALLOC
+	select SET_FS
 	select VIRT_TO_BUS
 	help
 	  Xtensa processors are 32-bit RISC machines designed by Tensilica
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 94b28541165929..70073c802b48ed 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -8,6 +8,7 @@
 
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_SET_FS
 /*
  * Force the uaccess routines to be wired up for actual userspace access,
  * overriding any possible set_fs(KERNEL_DS) still lingering around.  Undone
@@ -25,6 +26,23 @@ static inline void force_uaccess_end(mm_segment_t oldfs)
 {
 	set_fs(oldfs);
 }
+#else /* CONFIG_SET_FS */
+typedef struct {
+	/* empty dummy */
+} mm_segment_t;
+
+#define uaccess_kernel()		(false)
+#define user_addr_max()			(TASK_SIZE_MAX)
+
+static inline mm_segment_t force_uaccess_begin(void)
+{
+	return (mm_segment_t) { };
+}
+
+static inline void force_uaccess_end(mm_segment_t oldfs)
+{
+}
+#endif /* CONFIG_SET_FS */
 
 /*
  * Architectures should provide two primitives (raw_copy_{to,from}_user())
-- 
2.28.0


^ permalink raw reply related

* [PATCH 05/10] lkdtm: disable set_fs-based tests for !CONFIG_SET_FS
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

Once we can't manipulate the address limit, we also can't test what
happens when the manipulation is abused.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/misc/lkdtm/bugs.c     | 4 ++++
 drivers/misc/lkdtm/usercopy.c | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/drivers/misc/lkdtm/bugs.c b/drivers/misc/lkdtm/bugs.c
index 4dfbfd51bdf774..0d5b93694a0183 100644
--- a/drivers/misc/lkdtm/bugs.c
+++ b/drivers/misc/lkdtm/bugs.c
@@ -315,11 +315,15 @@ void lkdtm_CORRUPT_LIST_DEL(void)
 /* Test if unbalanced set_fs(KERNEL_DS)/set_fs(USER_DS) check exists. */
 void lkdtm_CORRUPT_USER_DS(void)
 {
+#ifdef CONFIG_SET_FS
 	pr_info("setting bad task size limit\n");
 	set_fs(KERNEL_DS);
 
 	/* Make sure we do not keep running with a KERNEL_DS! */
 	force_sig(SIGKILL);
+#else
+	pr_err("XFAIL: this requires set_fs()\n");
+#endif
 }
 
 /* Test that VMAP_STACK is actually allocating with a leading guard page */
diff --git a/drivers/misc/lkdtm/usercopy.c b/drivers/misc/lkdtm/usercopy.c
index b833367a45d053..04d10063835241 100644
--- a/drivers/misc/lkdtm/usercopy.c
+++ b/drivers/misc/lkdtm/usercopy.c
@@ -327,6 +327,7 @@ void lkdtm_USERCOPY_KERNEL(void)
 
 void lkdtm_USERCOPY_KERNEL_DS(void)
 {
+#ifdef CONFIG_SET_FS
 	char __user *user_ptr =
 		(char __user *)(0xFUL << (sizeof(unsigned long) * 8 - 4));
 	mm_segment_t old_fs = get_fs();
@@ -338,6 +339,9 @@ void lkdtm_USERCOPY_KERNEL_DS(void)
 	if (copy_to_user(user_ptr, buf, sizeof(buf)) == 0)
 		pr_err("copy_to_user() to noncanonical address succeeded!?\n");
 	set_fs(old_fs);
+#else
+	pr_err("XFAIL: this requires set_fs()\n");
+#endif
 }
 
 void __init lkdtm_usercopy_init(void)
-- 
2.28.0


^ permalink raw reply related

* [PATCH 02/10] fs: don't allow splice read/write without explicit ops
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

default_file_splice_write is the last piece of generic code that uses
set_fs to make the uaccess routines operate on kernel pointers.  It
implements a "fallback loop" for splicing from files that do not actually
provide a proper splice_read method.  The usual file systems and other
high bandwith instances all provide a ->splice_read, so this just removes
support for various device drivers and procfs/debugfs files.  If splice
support for any of those turns out to be important it can be added back
by switching them to the iter ops and using generic_file_splice_read.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 fs/read_write.c    |   2 +-
 fs/splice.c        | 130 +++++----------------------------------------
 include/linux/fs.h |   2 -
 3 files changed, 15 insertions(+), 119 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 702c4301d9eb6b..8c61f67453e3d3 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1077,7 +1077,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
 }
 EXPORT_SYMBOL(vfs_iter_write);
 
-ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
+static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 		  unsigned long vlen, loff_t *pos, rwf_t flags)
 {
 	struct iovec iovstack[UIO_FASTIOV];
diff --git a/fs/splice.c b/fs/splice.c
index d7c8a7c4db07ff..412df7b48f9eb7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -342,89 +342,6 @@ const struct pipe_buf_operations nosteal_pipe_buf_ops = {
 };
 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
 
-static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
-			    unsigned long vlen, loff_t offset)
-{
-	mm_segment_t old_fs;
-	loff_t pos = offset;
-	ssize_t res;
-
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	/* The cast to a user pointer is valid due to the set_fs() */
-	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
-	set_fs(old_fs);
-
-	return res;
-}
-
-static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
-				 struct pipe_inode_info *pipe, size_t len,
-				 unsigned int flags)
-{
-	struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
-	struct iov_iter to;
-	struct page **pages;
-	unsigned int nr_pages;
-	unsigned int mask;
-	size_t offset, base, copied = 0;
-	ssize_t res;
-	int i;
-
-	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
-		return -EAGAIN;
-
-	/*
-	 * Try to keep page boundaries matching to source pagecache ones -
-	 * it probably won't be much help, but...
-	 */
-	offset = *ppos & ~PAGE_MASK;
-
-	iov_iter_pipe(&to, READ, pipe, len + offset);
-
-	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
-	if (res <= 0)
-		return -ENOMEM;
-
-	nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
-
-	vec = __vec;
-	if (nr_pages > PIPE_DEF_BUFFERS) {
-		vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL);
-		if (unlikely(!vec)) {
-			res = -ENOMEM;
-			goto out;
-		}
-	}
-
-	mask = pipe->ring_size - 1;
-	pipe->bufs[to.head & mask].offset = offset;
-	pipe->bufs[to.head & mask].len -= offset;
-
-	for (i = 0; i < nr_pages; i++) {
-		size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
-		vec[i].iov_base = page_address(pages[i]) + offset;
-		vec[i].iov_len = this_len;
-		len -= this_len;
-		offset = 0;
-	}
-
-	res = kernel_readv(in, vec, nr_pages, *ppos);
-	if (res > 0) {
-		copied = res;
-		*ppos += res;
-	}
-
-	if (vec != __vec)
-		kfree(vec);
-out:
-	for (i = 0; i < nr_pages; i++)
-		put_page(pages[i]);
-	kvfree(pages);
-	iov_iter_advance(&to, copied);	/* truncates and discards */
-	return res;
-}
-
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
@@ -788,33 +705,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(iter_file_splice_write);
 
-static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
-			  struct splice_desc *sd)
-{
-	int ret;
-	void *data;
-	loff_t tmp = sd->pos;
-
-	data = kmap(buf->page);
-	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
-	kunmap(buf->page);
-
-	return ret;
-}
-
-static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
-					 struct file *out, loff_t *ppos,
-					 size_t len, unsigned int flags)
-{
-	ssize_t ret;
-
-	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
-	if (ret > 0)
-		*ppos += ret;
-
-	return ret;
-}
-
 /**
  * generic_splice_sendpage - splice data from a pipe to a socket
  * @pipe:	pipe to splice from
@@ -836,15 +726,23 @@ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_splice_sendpage);
 
+static int warn_unsupported(struct file *file, const char *op)
+{
+	pr_debug_ratelimited(
+		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
+		op, file, current->pid, current->comm);
+	return -EINVAL;
+}
+
 /*
  * Attempt to initiate a splice from pipe to file.
  */
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
-	if (out->f_op->splice_write)
-		return out->f_op->splice_write(pipe, out, ppos, len, flags);
-	return default_file_splice_write(pipe, out, ppos, len, flags);
+	if (unlikely(!out->f_op->splice_write))
+		return warn_unsupported(out, "write");
+	return out->f_op->splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
@@ -866,9 +764,9 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(len > MAX_RW_COUNT))
 		len = MAX_RW_COUNT;
 
-	if (in->f_op->splice_read)
-		return in->f_op->splice_read(in, ppos, pipe, len, flags);
-	return default_file_splice_read(in, ppos, pipe, len, flags);
+	if (unlikely(!in->f_op->splice_read))
+		return warn_unsupported(in, "read");
+	return in->f_op->splice_read(in, ppos, pipe, len, flags);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e019ea2f1347e6..d33cc3e8ed410b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1894,8 +1894,6 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
-extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
-		unsigned long, loff_t *, rwf_t);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
 extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
-- 
2.28.0


^ permalink raw reply related

* [PATCH 04/10] test_bitmap: skip user bitmap tests for !CONFIG_SET_FS
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

We can't run the tests for userspace bitmap parsing if set_fs() doesn't
exist.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 lib/test_bitmap.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index df903c53952bb9..49b1d25fbaf546 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -365,6 +365,7 @@ static void __init __test_bitmap_parselist(int is_user)
 	for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
 #define ptest parselist_tests[i]
 
+#ifdef CONFIG_SET_FS
 		if (is_user) {
 			mm_segment_t orig_fs = get_fs();
 			size_t len = strlen(ptest.in);
@@ -375,7 +376,9 @@ static void __init __test_bitmap_parselist(int is_user)
 						    bmap, ptest.nbits);
 			time = ktime_get() - time;
 			set_fs(orig_fs);
-		} else {
+		} else
+#endif /* CONFIG_SET_FS */
+		{
 			time = ktime_get();
 			err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
 			time = ktime_get() - time;
@@ -454,6 +457,7 @@ static void __init __test_bitmap_parse(int is_user)
 	for (i = 0; i < ARRAY_SIZE(parse_tests); i++) {
 		struct test_bitmap_parselist test = parse_tests[i];
 
+#ifdef CONFIG_SET_FS
 		if (is_user) {
 			size_t len = strlen(test.in);
 			mm_segment_t orig_fs = get_fs();
@@ -464,7 +468,9 @@ static void __init __test_bitmap_parse(int is_user)
 						bmap, test.nbits);
 			time = ktime_get() - time;
 			set_fs(orig_fs);
-		} else {
+		} else
+#endif /* CONFIG_SET_FS */
+		{
 			size_t len = test.flags & NO_LEN ?
 				UINT_MAX : strlen(test.in);
 			time = ktime_get();
-- 
2.28.0


^ permalink raw reply related

* [PATCH 06/10] x86: move PAGE_OFFSET, TASK_SIZE & friends to page_{32, 64}_types.h
From: Christoph Hellwig @ 2020-08-27 15:00 UTC (permalink / raw)
  To: Linus Torvalds, Al Viro, Michael Ellerman, x86
  Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-1-hch@lst.de>

At least for 64-bit this moves them closer to some of the defines
they are based on, and it prepares for using the TASK_SIZE_MAX
definition from assembly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 arch/x86/include/asm/page_32_types.h | 11 +++++++
 arch/x86/include/asm/page_64_types.h | 38 +++++++++++++++++++++
 arch/x86/include/asm/processor.h     | 49 ----------------------------
 3 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 565ad755c785e2..26236925fb2c36 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -41,6 +41,17 @@
 #define __VIRTUAL_MASK_SHIFT	32
 #endif	/* CONFIG_X86_PAE */
 
+/*
+ * User space process size: 3GB (default).
+ */
+#define IA32_PAGE_OFFSET	PAGE_OFFSET
+#define TASK_SIZE		PAGE_OFFSET
+#define TASK_SIZE_LOW		TASK_SIZE
+#define TASK_SIZE_MAX		TASK_SIZE
+#define DEFAULT_MAP_WINDOW	TASK_SIZE
+#define STACK_TOP		TASK_SIZE
+#define STACK_TOP_MAX		STACK_TOP
+
 /*
  * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
  */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 288b065955b729..996595c9897e0a 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -58,6 +58,44 @@
 #define __VIRTUAL_MASK_SHIFT	47
 #endif
 
+/*
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
+ */
+#define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
+
+/* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+#define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
+					0xc0000000 : 0xFFFFe000)
+
+#define TASK_SIZE_LOW		(test_thread_flag(TIF_ADDR32) ? \
+					IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
+#define TASK_SIZE		(test_thread_flag(TIF_ADDR32) ? \
+					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+#define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
+					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
+
+#define STACK_TOP		TASK_SIZE_LOW
+#define STACK_TOP_MAX		TASK_SIZE_MAX
+
 /*
  * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
  * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 97143d87994c24..1618eeb08361a9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -782,17 +782,6 @@ static inline void spin_lock_prefetch(const void *x)
 })
 
 #ifdef CONFIG_X86_32
-/*
- * User space process size: 3GB (default).
- */
-#define IA32_PAGE_OFFSET	PAGE_OFFSET
-#define TASK_SIZE		PAGE_OFFSET
-#define TASK_SIZE_LOW		TASK_SIZE
-#define TASK_SIZE_MAX		TASK_SIZE
-#define DEFAULT_MAP_WINDOW	TASK_SIZE
-#define STACK_TOP		TASK_SIZE
-#define STACK_TOP_MAX		STACK_TOP
-
 #define INIT_THREAD  {							  \
 	.sp0			= TOP_OF_INIT_STACK,			  \
 	.sysenter_cs		= __KERNEL_CS,				  \
@@ -802,44 +791,6 @@ static inline void spin_lock_prefetch(const void *x)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
-/*
- * User space process size.  This is the first address outside the user range.
- * There are a few constraints that determine this:
- *
- * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- * address, then that syscall will enter the kernel with a
- * non-canonical return address, and SYSRET will explode dangerously.
- * We avoid this particular problem by preventing anything executable
- * from being mapped at the maximum canonical address.
- *
- * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- * CPUs malfunction if they execute code from the highest canonical page.
- * They'll speculate right off the end of the canonical space, and
- * bad things happen.  This is worked around in the same way as the
- * Intel problem.
- *
- * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
-#define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
-
-#define DEFAULT_MAP_WINDOW	((1UL << 47) - PAGE_SIZE)
-
-/* This decides where the kernel will search for a free chunk of vm
- * space during mmap's.
- */
-#define IA32_PAGE_OFFSET	((current->personality & ADDR_LIMIT_3GB) ? \
-					0xc0000000 : 0xFFFFe000)
-
-#define TASK_SIZE_LOW		(test_thread_flag(TIF_ADDR32) ? \
-					IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW)
-#define TASK_SIZE		(test_thread_flag(TIF_ADDR32) ? \
-					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-#define TASK_SIZE_OF(child)	((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
-					IA32_PAGE_OFFSET : TASK_SIZE_MAX)
-
-#define STACK_TOP		TASK_SIZE_LOW
-#define STACK_TOP_MAX		TASK_SIZE_MAX
-
 #define INIT_THREAD  {						\
 	.addr_limit		= KERNEL_DS,			\
 }
-- 
2.28.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox