LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v4 08/13] mm/debug_vm_pgtable/locks: Move non page table modifying test together
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

This will help in adding proper locks in a later patch

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 51 ++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index de333871f407..f59cf6a9b05e 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -986,7 +986,7 @@ static int __init debug_vm_pgtable(void)
 	p4dp = p4d_alloc(mm, pgdp, vaddr);
 	pudp = pud_alloc(mm, p4dp, vaddr);
 	pmdp = pmd_alloc(mm, pudp, vaddr);
-	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+	ptep = pte_alloc_map(mm, pmdp, vaddr);
 
 	/*
 	 * Save all the page table page addresses as the page table
@@ -1006,33 +1006,12 @@ static int __init debug_vm_pgtable(void)
 	p4d_basic_tests(p4d_aligned, prot);
 	pgd_basic_tests(pgd_aligned, prot);
 
-	pte_clear_tests(mm, ptep, vaddr);
-	pmd_clear_tests(mm, pmdp);
-	pud_clear_tests(mm, pudp);
-	p4d_clear_tests(mm, p4dp);
-	pgd_clear_tests(mm, pgdp);
-
-	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
-	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
-	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
 	pmd_leaf_tests(pmd_aligned, prot);
 	pud_leaf_tests(pud_aligned, prot);
 
-	pmd_huge_tests(pmdp, pmd_aligned, prot);
-	pud_huge_tests(pudp, pud_aligned, prot);
-
 	pte_savedwrite_tests(pte_aligned, protnone);
 	pmd_savedwrite_tests(pmd_aligned, protnone);
 
-	pte_unmap_unlock(ptep, ptl);
-
-	pmd_populate_tests(mm, pmdp, saved_ptep);
-	pud_populate_tests(mm, pudp, saved_pmdp);
-	p4d_populate_tests(mm, p4dp, saved_pudp);
-	pgd_populate_tests(mm, pgdp, saved_p4dp);
-
 	pte_special_tests(pte_aligned, prot);
 	pte_protnone_tests(pte_aligned, protnone);
 	pmd_protnone_tests(pmd_aligned, protnone);
@@ -1050,11 +1029,37 @@ static int __init debug_vm_pgtable(void)
 	pmd_swap_tests(pmd_aligned, prot);
 
 	swap_migration_tests();
-	hugetlb_basic_tests(pte_aligned, prot);
 
 	pmd_thp_tests(pmd_aligned, prot);
 	pud_thp_tests(pud_aligned, prot);
 
+	hugetlb_basic_tests(pte_aligned, prot);
+
+	pte_clear_tests(mm, ptep, vaddr);
+	pmd_clear_tests(mm, pmdp);
+	pud_clear_tests(mm, pudp);
+	p4d_clear_tests(mm, p4dp);
+	pgd_clear_tests(mm, pgdp);
+
+	ptl = pte_lockptr(mm, pmdp);
+	spin_lock(ptl);
+
+	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+
+	pmd_huge_tests(pmdp, pmd_aligned, prot);
+	pud_huge_tests(pudp, pud_aligned, prot);
+
+	pte_unmap_unlock(ptep, ptl);
+
+	pmd_populate_tests(mm, pmdp, saved_ptep);
+	pud_populate_tests(mm, pudp, saved_pmdp);
+	p4d_populate_tests(mm, p4dp, saved_pudp);
+	pgd_populate_tests(mm, pgdp, saved_p4dp);
+
 	p4d_free(mm, saved_p4dp);
 	pud_free(mm, saved_pudp);
 	pmd_free(mm, saved_pmdp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 07/13] mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an existing pte entry
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

set_pte_at() should not be used to set a pte entry at locations that
already hold a valid pte entry. Architectures like ppc64 don't do TLB
invalidate in set_pte_at() and hence expect it to be used to set locations
that are not a valid PTE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 9cafed39c236..de333871f407 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -79,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	/*
+	 * Architectures optimize set_pte_at by avoiding TLB flush.
+	 * This requires set_pte_at to be not used to update an
+	 * existing pte entry. Clear pte before we do set_pte_at
+	 */
+
 	pr_debug("Validating PTE advanced\n");
 	pte = pfn_pte(pfn, prot);
 	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_set_wrprotect(mm, vaddr, ptep);
 	pte = ptep_get(ptep);
 	WARN_ON(pte_write(pte));
-
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_get_and_clear(mm, vaddr, ptep);
 	pte = ptep_get(ptep);
 	WARN_ON(!pte_none(pte));
@@ -101,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 	ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
 	pte = ptep_get(ptep);
 	WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
-	pte = pfn_pte(pfn, prot);
-	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_get_and_clear_full(mm, vaddr, ptep, 1);
 	pte = ptep_get(ptep);
 	WARN_ON(!pte_none(pte));
 
+	pte = pfn_pte(pfn, prot);
 	pte = pte_mkyoung(pte);
 	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -169,9 +170,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmdp_set_wrprotect(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_write(pmd));
-
-	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
@@ -185,13 +183,11 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
-	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
-	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
 
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	pmd = pmd_mkyoung(pmd);
 	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_test_and_clear_young(vma, vaddr, pmdp);
@@ -292,17 +288,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-	pud = pud_mkhuge(pfn_pud(pfn, prot));
-	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_huge_get_and_clear(mm, vaddr, pudp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!pud_none(pud));
-
-	pud = pud_mkhuge(pfn_pud(pfn, prot));
-	set_pud_at(mm, vaddr, pudp, pud);
-	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
-	pud = READ_ONCE(*pudp);
-	WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 	pud = pud_mkhuge(pfn_pud(pfn, prot));
@@ -315,6 +303,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
+#ifndef __PAGETABLE_PMD_FOLDED
+	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	pud = pud_mkyoung(pud);
 	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_test_and_clear_young(vma, vaddr, pudp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 06/13] mm/debug_vm_pgtable/THP: Mark the pte entry huge before using set_pmd/pud_at
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

The kernel expects entries to be marked huge before we use
set_pmd_at()/set_pud_at().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 8704901f6bd8..9cafed39c236 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -155,7 +155,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 				      unsigned long pfn, unsigned long vaddr,
 				      pgprot_t prot)
 {
-	pmd_t pmd = pfn_pmd(pfn, prot);
+	pmd_t pmd;
 
 	if (!has_transparent_hugepage())
 		return;
@@ -164,19 +164,19 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	/* Align the address wrt HPAGE_PMD_SIZE */
 	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
 
-	pmd = pfn_pmd(pfn, prot);
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_set_wrprotect(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(pmd_write(pmd));
 
-	pmd = pfn_pmd(pfn, prot);
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	set_pmd_at(mm, vaddr, pmdp, pmd);
 	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
 
-	pmd = pfn_pmd(pfn, prot);
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 	pmd = pmd_wrprotect(pmd);
 	pmd = pmd_mkclean(pmd);
 	set_pmd_at(mm, vaddr, pmdp, pmd);
@@ -236,7 +236,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 
 static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
-	pmd_t pmd = pfn_pmd(pfn, prot);
+	pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
 
 	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
 		return;
@@ -276,7 +276,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 				      unsigned long pfn, unsigned long vaddr,
 				      pgprot_t prot)
 {
-	pud_t pud = pfn_pud(pfn, prot);
+	pud_t pud;
 
 	if (!has_transparent_hugepage())
 		return;
@@ -285,25 +285,27 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	/* Align the address wrt HPAGE_PUD_SIZE */
 	vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
 
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_set_wrprotect(mm, vaddr, pudp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(pud_write(pud));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-	pud = pfn_pud(pfn, prot);
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_huge_get_and_clear(mm, vaddr, pudp);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!pud_none(pud));
 
-	pud = pfn_pud(pfn, prot);
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	set_pud_at(mm, vaddr, pudp, pud);
 	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
-	pud = pfn_pud(pfn, prot);
+
+	pud = pud_mkhuge(pfn_pud(pfn, prot));
 	pud = pud_wrprotect(pud);
 	pud = pud_mkclean(pud);
 	set_pud_at(mm, vaddr, pudp, pud);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 05/13] mm/debug_vm_pgtable/savedwrite: Enable savedwrite test with CONFIG_NUMA_BALANCING
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

Saved write support was added to track the write bit of a pte after
marking the pte protnone. This was done so that AUTONUMA can convert
a write pte to protnone and still track the old write bit. When converting
it back we set the pte write bit correctly thereby avoiding a write fault
again. Hence enable the test only when CONFIG_NUMA_BALANCING is enabled and
use protnone protflags.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 4c73e63b4ceb..8704901f6bd8 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -119,10 +119,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+		return;
+
 	pr_debug("Validating PTE saved write\n");
 	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
 	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
 }
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -234,6 +238,9 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd = pfn_pmd(pfn, prot);
 
+	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+		return;
+
 	pr_debug("Validating PMD saved write\n");
 	WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
 	WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
@@ -1019,8 +1026,8 @@ static int __init debug_vm_pgtable(void)
 	pmd_huge_tests(pmdp, pmd_aligned, prot);
 	pud_huge_tests(pudp, pud_aligned, prot);
 
-	pte_savedwrite_tests(pte_aligned, prot);
-	pmd_savedwrite_tests(pmd_aligned, prot);
+	pte_savedwrite_tests(pte_aligned, protnone);
+	pmd_savedwrite_tests(pmd_aligned, protnone);
 
 	pte_unmap_unlock(ptep, ptl);
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 04/13] mm/debug_vm_pgtables/hugevmap: Use the arch helper to identify huge vmap support.
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

ppc64 supports huge vmap only with radix translation. Hence use arch helper
to determine the huge vmap support.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 00649b47f6e0..4c73e63b4ceb 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
 #include <linux/swapops.h>
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
+#include <linux/io.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
@@ -206,11 +207,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pmd_leaf(pmd));
 }
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd;
 
-	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+	if (!arch_ioremap_pmd_supported())
 		return;
 
 	pr_debug("Validating PMD huge\n");
@@ -224,6 +226,9 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 	pmd = READ_ONCE(*pmdp);
 	WARN_ON(!pmd_none(pmd));
 }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
 static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -320,11 +325,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pud_leaf(pud));
 }
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
 {
 	pud_t pud;
 
-	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+	if (!arch_ioremap_pud_supported())
 		return;
 
 	pr_debug("Validating PUD huge\n");
@@ -338,6 +344,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
 	pud = READ_ONCE(*pudp);
 	WARN_ON(!pud_none(pud));
 }
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pud_advanced_tests(struct mm_struct *mm,
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 03/13] mm/debug_vm_pgtable/ppc64: Avoid setting top bits in radom value
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

ppc64 uses bit 62 to indicate a pte entry (_PAGE_PTE). Avoid setting
that bit in the random value.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/debug_vm_pgtable.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..00649b47f6e0 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -44,10 +44,17 @@
  * entry type. But these bits might affect the ability to clear entries with
  * pxx_clear() because of how dynamic page table folding works on s390. So
  * while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
  */
-#define S390_MASK_BITS	4
-#define RANDOM_ORVALUE	GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK		GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK		GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK		0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
 #define RANDOM_NZVALUE	GENMASK(7, 0)
 
 static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 01/13] powerpc/mm: Add DEBUG_VM WARN for pmd_clear
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

With the hash page table, the kernel should not use pmd_clear for clearing
huge pte entries. Add a DEBUG_VM WARN to catch the wrong usage.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 6de56c3b33c4..079211968987 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -868,6 +868,13 @@ static inline bool pte_ci(pte_t pte)
 
 static inline void pmd_clear(pmd_t *pmdp)
 {
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+		/*
+		 * Don't use this if we can possibly have a hash page table
+		 * entry mapping this.
+		 */
+		WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+	}
 	*pmdp = __pmd(0);
 }
 
@@ -916,6 +923,13 @@ static inline int pmd_bad(pmd_t pmd)
 
 static inline void pud_clear(pud_t *pudp)
 {
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+		/*
+		 * Don't use this if we can possibly have a hash page table
+		 * entry mapping this.
+		 */
+		WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+	}
 	*pudp = __pud(0);
 }
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 02/13] powerpc/mm: Move setting pte specific flags to pfn_pte
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>

powerpc used to set the pte specific flags in set_pte_at(). This is
different from other architectures. To be consistent with other
architectures, update pfn_pte to set _PAGE_PTE on ppc64. Also, drop the
now unused pte_mkpte.

We add a VM_WARN_ON() to catch the usage of calling set_pte_at()
without setting _PAGE_PTE bit. We will remove that after a few releases.

With respect to huge pmd entries, pmd_mkhuge() takes care of adding the
_PAGE_PTE bit.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 15 +++++++++------
 arch/powerpc/include/asm/nohash/pgtable.h    |  5 -----
 arch/powerpc/mm/pgtable.c                    |  5 -----
 3 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 079211968987..2382fd516f6b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -619,7 +619,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 	VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
 	VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
 
-	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
+	return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE);
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
@@ -655,11 +655,6 @@ static inline pte_t pte_mkexec(pte_t pte)
 	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
 }
 
-static inline pte_t pte_mkpte(pte_t pte)
-{
-	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
-}
-
 static inline pte_t pte_mkwrite(pte_t pte)
 {
 	/*
@@ -823,6 +818,14 @@ static inline int pte_none(pte_t pte)
 static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 				pte_t *ptep, pte_t pte, int percpu)
 {
+
+	VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE)));
+	/*
+	 * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE
+	 * in all the callers.
+	 */
+	 pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
+
 	if (radix_enabled())
 		return radix__set_pte_at(mm, addr, ptep, pte, percpu);
 	return hash__set_pte_at(mm, addr, ptep, pte, percpu);
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 4b7c3472eab1..6277e7596ae5 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte)
 	return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
 }
 
-static inline pte_t pte_mkpte(pte_t pte)
-{
-	return pte;
-}
-
 static inline pte_t pte_mkspecial(pte_t pte)
 {
 	return __pte(pte_val(pte) | _PAGE_SPECIAL);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9c0547d77af3..ab57b07ef39a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
 	 */
 	VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
-	/* Add the pte bit when trying to set a pte */
-	pte = pte_mkpte(pte);
-
 	/* Note: mm->context.id might not yet have been assigned as
 	 * this context might not have been activated yet when this
 	 * is called.
@@ -275,8 +272,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_
 	 */
 	VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
-	pte = pte_mkpte(pte);
-
 	pte = set_pte_filter(pte);
 
 	val = pte_val(pte);
-- 
2.26.2


^ permalink raw reply related

* [PATCH v4 00/13] mm/debug_vm_pgtable fixes
From: Aneesh Kumar K.V @ 2020-09-02 11:42 UTC (permalink / raw)
  To: linux-mm, akpm; +Cc: linuxppc-dev, Aneesh Kumar K.V, Anshuman Khandual

This patch series includes fixes for the debug_vm_pgtable test code so that
the tests follow the page table update rules correctly. The first two patches
introduce changes w.r.t. ppc64. The patches are included in this series for
completeness. We can merge them via the ppc64 tree if required.

The hugetlb test is disabled on ppc64 because it needs a larger change to
satisfy the page table update rules.

These tests are broken w.r.t. the page table update rules and result in a
kernel crash as shown below.

[   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
cpu 0x0: Vector: 700 (Program Check) at [c000000c6d1e76c0]
    pc: c00000000009a5ec: assert_pte_locked+0x14c/0x380
    lr: c0000000005eeeec: pte_update+0x11c/0x190
    sp: c000000c6d1e7950
   msr: 8000000002029033
  current = 0xc000000c6d172c80
  paca    = 0xc000000003ba0000   irqmask: 0x03   irq_happened: 0x01
    pid   = 1, comm = swapper/0
kernel BUG at arch/powerpc/mm/pgtable.c:304!
[link register   ] c0000000005eeeec pte_update+0x11c/0x190
[c000000c6d1e7950] 0000000000000001 (unreliable)
[c000000c6d1e79b0] c0000000005eee14 pte_update+0x44/0x190
[c000000c6d1e7a10] c000000001a2ca9c pte_advanced_tests+0x160/0x3d8
[c000000c6d1e7ab0] c000000001a2d4fc debug_vm_pgtable+0x7e8/0x1338
[c000000c6d1e7ba0] c0000000000116ec do_one_initcall+0xac/0x5f0
[c000000c6d1e7c80] c0000000019e4fac kernel_init_freeable+0x4dc/0x5a4
[c000000c6d1e7db0] c000000000012474 kernel_init+0x24/0x160
[c000000c6d1e7e20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c

With DEBUG_VM disabled

[   20.530152] BUG: Kernel NULL pointer dereference on read at 0x00000000
[   20.530183] Faulting instruction address: 0xc0000000000df330
cpu 0x33: Vector: 380 (Data SLB Access) at [c000000c6d19f700]
    pc: c0000000000df330: memset+0x68/0x104
    lr: c00000000009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
    sp: c000000c6d19f990
   msr: 8000000002009033
   dar: 0
  current = 0xc000000c6d177480
  paca    = 0xc00000001ec4f400   irqmask: 0x03   irq_happened: 0x01
    pid   = 1, comm = swapper/0
[link register   ] c00000000009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
[c000000c6d19f990] c00000000009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
[c000000c6d19fa10] c0000000019ebf30 pmd_advanced_tests+0x1f0/0x378
[c000000c6d19fab0] c0000000019ed088 debug_vm_pgtable+0x79c/0x1244
[c000000c6d19fba0] c0000000000116ec do_one_initcall+0xac/0x5f0
[c000000c6d19fc80] c0000000019a4fac kernel_init_freeable+0x4dc/0x5a4
[c000000c6d19fdb0] c000000000012474 kernel_init+0x24/0x160
[c000000c6d19fe20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c

Changes from v3:
* Address review feedback
* Move page table deposit and withdraw patch after adding pmdlock to avoid bisect failure.

Changes from v2:
* Fix build failure with different configs and architecture.

Changes from v1:
* Address review feedback
* drop test specific pfn_pte and pfn_pmd.
* Update ppc64 page table helper to add _PAGE_PTE 


Aneesh Kumar K.V (13):
  powerpc/mm: Add DEBUG_VM WARN for pmd_clear
  powerpc/mm: Move setting pte specific flags to pfn_pte
  mm/debug_vm_pgtable/ppc64: Avoid setting top bits in radom value
  mm/debug_vm_pgtables/hugevmap: Use the arch helper to identify huge
    vmap support.
  mm/debug_vm_pgtable/savedwrite: Enable savedwrite test with
    CONFIG_NUMA_BALANCING
  mm/debug_vm_pgtable/THP: Mark the pte entry huge before using
    set_pmd/pud_at
  mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an
    existing pte entry
  mm/debug_vm_pgtable/locks: Move non page table modifying test together
  mm/debug_vm_pgtable/locks: Take correct page table lock
  mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP
  mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
  mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64
  mm/debug_vm_pgtable: Avoid none pte in pte_clear_test

 arch/powerpc/include/asm/book3s/64/pgtable.h |  29 +++-
 arch/powerpc/include/asm/nohash/pgtable.h    |   5 -
 arch/powerpc/mm/pgtable.c                    |   5 -
 mm/debug_vm_pgtable.c                        | 171 ++++++++++++-------
 4 files changed, 131 insertions(+), 79 deletions(-)

-- 
2.26.2


^ permalink raw reply

* [Bug 209029] kernel 5.9-rc2 fails to boot on a PowerMac G5 11,2 - BUG: Kernel NULL pointer dereference on read at 0x00000020
From: bugzilla-daemon @ 2020-09-02 10:48 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-209029-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=209029

--- Comment #4 from Erhard F. (erhard_f@mailbox.org) ---
(In reply to Christophe Leroy from comment #3)
> Did you try without CONFIG_DEBUG_VM_PGTABLE ?
Without CONFIG_DEBUG_VM_PGTABLE the G5 boots fine. Thanks!

> If you want CONFIG_DEBUG_VM_PGTABLE, the following series aims at fixing it
> for PPC64:
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=197961
Did not check the series as current ozlabs patches indicate that the
CONFIG_DEBUG_VM_PGTABLE option is removed for the time being.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker
From: Christophe Leroy @ 2020-09-02 10:16 UTC (permalink / raw)
  To: Nick Desaulniers, Michael Ellerman, Nicholas Piggin
  Cc: Joe Lawrence, Kees Cook, Fangrui Song, linux-kernel,
	clang-built-linux, Paul Mackerras, linuxppc-dev
In-Reply-To: <20200901222523.1941988-3-ndesaulniers@google.com>



On 9/1/20 10:25 PM, Nick Desaulniers wrote:
> Rather than invoke the compiler as the driver, use the linker. That way
> we can check --orphan-handling=warn support correctly, as cc-ldoption
> was removed in
> commit 055efab3120b ("kbuild: drop support for cc-ldoption").
> 
> Requires dropping the .got section.  I couldn't find how it was used in
> the vdso32.
> 
> Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan sections")
> Link: https://lore.kernel.org/lkml/CAKwvOdnn3wxYdJomvnveyD_njwRku3fABWT_bS92duihhywLJQ@mail.gmail.com/
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
> Not sure removing .got is a good idea or not.  Otherwise I observe the
> following link error:
> powerpc-linux-gnu-ld: warning: orphan section `.got' from `arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
> powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
> powerpc-linux-gnu-ld: final link failed: bad value

Finally I spotted it I think:

	make arch/powerpc/kernel/vdso32/ V=1

powerpc64-linux-ld  -EB -m elf64ppc -shared -soname linux-vdso32.so.1 
--eh-frame-hdr  --orphan-handling=warn -T 
arch/powerpc/kernel/vdso32/vdso32.lds 
arch/powerpc/kernel/vdso32/sigtramp.o 
arch/powerpc/kernel/vdso32/gettimeofday.o 
arch/powerpc/kernel/vdso32/datapage.o 
arch/powerpc/kernel/vdso32/cacheflush.o 
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o 
arch/powerpc/kernel/vdso32/vdso32.so.dbg



If I do the same manually but with -m elf32ppc instead of -m elf64ppc, 
there is no failure.

Adding -m elf32ppc to ldflags-y also works, although I don't much like
having "-m elf64ppc -m elf32ppc" on the line.

Christophe

^ permalink raw reply

* Re: [PATCH 4/4] powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm
From: Nicholas Piggin @ 2020-09-02  9:48 UTC (permalink / raw)
  To: linux-mm, Michael Ellerman
  Cc: Jens Axboe, linux-arch, Peter Zijlstra, Aneesh Kumar K.V,
	linux-kernel, Andrew Morton, linuxppc-dev, David S. Miller
In-Reply-To: <87pn751zcb.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of September 1, 2020 10:00 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> Commit 0cef77c7798a7 ("powerpc/64s/radix: flush remote CPUs out of
>> single-threaded mm_cpumask") added a mechanism to trim the mm_cpumask of
>> a process under certain conditions. One of the assumptions is that
>> mm_users would not be incremented via a reference outside the process
>> context with mmget_not_zero() then go on to kthread_use_mm() via that
>> reference.
>>
>> That invariant was broken by io_uring code (see previous sparc64 fix),
>> but I'll point Fixes: to the original powerpc commit because we are
>> changing that assumption going forward, so this will make backports
>> match up.
>>
>> Fix this by no longer relying on that assumption, but by having each CPU
>> check the mm is not being used, and clearing their own bit from the mask
>> if it's okay. This fix relies on commit 38cf307c1f20 ("mm: fix
>> kthread_use_mm() vs TLB invalidate") to disable irqs over the mm switch,
>> and ARCH_WANT_IRQS_OFF_ACTIVATE_MM to be enabled.
> 
> You could use:
> 
> Depends-on: 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB invalidate")

Good idea, I will.

>> Fixes: 0cef77c7798a7 ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask")
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/powerpc/include/asm/tlb.h       | 13 -------------
>>  arch/powerpc/mm/book3s64/radix_tlb.c | 23 ++++++++++++++++-------
>>  2 files changed, 16 insertions(+), 20 deletions(-)
> 
> One minor nit below if you're respinning anyway.
> 
> You know this stuff better than me, but I still reviewed it and it seems
> good to me.
> 
> Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>

Thanks.

> 
>> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
>> index fbc6f3002f23..d97f061fecac 100644
>> --- a/arch/powerpc/include/asm/tlb.h
>> +++ b/arch/powerpc/include/asm/tlb.h
>> @@ -66,19 +66,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
>>  		return false;
>>  	return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
>>  }
>> -static inline void mm_reset_thread_local(struct mm_struct *mm)
>> -{
>> -	WARN_ON(atomic_read(&mm->context.copros) > 0);
>> -	/*
>> -	 * It's possible for mm_access to take a reference on mm_users to
>> -	 * access the remote mm from another thread, but it's not allowed
>> -	 * to set mm_cpumask, so mm_users may be > 1 here.
>> -	 */
>> -	WARN_ON(current->mm != mm);
>> -	atomic_set(&mm->context.active_cpus, 1);
>> -	cpumask_clear(mm_cpumask(mm));
>> -	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
>> -}
>>  #else /* CONFIG_PPC_BOOK3S_64 */
>>  static inline int mm_is_thread_local(struct mm_struct *mm)
>>  {
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index 0d233763441f..a421a0e3f930 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -645,19 +645,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
>>  	struct mm_struct *mm = arg;
>>  	unsigned long pid = mm->context.id;
>>  
>> +	/*
>> +	 * A kthread could have done a mmget_not_zero() after the flushing CPU
>> +	 * checked mm_users == 1, and be in the process of kthread_use_mm when
>                                 ^
>                                 in mm_is_singlethreaded()
> 
> Adding that reference would help join the dots for a new reader I think.

Yes you're right I can change that.

Thanks,
Nick

^ permalink raw reply

* Re: [RFC PATCH 1/2] KVM: PPC: Use the ppc_inst type
From: Paul Mackerras @ 2020-09-02  9:32 UTC (permalink / raw)
  To: Jordan Niethe; +Cc: linuxppc-dev, Nicholas Piggin, kvm-ppc
In-Reply-To: <CACzsE9qrgs8ujQ7HeHVo-8oyY2bdwFVnVxR5dEZns5V7qK7Cbg@mail.gmail.com>

On Wed, Sep 02, 2020 at 06:00:24PM +1000, Jordan Niethe wrote:
> On Wed, Sep 2, 2020 at 4:18 PM Paul Mackerras <paulus@ozlabs.org> wrote:
> >
> > On Thu, Aug 20, 2020 at 01:39:21PM +1000, Jordan Niethe wrote:
> > > The ppc_inst type was added to help cope with the addition of prefixed
> > > instructions to the ISA. Convert KVM to use this new type for dealing
> > > with instructions. For now do not try to add further support for
> > > prefixed instructions.
> >
> > This change does seem to splatter itself across a lot of code that
> > mostly or exclusively runs on machines which are not POWER10 and will
> > never need to handle prefixed instructions, unfortunately.  I wonder
> > if there is a less invasive way to approach this.
> Something less invasive would be good.
> >
> > In particular we are inflicting this 64-bit struct on 32-bit platforms
> > unnecessarily (I assume, correct me if I am wrong here).
> No, that is something that I wanted to avoid; on 32-bit platforms
> it is a 32-bit struct:
> 
> struct ppc_inst {
>         u32 val;
> #ifdef CONFIG_PPC64
>         u32 suffix;
> #endif
> } __packed;
> >
> > How would it be to do something like:
> >
> > typedef unsigned long ppc_inst_t;
> >
> > so it is 32 bits on 32-bit platforms and 64 bits on 64-bit platforms,
> > and then use that instead of 'struct ppc_inst'?  You would still need
> > to change the function declarations but I think most of the function
> > bodies would not need to be changed.  In particular you would avoid a
> > lot of the churn related to having to add ppc_inst_val() and suchlike.
> 
> Would the idea be to get rid of `struct ppc_inst` entirely or just not
> use it in kvm?
> In an earlier series I did something similar (at least code shared
> between 32bit and 64bit would need helpers, but 32bit only code need
> not change):
> 
> #ifdef __powerpc64__
> 
> typedef struct ppc_inst {
>     union {
>         struct {
>             u32 word;
>             u32 pad;
>         } __packed;
>         struct {
>             u32 prefix;
>             u32 suffix;
>         } __packed;
>     };
> } ppc_inst;
> 
> #else /* !__powerpc64__ */
> 
> typedef u32 ppc_inst;
> #endif
> 
> However mpe wanted to avoid using a typedef
> (https://patchwork.ozlabs.org/comment/2391845/)

Well it doesn't have to be typedef'd, it could just be "unsigned
long", which is used in other places for things that want to be 32-bit
on 32-bit machines and 64-bit on 64-bit machines.

I do however think that it should be a numeric type so that we can
mask, shift and compare it more easily.  I know that's less "abstract"
but it's also a lot less obfuscated and I think that will lead to
clearer code.  If you got the opposite advice from Michael Ellerman or
Nick Piggin then I will discuss it with them.

> We did also talk about just using a u64 for instructions
> (https://lore.kernel.org/linuxppc-dev/1585028462.t27rstc2uf.astroid@bobo.none/)
> but the concern was that as prefixed instructions act as two separate
> u32s (prefix is always before the suffix regardless of endianness)
> keeping it as a u64 would lead to lot of macros and potential
> confusion.
> But it does seem that if that can avoid a lot of needless churn it might
> be worth the trade-off.

	u32 *ip;

	instr = *ip++;
	if (is_prefix(instr) && is_suitably_aligned(ip))
		instr = (instr << 32) | *ip++;

would avoid the endian issues pretty cleanly I think.  In other words
the prefix would always be the high half of the 64-bit value, so you
can't just do a single 64-bit load of the instruction on little-endian
platforms; but you can't do a single 64-bit load for other reasons as
well, such as alignment.

Paul.

^ permalink raw reply

* Re: [RFC PATCH 2/2] KVM: PPC: Book3S HV: Support prefixed instructions
From: Jordan Niethe @ 2020-09-02  9:19 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20200902061829.GF272502@thinks.paulus.ozlabs.org>

On Wed, Sep 2, 2020 at 4:18 PM Paul Mackerras <paulus@ozlabs.org> wrote:
>
> On Thu, Aug 20, 2020 at 01:39:22PM +1000, Jordan Niethe wrote:
> > There are two main places where instructions are loaded from the guest:
> >     * Emulate loadstore - such as when performing MMIO emulation
> >       triggered by an HDSI
> >     * After an HV emulation assistance interrupt (e40)
> >
> > If it is a prefixed instruction that triggers these cases, its suffix
> > must be loaded. Use the SRR1_PREFIX bit to decide if a suffix needs to
> > be loaded. Make sure if this bit is set inject_interrupt() also sets it
> > when giving an interrupt to the guest.
> >
> > ISA v3.10 extends the Hypervisor Emulation Instruction Register (HEIR)
> > to 64 bits long to accommodate prefixed instructions. For interrupts
> > caused by a word instruction the instruction is loaded into bits 32:63
> > and bits 0:31 are zeroed. When caused by a prefixed instruction the
> > prefix and suffix are loaded into bits 0:63.
> >
> > Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
> > ---
> >  arch/powerpc/kvm/book3s.c               | 15 +++++++++++++--
> >  arch/powerpc/kvm/book3s_64_mmu_hv.c     | 10 +++++++---
> >  arch/powerpc/kvm/book3s_hv_builtin.c    |  3 +++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 ++++++++++++++
> >  4 files changed, 37 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> > index 70d8967acc9b..18b1928a571b 100644
> > --- a/arch/powerpc/kvm/book3s.c
> > +++ b/arch/powerpc/kvm/book3s.c
> > @@ -456,13 +456,24 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
> >  {
> >       ulong pc = kvmppc_get_pc(vcpu);
> >       u32 word;
> > +     u64 doubleword;
> >       int r;
> >
> >       if (type == INST_SC)
> >               pc -= 4;
> >
> > -     r = kvmppc_ld(vcpu, &pc, sizeof(u32), &word, false);
> > -     *inst = ppc_inst(word);
> > +     if ((kvmppc_get_msr(vcpu) & SRR1_PREFIXED)) {
> > +             r = kvmppc_ld(vcpu, &pc, sizeof(u64), &doubleword, false);
>
> Should we also have a check here that the doubleword is not crossing a
> page boundary?  I can't think of a way to get this code to cross a
> page boundary, assuming the hardware is working correctly, but it
> makes me just a little nervous.
I didn't think it could happen but I will add a check to be safe.
>
> > +#ifdef CONFIG_CPU_LITTLE_ENDIAN
> > +             *inst = ppc_inst_prefix(doubleword & 0xffffffff, doubleword >> 32);
> > +#else
> > +             *inst = ppc_inst_prefix(doubleword >> 32, doubleword & 0xffffffff);
> > +#endif
>
> Ick.  Is there a cleaner way to do this?
Would it be nicer to read the prefix as u32 then the suffix as a u32 too?
>
> > +     } else {
> > +             r = kvmppc_ld(vcpu, &pc, sizeof(u32), &word, false);
> > +             *inst = ppc_inst(word);
> > +     }
> > +
> >       if (r == EMULATE_DONE)
> >               return r;
> >       else
> > diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> > index 775ce41738ce..0802471f4856 100644
> > --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> > +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> > @@ -411,9 +411,13 @@ static int instruction_is_store(struct ppc_inst instr)
> >       unsigned int mask;
> >
> >       mask = 0x10000000;
> > -     if ((ppc_inst_val(instr) & 0xfc000000) == 0x7c000000)
> > -             mask = 0x100;           /* major opcode 31 */
> > -     return (ppc_inst_val(instr) & mask) != 0;
> > +     if (ppc_inst_prefixed(instr)) {
> > +             return (ppc_inst_suffix(instr) & mask) != 0;
> > +     } else {
> > +             if ((ppc_inst_val(instr) & 0xfc000000) == 0x7c000000)
> > +                     mask = 0x100;           /* major opcode 31 */
> > +             return (ppc_inst_val(instr) & mask) != 0;
> > +     }
>
> The way the code worked before, the mask depended on whether the
> instruction was a D-form (or DS-form or other variant) instruction,
> where you can tell loads and stores apart by looking at the major
> opcode, or an X-form instruction, where you look at the minor opcode.
>
> Now we are only looking at the minor opcode if it is not a prefixed
> instruction.  Are there no X-form prefixed loads or stores?
I could not see any X-form loads/stores so I went with just that.
But checking the ISA, it does mention "..X-form instructions that are
preceded by an MLS-form or MMLS-form prefix...", so I shall use the
other mask too.
>
> Paul.
Thank you for the comments and suggestions.

^ permalink raw reply

* [PATCH] mm: check for memory's node later during boot
From: Laurent Dufour @ 2020-09-02  9:09 UTC (permalink / raw)
  To: linux-kernel, linuxppc-dev, linux-mm, Greg Kroah-Hartman
  Cc: nathanl, cheloha, Andrew Morton, Rafael J. Wysocki

register_mem_sect_under_node() is checking the memory block's node id only
if the system state is "SYSTEM_BOOTING". On PowerPC, the memory blocks are
registered while the system state is "SYSTEM_SCHEDULING", the one before
SYSTEM_RUNNING.

The consequence on a PowerPC guest with interleaved memory node ranges is
that some memory blocks could be assigned to multiple nodes in sysfs. This
later prevents some memory hot-plug and hot-unplug operations from
succeeding because links remain. A panic like the following is then displayed:

------------[ cut here ]------------
kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
Oops: Exception in kernel mode, sig: 5 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
NIP:  c000000000403f34 LR: c000000000403f2c CTR: 0000000000000000
REGS: c0000004876e3660 TRAP: 0700   Not tainted  (5.9.0-rc1+)
MSR:  800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 24000448  XER: 20040000
CFAR: c000000000846d20 IRQMASK: 0
GPR00: c000000000403f2c c0000004876e38f0 c0000000012f6f00 ffffffffffffffef
GPR04: 0000000000000227 c0000004805ae680 0000000000000000 00000004886f0000
GPR08: 0000000000000226 0000000000000003 0000000000000002 fffffffffffffffd
GPR12: 0000000088000484 c00000001ec96280 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000004 0000000000000003
GPR20: c00000047814ffe0 c0000007ffff7c08 0000000000000010 c0000000013332c8
GPR24: 0000000000000000 c0000000011f6cc0 0000000000000000 0000000000000000
GPR28: ffffffffffffffef 0000000000000001 0000000150000000 0000000010000000
NIP [c000000000403f34] add_memory_resource+0x244/0x340
LR [c000000000403f2c] add_memory_resource+0x23c/0x340
Call Trace:
[c0000004876e38f0] [c000000000403f2c] add_memory_resource+0x23c/0x340 (unreliable)
[c0000004876e39c0] [c00000000040408c] __add_memory+0x5c/0xf0
[c0000004876e39f0] [c0000000000e2b94] dlpar_add_lmb+0x1b4/0x500
[c0000004876e3ad0] [c0000000000e3888] dlpar_memory+0x1f8/0xb80
[c0000004876e3b60] [c0000000000dc0d0] handle_dlpar_errorlog+0xc0/0x190
[c0000004876e3bd0] [c0000000000dc398] dlpar_store+0x198/0x4a0
[c0000004876e3c90] [c00000000072e630] kobj_attr_store+0x30/0x50
[c0000004876e3cb0] [c00000000051f954] sysfs_kf_write+0x64/0x90
[c0000004876e3cd0] [c00000000051ee40] kernfs_fop_write+0x1b0/0x290
[c0000004876e3d20] [c000000000438dd8] vfs_write+0xe8/0x290
[c0000004876e3d70] [c0000000004391ac] ksys_write+0xdc/0x130
[c0000004876e3dc0] [c000000000034e40] system_call_exception+0x160/0x270
[c0000004876e3e20] [c00000000000d740] system_call_common+0xf0/0x27c
Instruction dump:
48442e35 60000000 0b030000 3cbe0001 7fa3eb78 7bc48402 38a5fffe 7ca5fa14
78a58402 48442db1 60000000 7c7c1b78 <0b030000> 7f23cb78 4bda371d 60000000
---[ end trace 562fd6c109cd0fb2 ]---

To prevent these multiple links, perform the node check for all states
prior to SYSTEM_RUNNING.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 4fbce633910e ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()")
---
 drivers/base/node.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 508b80f6329b..8e9f39b562ef 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -789,7 +789,7 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
 		 * case, during hotplug we know that all pages in the memory
 		 * block belong to the same node.
 		 */
-		if (system_state == SYSTEM_BOOTING) {
+		if (system_state < SYSTEM_RUNNING) {
 			page_nid = get_nid_for_pfn(pfn);
 			if (page_nid < 0)
 				continue;
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH] cpuidle-pseries: Fix CEDE latency conversion from tb to us
From: Gautham R Shenoy @ 2020-09-02  8:35 UTC (permalink / raw)
  To: Joel Stanley
  Cc: Gautham R. Shenoy, linux-pm, Rafael J. Wysocki,
	Linux Kernel Mailing List, Vaidyanathan Srinivasan, linuxppc-dev
In-Reply-To: <CACPK8XfZdnKusEuu8i=-aH=Wfr6X6sMrvX=btFq9PtnXJ2w-SQ@mail.gmail.com>

Hello Joel,

On Wed, Sep 02, 2020 at 01:08:35AM +0000, Joel Stanley wrote:
> On Tue, 1 Sep 2020 at 14:09, Gautham R. Shenoy <ego@linux.vnet.ibm.com> wrote:
> >
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> >
> > commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> > CEDE(0)") sets the exit latency of CEDE(0) based on the latency values
> > of the Extended CEDE states advertised by the platform. The values
> > advertised by the platform are in timebase ticks. However the cpuidle
> > framework requires the latency values in microseconds.
> >
> > If the tb-ticks value advertised by the platform correspond to a value
> > smaller than 1us, during the conversion from tb-ticks to microseconds,
> > in the current code, the result becomes zero. This is incorrect as it
> > puts a CEDE state on par with the snooze state.
> >
> > This patch fixes this by rounding up the result obtained while
> > converting the latency value from tb-ticks to microseconds.
> >
> > Fixes: commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for
> > CEDE(0)")
> >
> > Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> 
> Reviewed-by: Joel Stanley <joel@jms.id.au>
>

Thanks for reviewing the fix.

> Should you check for the zero case and print a warning?

Yes, that would be better. I will post a v2 with that.

> 
> > ---
> >  drivers/cpuidle/cpuidle-pseries.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
> > index ff6d99e..9043358 100644
> > --- a/drivers/cpuidle/cpuidle-pseries.c
> > +++ b/drivers/cpuidle/cpuidle-pseries.c
> > @@ -361,7 +361,7 @@ static void __init fixup_cede0_latency(void)
> >         for (i = 0; i < nr_xcede_records; i++) {
> >                 struct xcede_latency_record *record = &payload->records[i];
> >                 u64 latency_tb = be64_to_cpu(record->latency_ticks);
> > -               u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
> > +               u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC);
> >
> >                 if (latency_us < min_latency_us)
> >                         min_latency_us = latency_us;
> > --
> > 1.9.4
> >

^ permalink raw reply

* Re: ptrace_syscall_32 is failing
From: Thomas Gleixner @ 2020-09-02  8:29 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-s390, linuxppc-dev, Vasily Gorbik, Brian Gerst,
	Heiko Carstens, X86 ML, LKML, Christian Borntraeger,
	Paul Mackerras, Catalin Marinas, Andy Lutomirski, Will Deacon,
	linux-arm-kernel
In-Reply-To: <CALCETrUpjUPPvnPuS9fP4jgid7U_qdU_yTKSq9PjJ=z2w9HvHg@mail.gmail.com>

On Tue, Sep 01 2020 at 17:09, Andy Lutomirski wrote:
> On Tue, Sep 1, 2020 at 4:50 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>> > I think that they almost work for x86, but not quite as
>> > indicated by this bug.  Even if we imagine we can somehow hack around
>> > this bug, I imagine we're going to find other problems with this
>> > model, e.g. the potential upcoming exit problem I noted in my review.
>>
>> What's the upcoming problem?
>
> If we ever want to get single-stepping fully correct across syscalls,
> we might need to inject SIGTRAP on syscall return. This would be more
> awkward if we can't run instrumentable code after the syscall part of
> the syscall is done.

We run a lot of instrumentable code after sys_foo() returns. Otherwise
all the TIF work would not be possible at all.

But you might tell me where exactly you want to inject the SIGTRAP in
the syscall exit code flow.

>> I don't think we want that in general. The current variant is perfectly
>> fine for everything except the 32bit fast syscall nonsense. Also
>> irqentry_entry/exit is not equivalent to the syscall_enter/exit
>> counterparts.
>
> If there are any architectures in which actual work is needed to
> figure out whether something is a syscall in the first place, they'll
> want to do the usual kernel entry work before the syscall entry work.

That's low level entry code which does not require RCU, lockdep, tracing
or whatever muck we setup before actual work can be done.

arch_asm_entry()
  ...
  arch_c_entry(cause) {
    switch(cause) {
      case EXCEPTION: arch_c_exception(...);
      case SYSCALL: arch_c_syscall(...);
      ...
    }

You really want to differentiate between exception and syscall
entry/exit.

The splitting of syscall_enter_from_user_mode() is only necessary for
that 32bit fast syscall thing on x86 and there is no point to open code
it with two calls for e.g. do_syscall_64().

> Maybe your patch actually makes this possible -- I haven't digested
> all the details yet.
>
> Who advised you to drop the arch parameter?

Kees, IIRC, but I would have to search through the gazillions of mail
threads to be sure.

>> +       syscall_enter_from_user_mode_prepare(regs);
>
> I'm getting lost in all these "enter" functions...

It's not that hard.

     syscall_enter_from_user_mode_prepare()
+    syscall_enter_from_user_mode_work()
=    syscall_enter_from_user_mode()

That's exactly what you suggested just with the difference that it is
explicit for syscalls and not using irqentry_enter/exit().

If we would do that then instead of having a single call for sane
syscall paths:

  arch_c_entry()
     nr = syscall_enter_from_user_mode();

or for that 32bit fast syscall nonsense the split variant:

  arch_c_entry()
     syscall_enter_from_user_mode_prepare();
     do_fast_syscall_muck();
     nr = syscall_enter_from_user_mode_work();

we'd have:

  arch_c_entry()
     irqentry_enter();
     local_irq_enable();
     nr = syscall_enter_from_user_mode_work();
     ...

which enforces two calls for sane entries and more code in arch/....

Thanks,

        tglx

^ permalink raw reply

* Re: [PATCH] powerpc: Fix random segfault when freeing hugetlb range
From: Aneesh Kumar K.V @ 2020-09-02  8:15 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <96409d24-c8bf-7f3a-0a81-0830174d6bcc@csgroup.eu>

On 9/2/20 1:41 PM, Christophe Leroy wrote:
> 
> 
> Le 02/09/2020 à 05:23, Aneesh Kumar K.V a écrit :
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>
>>> The following random segfault is observed from time to time with
>>> map_hugetlb selftest:
>>>
>>> root@localhost:~# ./map_hugetlb 1 19
>>> 524288 kB hugepages
>>> Mapping 1 Mbytes
>>> Segmentation fault
>>>
>>> [   31.219972] map_hugetlb[365]: segfault (11) at 117 nip 77974f8c lr 
>>> 779a6834 code 1 in ld-2.23.so[77966000+21000]
>>> [   31.220192] map_hugetlb[365]: code: 9421ffc0 480318d1 93410028 
>>> 90010044 9361002c 93810030 93a10034 93c10038
>>> [   31.220307] map_hugetlb[365]: code: 93e1003c 93210024 8123007c 
>>> 81430038 <80e90004> 814a0004 7f443a14 813a0004
>>> [   31.221911] BUG: Bad rss-counter state mm:(ptrval) 
>>> type:MM_FILEPAGES val:33
>>> [   31.229362] BUG: Bad rss-counter state mm:(ptrval) 
>>> type:MM_ANONPAGES val:5
>>>
>>> This fault is due to hugetlb_free_pgd_range() freeing page tables
>>> that are also used by regular pages.
>>>
>>> As explained in the comment at the beginning of
>>> hugetlb_free_pgd_range(), the verification done in free_pgd_range()
>>> on floor and ceiling is not done here, which means
>>> hugetlb_free_pte_range() can free outside the expected range.
>>>
>>> As the verification cannot be done in hugetlb_free_pgd_range(), it
>>> must be done in hugetlb_free_pte_range().
>>>
>>
>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>
>>> Fixes: b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard 
>>> pages.")
>>> Cc: stable@vger.kernel.org
>>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>>> ---
>>>   arch/powerpc/mm/hugetlbpage.c | 18 ++++++++++++++++--
>>>   1 file changed, 16 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/powerpc/mm/hugetlbpage.c 
>>> b/arch/powerpc/mm/hugetlbpage.c
>>> index 26292544630f..e7ae2a2c4545 100644
>>> --- a/arch/powerpc/mm/hugetlbpage.c
>>> +++ b/arch/powerpc/mm/hugetlbpage.c
>>> @@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather 
>>> *tlb, hugepd_t *hpdp, int pdshif
>>>                    get_hugepd_cache_index(pdshift - shift));
>>>   }
>>> -static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t 
>>> *pmd, unsigned long addr)
>>> +static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
>>> +                   unsigned long addr, unsigned long end,
>>> +                   unsigned long floor, unsigned long ceiling)
>>>   {
>>> +    unsigned long start = addr;
>>>       pgtable_t token = pmd_pgtable(*pmd);
>>> +    start &= PMD_MASK;
>>> +    if (start < floor)
>>> +        return;
>>> +    if (ceiling) {
>>> +        ceiling &= PMD_MASK;
>>> +        if (!ceiling)
>>> +            return;
>>> +    }
>>> +    if (end - 1 > ceiling - 1)
>>> +        return;
>>> +
>>
>> We do repeat that for pud/pmd/pte hugetlb_free_range. Can we consolidate
>> that with comment explaining we are checking if the pgtable entry is
>> mapping outside the range?
> 
> I was thinking about refactoring that into a helper and add all the 
> necessary comments to explain what it does.
> 
> Will do that in a followup series if you are OK. This patch is a bug fix 
> and also has to go through stable.
> 

agreed.

Thanks.
-aneesh

^ permalink raw reply

* Re: [PATCH 0/2] dma-mapping: update default segment_boundary_mask
From: Niklas Schnelle @ 2020-09-02  8:13 UTC (permalink / raw)
  To: Nicolin Chen, hch
  Cc: linux-ia64, James.Bottomley, paulus, hpa, sparclinux, sfr, deller,
	x86, borntraeger, mingo, mattst88, fenghua.yu, gor, linux-s390,
	hca, ink, tglx, gerald.schaefer, rth, tony.luck, linux-parisc,
	linux-kernel, linux-alpha, bp, linuxppc-dev, davem
In-Reply-To: <20200901221646.26491-1-nicoleotsuka@gmail.com>



On 9/2/20 12:16 AM, Nicolin Chen wrote:
> These two patches are to update default segment_boundary_mask.
> 
> PATCH-1 fixes overflow issues in callers of dma_get_seg_boundary.
> Previous version was a series: https://lkml.org/lkml/2020/8/31/1026
> 
> Then PATCH-2 sets default segment_boundary_mask to ULONG_MAX.
> 
> Nicolin Chen (2):
>   dma-mapping: introduce dma_get_seg_boundary_nr_pages()
>   dma-mapping: set default segment_boundary_mask to ULONG_MAX

I gave both of your patches a quick test ride on a couple of dev mainframes;
NVMe, ConnectX and virtio-pci devices all seem to work fine.
I already commented on Christoph's mail that I like the helper approach,
so as for s390 you can add my

Acked-by: Niklas Schnelle <schnelle@linux.ibm.com>

> 
>  arch/alpha/kernel/pci_iommu.c    |  7 +------
>  arch/ia64/hp/common/sba_iommu.c  |  3 +--
>  arch/powerpc/kernel/iommu.c      |  9 ++-------
>  arch/s390/pci/pci_dma.c          |  6 ++----
>  arch/sparc/kernel/iommu-common.c | 10 +++-------
>  arch/sparc/kernel/iommu.c        |  3 +--
>  arch/sparc/kernel/pci_sun4v.c    |  3 +--
>  arch/x86/kernel/amd_gart_64.c    |  3 +--
>  drivers/parisc/ccio-dma.c        |  3 +--
>  drivers/parisc/sba_iommu.c       |  3 +--
>  include/linux/dma-mapping.h      | 21 ++++++++++++++++++++-
>  11 files changed, 34 insertions(+), 37 deletions(-)
> 

^ permalink raw reply

* Re: [PATCH] powerpc: Fix random segfault when freeing hugetlb range
From: Christophe Leroy @ 2020-09-02  8:11 UTC (permalink / raw)
  To: Aneesh Kumar K.V, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <875z8weua7.fsf@linux.ibm.com>



Le 02/09/2020 à 05:23, Aneesh Kumar K.V a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> 
>> The following random segfault is observed from time to time with
>> map_hugetlb selftest:
>>
>> root@localhost:~# ./map_hugetlb 1 19
>> 524288 kB hugepages
>> Mapping 1 Mbytes
>> Segmentation fault
>>
>> [   31.219972] map_hugetlb[365]: segfault (11) at 117 nip 77974f8c lr 779a6834 code 1 in ld-2.23.so[77966000+21000]
>> [   31.220192] map_hugetlb[365]: code: 9421ffc0 480318d1 93410028 90010044 9361002c 93810030 93a10034 93c10038
>> [   31.220307] map_hugetlb[365]: code: 93e1003c 93210024 8123007c 81430038 <80e90004> 814a0004 7f443a14 813a0004
>> [   31.221911] BUG: Bad rss-counter state mm:(ptrval) type:MM_FILEPAGES val:33
>> [   31.229362] BUG: Bad rss-counter state mm:(ptrval) type:MM_ANONPAGES val:5
>>
>> This fault is due to hugetlb_free_pgd_range() freeing page tables
>> that are also used by regular pages.
>>
>> As explained in the comment at the beginning of
>> hugetlb_free_pgd_range(), the verification done in free_pgd_range()
>> on floor and ceiling is not done here, which means
>> hugetlb_free_pte_range() can free outside the expected range.
>>
>> As the verification cannot be done in hugetlb_free_pgd_range(), it
>> must be done in hugetlb_free_pte_range().
>>
> 
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> 
>> Fixes: b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as standard pages.")
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
>> ---
>>   arch/powerpc/mm/hugetlbpage.c | 18 ++++++++++++++++--
>>   1 file changed, 16 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
>> index 26292544630f..e7ae2a2c4545 100644
>> --- a/arch/powerpc/mm/hugetlbpage.c
>> +++ b/arch/powerpc/mm/hugetlbpage.c
>> @@ -330,10 +330,24 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
>>   				 get_hugepd_cache_index(pdshift - shift));
>>   }
>>   
>> -static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr)
>> +static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
>> +				   unsigned long addr, unsigned long end,
>> +				   unsigned long floor, unsigned long ceiling)
>>   {
>> +	unsigned long start = addr;
>>   	pgtable_t token = pmd_pgtable(*pmd);
>>   
>> +	start &= PMD_MASK;
>> +	if (start < floor)
>> +		return;
>> +	if (ceiling) {
>> +		ceiling &= PMD_MASK;
>> +		if (!ceiling)
>> +			return;
>> +	}
>> +	if (end - 1 > ceiling - 1)
>> +		return;
>> +
> 
> We do repeat that for pud/pmd/pte hugetlb_free_range. Can we consolidate
> that with comment explaining we are checking if the pgtable entry is
> mapping outside the range?

I was thinking about refactoring that into a helper and add all the 
necessary comments to explain what it does.

Will do that in a followup series if you are OK. This patch is a bug fix 
and also has to go through stable.

Christophe

^ permalink raw reply

* Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v2
From: Christoph Hellwig @ 2020-09-02  8:10 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-arch, Kees Cook, x86, linuxppc-dev, linux-kernel,
	linux-fsdevel, Linus Torvalds, Christoph Hellwig
In-Reply-To: <20200901172512.GI1236603@ZenIV.linux.org.uk>

On Tue, Sep 01, 2020 at 06:25:12PM +0100, Al Viro wrote:
> On Tue, Sep 01, 2020 at 07:13:00PM +0200, Christophe Leroy wrote:
> 
> >     10.92%  dd       [kernel.kallsyms]  [k] iov_iter_zero
> 
> Interesting...  Could you get an instruction-level profile inside iov_iter_zero(),
> along with the disassembly of that sucker?

So the interesting thing here is that none of these code paths
should have changed at all, and the biggest items on the profile look
the same modulo some minor reordering.

^ permalink raw reply

* Re: [PATCH 05/10] lkdtm: disable set_fs-based tests for !CONFIG_SET_FS
From: Christoph Hellwig @ 2020-09-02  8:09 UTC (permalink / raw)
  To: Kees Cook
  Cc: linux-arch, linuxppc-dev, the arch/x86 maintainers,
	Linux Kernel Mailing List, Al Viro, linux-fsdevel, Linus Torvalds,
	Christoph Hellwig
In-Reply-To: <202009011156.0F49882@keescook>

On Tue, Sep 01, 2020 at 11:57:37AM -0700, Kees Cook wrote:
> On Sat, Aug 29, 2020 at 11:24:06AM +0200, Christoph Hellwig wrote:
> > On Thu, Aug 27, 2020 at 11:06:28AM -0700, Linus Torvalds wrote:
> > > On Thu, Aug 27, 2020 at 8:00 AM Christoph Hellwig <hch@lst.de> wrote:
> > > >
> > > > Once we can't manipulate the address limit, we also can't test what
> > > > happens when the manipulation is abused.
> > > 
> > > Just remove these tests entirely.
> > > 
> > > Once set_fs() doesn't exist on x86, the tests no longer make any sense
> > > what-so-ever, because test coverage will be basically zero.
> > > 
> > > So don't make the code uglier just to maintain a fiction that
> > > something is tested when it isn't really.
> > 
> > Sure fine with me unless Kees screams.
> 
> To clarify: if any of x86, arm64, arm, powerpc, riscv, and s390 are
> using set_fs(), I want to keep this test. "ugly" is fine in lkdtm. :)

And Linus wants them gone entirely, so I'll need a stage fight between
the two of you.  At least for this merge window I'm only planning on
x86 and power, plus maybe riscv if I get the work done in time.  Although
help from the maintainers would be welcome.  s390 has a driver that
still uses set_fs that will need some surgery, although it shouldn't
be too bad, but arm will be a piece of work.  Unless I get help it will
take a while.

^ permalink raw reply

* Re: [RFC PATCH 1/2] KVM: PPC: Use the ppc_inst type
From: Jordan Niethe @ 2020-09-02  8:00 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: linuxppc-dev, Nicholas Piggin, kvm-ppc
In-Reply-To: <20200902061318.GE272502@thinks.paulus.ozlabs.org>

On Wed, Sep 2, 2020 at 4:18 PM Paul Mackerras <paulus@ozlabs.org> wrote:
>
> On Thu, Aug 20, 2020 at 01:39:21PM +1000, Jordan Niethe wrote:
> > The ppc_inst type was added to help cope with the addition of prefixed
> > instructions to the ISA. Convert KVM to use this new type for dealing
> > with instructions. For now do not try to add further support for
> > prefixed instructions.
>
> This change does seem to splatter itself across a lot of code that
> mostly or exclusively runs on machines which are not POWER10 and will
> never need to handle prefixed instructions, unfortunately.  I wonder
> if there is a less invasive way to approach this.
Something less invasive would be good.
>
> In particular we are inflicting this 64-bit struct on 32-bit platforms
> unnecessarily (I assume, correct me if I am wrong here).
No, that is something that I wanted to avoid; on 32-bit platforms
it is a 32-bit struct:

struct ppc_inst {
        u32 val;
#ifdef CONFIG_PPC64
        u32 suffix;
#endif
} __packed;
>
> How would it be to do something like:
>
> typedef unsigned long ppc_inst_t;
>
> so it is 32 bits on 32-bit platforms and 64 bits on 64-bit platforms,
> and then use that instead of 'struct ppc_inst'?  You would still need
> to change the function declarations but I think most of the function
> bodies would not need to be changed.  In particular you would avoid a
> lot of the churn related to having to add ppc_inst_val() and suchlike.

Would the idea be to get rid of `struct ppc_inst` entirely or just not
use it in kvm?
In an earlier series I did something similar (at least code shared
between 32bit and 64bit would need helpers, but 32bit only code need
not change):

#ifdef __powerpc64__

typedef struct ppc_inst {
    union {
        struct {
            u32 word;
            u32 pad;
        } __packed;
        struct {
            u32 prefix;
            u32 suffix;
        } __packed;
    };
} ppc_inst;

#else /* !__powerpc64__ */

typedef u32 ppc_inst;
#endif

However mpe wanted to avoid using a typedef
(https://patchwork.ozlabs.org/comment/2391845/)

We did also talk about just using a u64 for instructions
(https://lore.kernel.org/linuxppc-dev/1585028462.t27rstc2uf.astroid@bobo.none/)
but the concern was that as prefixed instructions act as two separate
u32s (prefix is always before the suffix regardless of endianness)
keeping it as a u64 would lead to a lot of macros and potential
confusion.
But it does seem that if that can avoid a lot of needless churn it might
be worth the trade-off.
>
> > -static inline unsigned make_dsisr(unsigned instr)
> > +static inline unsigned make_dsisr(struct ppc_inst instr)
> >  {
> >       unsigned dsisr;
> > +     u32 word = ppc_inst_val(instr);
> >
> >
> >       /* bits  6:15 --> 22:31 */
> > -     dsisr = (instr & 0x03ff0000) >> 16;
> > +     dsisr = (word & 0x03ff0000) >> 16;
> >
> >       if (IS_XFORM(instr)) {
> >               /* bits 29:30 --> 15:16 */
> > -             dsisr |= (instr & 0x00000006) << 14;
> > +             dsisr |= (word & 0x00000006) << 14;
> >               /* bit     25 -->    17 */
> > -             dsisr |= (instr & 0x00000040) << 8;
> > +             dsisr |= (word & 0x00000040) << 8;
> >               /* bits 21:24 --> 18:21 */
> > -             dsisr |= (instr & 0x00000780) << 3;
> > +             dsisr |= (word & 0x00000780) << 3;
> >       } else {
> >               /* bit      5 -->    17 */
> > -             dsisr |= (instr & 0x04000000) >> 12;
> > +             dsisr |= (word & 0x04000000) >> 12;
> >               /* bits  1: 4 --> 18:21 */
> > -             dsisr |= (instr & 0x78000000) >> 17;
> > +             dsisr |= (word & 0x78000000) >> 17;
> >               /* bits 30:31 --> 12:13 */
> >               if (IS_DSFORM(instr))
> > -                     dsisr |= (instr & 0x00000003) << 18;
> > +                     dsisr |= (word & 0x00000003) << 18;
>
> Here I would have done something like:
>
> > -static inline unsigned make_dsisr(unsigned instr)
> > +static inline unsigned make_dsisr(struct ppc_inst pi)
> >  {
> >       unsigned dsisr;
> > +     u32 instr = ppc_inst_val(pi);
>
> and left the rest of the function unchanged.
That is better.
>
> At first I wondered why we still had that function, since IBM Power
> CPUs have not set DSISR on an alignment interrupt since POWER3 days.
> It turns out this function is used by PR KVM when it is emulating
> one of the old 32-bit PowerPC CPUs (601, 603, 604, 750, 7450 etc.).
>
> > diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
>
> Despite the file name, this code is not used on IBM Power servers.
> It is for platforms which run under an ePAPR (not server PAPR)
> hypervisor (which would be a KVM variant, but generally book E KVM not
> book 3S).
>
> Paul.

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker
From: Christophe Leroy @ 2020-09-02  7:56 UTC (permalink / raw)
  To: Nick Desaulniers, Michael Ellerman, Nicholas Piggin
  Cc: Christophe Leroy, Joe Lawrence, Kees Cook, Fangrui Song,
	linux-kernel, clang-built-linux, Paul Mackerras, linuxppc-dev
In-Reply-To: <20200901222523.1941988-3-ndesaulniers@google.com>



On 9/1/20 10:25 PM, Nick Desaulniers wrote:
> Rather than invoke the compiler as the driver, use the linker. That way
> we can check --orphan-handling=warn support correctly, as cc-ldoption
> was removed in
> commit 055efab3120b ("kbuild: drop support for cc-ldoption").
> 
> Requires dropping the .got section.  I couldn't find how it was used in
> the vdso32.
> 
> Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan sections")
> Link: https://lore.kernel.org/lkml/CAKwvOdnn3wxYdJomvnveyD_njwRku3fABWT_bS92duihhywLJQ@mail.gmail.com/
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
> Not sure removing .got is a good idea or not.  Otherwise I observe the
> following link error:
> powerpc-linux-gnu-ld: warning: orphan section `.got' from `arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
> powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
> powerpc-linux-gnu-ld: final link failed: bad value
> 
> sigtramp.c doesn't mention anything from the GOT AFAICT, and doesn't
> look like it contains relocations that do, so I'm not sure where
> references to _GLOBAL_OFFSET_TABLE_ are coming from.

I'm getting the same but only when building for PPC64.
I don't get any reference to sigtramp.o though:

   CALL    scripts/checksyscalls.sh
   CALL    scripts/atomic/check-atomics.sh
   VDSO32A arch/powerpc/kernel/vdso32/sigtramp.o
   VDSO32A arch/powerpc/kernel/vdso32/gettimeofday.o
   VDSO32A arch/powerpc/kernel/vdso32/datapage.o
   VDSO32A arch/powerpc/kernel/vdso32/cacheflush.o
   VDSO32A arch/powerpc/kernel/vdso32/note.o
   VDSO32A arch/powerpc/kernel/vdso32/getcpu.o
   LD      arch/powerpc/kernel/vdso32/vdso32.so.dbg
powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: Bad value

(GCC 8.1, Binutils 2.30)

So it seems that the got section is being created by the linker. Don't 
know why though.


With GCC 10.1, binutils 2.34 I get:

   LDS     arch/powerpc/kernel/vdso32/vdso32.lds
   VDSO32A arch/powerpc/kernel/vdso32/sigtramp.o
   VDSO32A arch/powerpc/kernel/vdso32/gettimeofday.o
   VDSO32A arch/powerpc/kernel/vdso32/datapage.o
   VDSO32A arch/powerpc/kernel/vdso32/cacheflush.o
   VDSO32A arch/powerpc/kernel/vdso32/note.o
   VDSO32A arch/powerpc/kernel/vdso32/getcpu.o
   LD      arch/powerpc/kernel/vdso32/vdso32.so.dbg
powerpc64-linux-ld: warning: orphan section `.branch_lt' from 
`arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.branch_lt'
powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: bad value

I can't see any .branch_lt section when objdumping sigtramp.o or any 
other .o

When I move sigtramp.o at the end of the definition of obj-vdso32 in 
Makefile, I then get:

powerpc64-linux-ld: warning: orphan section `.branch_lt' from 
`arch/powerpc/kernel/vdso32/gettimeofday.o' being placed in section 
`.branch_lt'
powerpc64-linux-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
powerpc64-linux-ld: final link failed: bad value


gettimeofday.o now being the first object in obj-vdso32


Christophe

> 
>   arch/powerpc/kernel/vdso32/Makefile     | 7 +++++--
>   arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
>   2 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
> index 87ab1152d5ce..611a5951945a 100644
> --- a/arch/powerpc/kernel/vdso32/Makefile
> +++ b/arch/powerpc/kernel/vdso32/Makefile
> @@ -27,6 +27,9 @@ UBSAN_SANITIZE := n
>   ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
>   	-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
>   asflags-y := -D__VDSO32__ -s
> +ldflags-y := -shared -soname linux-vdso32.so.1 \
> +	$(call ld-option, --eh-frame-hdr) \
> +	$(call ld-option, --orphan-handling=warn) -T
>   
>   obj-y += vdso32_wrapper.o
>   extra-y += vdso32.lds
> @@ -49,8 +52,8 @@ $(obj-vdso32): %.o: %.S FORCE
>   	$(call if_changed_dep,vdso32as)
>   
>   # actual build commands
> -quiet_cmd_vdso32ld = VDSO32L $@
> -      cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
> +quiet_cmd_vdso32ld = LD      $@
> +      cmd_vdso32ld = $(cmd_ld)
>   quiet_cmd_vdso32as = VDSO32A $@
>         cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
>   
> diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> index 4c985467a668..0ccdebad18b8 100644
> --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
> +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> @@ -61,7 +61,6 @@ SECTIONS
>   	.fixup		: { *(.fixup) }
>   
>   	.dynamic	: { *(.dynamic) }		:text	:dynamic
> -	.got		: { *(.got) }			:text
>   	.plt		: { *(.plt) }
>   
>   	_end = .;
> @@ -108,7 +107,9 @@ SECTIONS
>   	.debug_varnames  0 : { *(.debug_varnames) }
>   
>   	/DISCARD/	: {
> +		*(.got)
>   		*(.note.GNU-stack)
> +		*(.branch_lt)
>   		*(.data .data.* .gnu.linkonce.d.* .sdata*)
>   		*(.bss .sbss .dynbss .dynsbss)
>   		*(.glink .iplt .plt .rela*)
> 

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker
From: Christophe Leroy @ 2020-09-02  6:46 UTC (permalink / raw)
  To: Nick Desaulniers, Michael Ellerman, Nicholas Piggin
  Cc: Christophe Leroy, Joe Lawrence, Kees Cook, Fangrui Song,
	linux-kernel, clang-built-linux, Paul Mackerras, linuxppc-dev
In-Reply-To: <20200901222523.1941988-3-ndesaulniers@google.com>



On 09/01/2020 10:25 PM, Nick Desaulniers wrote:
> Rather than invoke the compiler as the driver, use the linker. That way
> we can check --orphan-handling=warn support correctly, as cc-ldoption
> was removed in
> commit 055efab3120b ("kbuild: drop support for cc-ldoption").
> 
> Requires dropping the .got section.  I couldn't find how it was used in
> the vdso32.

ld crashes:

   LD      arch/powerpc/kernel/vdso32/vdso32.so.dbg
/bin/sh: line 1: 23780 Segmentation fault      (core dumped) 
ppc-linux-ld -EB -m elf32ppc -shared -soname linux-vdso32.so.1 
--eh-frame-hdr --orphan-handling=warn -T 
arch/powerpc/kernel/vdso32/vdso32.lds 
arch/powerpc/kernel/vdso32/sigtramp.o 
arch/powerpc/kernel/vdso32/gettimeofday.o 
arch/powerpc/kernel/vdso32/datapage.o 
arch/powerpc/kernel/vdso32/cacheflush.o 
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o 
arch/powerpc/kernel/vdso32/vdso32.so.dbg
make[4]: *** [arch/powerpc/kernel/vdso32/vdso32.so.dbg] Error 139


[root@localhost linux-powerpc]# ppc-linux-ld --version
GNU ld (GNU Binutils) 2.26.20160125


Christophe

> 
> Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan sections")
> Link: https://lore.kernel.org/lkml/CAKwvOdnn3wxYdJomvnveyD_njwRku3fABWT_bS92duihhywLJQ@mail.gmail.com/
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
> Not sure removing .got is a good idea or not.  Otherwise I observe the
> following link error:
> powerpc-linux-gnu-ld: warning: orphan section `.got' from `arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
> powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
> powerpc-linux-gnu-ld: final link failed: bad value
> 
> sigtramp.c doesn't mention anything from the GOT AFAICT, and doesn't
> look like it contains relocations that do, so I'm not sure where
> references to _GLOBAL_OFFSET_TABLE_ are coming from.
> 
>   arch/powerpc/kernel/vdso32/Makefile     | 7 +++++--
>   arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
>   2 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
> index 87ab1152d5ce..611a5951945a 100644
> --- a/arch/powerpc/kernel/vdso32/Makefile
> +++ b/arch/powerpc/kernel/vdso32/Makefile
> @@ -27,6 +27,9 @@ UBSAN_SANITIZE := n
>   ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
>   	-Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
>   asflags-y := -D__VDSO32__ -s
> +ldflags-y := -shared -soname linux-vdso32.so.1 \
> +	$(call ld-option, --eh-frame-hdr) \
> +	$(call ld-option, --orphan-handling=warn) -T
>   
>   obj-y += vdso32_wrapper.o
>   extra-y += vdso32.lds
> @@ -49,8 +52,8 @@ $(obj-vdso32): %.o: %.S FORCE
>   	$(call if_changed_dep,vdso32as)
>   
>   # actual build commands
> -quiet_cmd_vdso32ld = VDSO32L $@
> -      cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
> +quiet_cmd_vdso32ld = LD      $@
> +      cmd_vdso32ld = $(cmd_ld)
>   quiet_cmd_vdso32as = VDSO32A $@
>         cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
>   
> diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> index 4c985467a668..0ccdebad18b8 100644
> --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
> +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> @@ -61,7 +61,6 @@ SECTIONS
>   	.fixup		: { *(.fixup) }
>   
>   	.dynamic	: { *(.dynamic) }		:text	:dynamic
> -	.got		: { *(.got) }			:text
>   	.plt		: { *(.plt) }
>   
>   	_end = .;
> @@ -108,7 +107,9 @@ SECTIONS
>   	.debug_varnames  0 : { *(.debug_varnames) }
>   
>   	/DISCARD/	: {
> +		*(.got)
>   		*(.note.GNU-stack)
> +		*(.branch_lt)
>   		*(.data .data.* .gnu.linkonce.d.* .sdata*)
>   		*(.bss .sbss .dynbss .dynsbss)
>   		*(.glink .iplt .plt .rela*)
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox