LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v4 13/13] mm/debug_vm_pgtable: Avoid none pte in pte_clear_test
From: Anshuman Khandual @ 2020-09-04  4:16 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114601.183715-1-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:16 PM, Aneesh Kumar K.V wrote:
> pte_clear_tests operate on an existing pte entry. Make sure that
> is not a none pte entry.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 9afa1354326b..c36530c69e33 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -542,9 +542,10 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
>  #endif /* PAGETABLE_P4D_FOLDED */
>  
>  static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
> -				   unsigned long vaddr)
> +				   unsigned long pfn, unsigned long vaddr,
> +				   pgprot_t prot)
>  {
> -	pte_t pte = ptep_get(ptep);
> +	pte_t pte = pfn_pte(pfn, prot);
>  
>  	pr_debug("Validating PTE clear\n");
>  	pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
> @@ -1049,7 +1050,7 @@ static int __init debug_vm_pgtable(void)
>  
>  	ptl = pte_lockptr(mm, pmdp);
>  	spin_lock(ptl);
> -	pte_clear_tests(mm, ptep, vaddr);
> +	pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
>  	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>  	pte_unmap_unlock(ptep, ptl);
>  
> 

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

^ permalink raw reply

* Re: [PATCH v4 10/13] mm/debug_vm_pgtable/thp: Use page table depost/withdraw with THP
From: Anshuman Khandual @ 2020-09-04  4:21 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-11-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> Architectures like ppc64 use deposited page table while updating the
> huge pte entries.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 10 +++++++---
>  1 file changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 2bc1952e5f83..26023d990bd0 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -154,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
>  static void __init pmd_advanced_tests(struct mm_struct *mm,
>  				      struct vm_area_struct *vma, pmd_t *pmdp,
>  				      unsigned long pfn, unsigned long vaddr,
> -				      pgprot_t prot)
> +				      pgprot_t prot, pgtable_t pgtable)
>  {
>  	pmd_t pmd;
>  
> @@ -165,6 +165,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	/* Align the address wrt HPAGE_PMD_SIZE */
>  	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
>  
> +	pgtable_trans_huge_deposit(mm, pmdp, pgtable);
> +
>  	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_set_wrprotect(mm, vaddr, pmdp);
> @@ -193,6 +195,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	pmdp_test_and_clear_young(vma, vaddr, pmdp);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(pmd_young(pmd));
> +
> +	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
>  }
>  
>  static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
> @@ -371,7 +375,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
>  static void __init pmd_advanced_tests(struct mm_struct *mm,
>  				      struct vm_area_struct *vma, pmd_t *pmdp,
>  				      unsigned long pfn, unsigned long vaddr,
> -				      pgprot_t prot)
> +				      pgprot_t prot, pgtable_t pgtable)
>  {
>  }
>  static void __init pud_advanced_tests(struct mm_struct *mm,
> @@ -1048,7 +1052,7 @@ static int __init debug_vm_pgtable(void)
>  
>  	ptl = pmd_lock(mm, pmdp);
>  	pmd_clear_tests(mm, pmdp);
> -	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
> +	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
>  	pmd_huge_tests(pmdp, pmd_aligned, prot);
>  	pmd_populate_tests(mm, pmdp, saved_ptep);
>  	spin_unlock(ptl);
> 

Moving this down further in the series has not really helped the possible
git bisect problem on arm64. Nonetheless, the patch in itself, makes sense.

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

^ permalink raw reply

* Re: [PATCH 12/14] x86: remove address space overrides using set_fs()
From: Al Viro @ 2020-09-04  4:41 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel,
	Luis Chamberlain, linux-fsdevel, Linus Torvalds, Alexey Dobriyan
In-Reply-To: <20200904025510.GO1236603@ZenIV.linux.org.uk>

On Fri, Sep 04, 2020 at 03:55:10AM +0100, Al Viro wrote:
> On Thu, Sep 03, 2020 at 04:22:40PM +0200, Christoph Hellwig wrote:
> 
> > diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
> > index c8a85b512796e1..94f7be4971ed04 100644
> > --- a/arch/x86/lib/getuser.S
> > +++ b/arch/x86/lib/getuser.S
> > @@ -35,10 +35,19 @@
> >  #include <asm/smap.h>
> >  #include <asm/export.h>
> >  
> > +#ifdef CONFIG_X86_5LEVEL
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +	ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rdx", \
> > +		    "mov $((1 << 56) - 4096 - (n)),%rdx", X86_FEATURE_LA57
> > +#else
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +	mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
> > +#endif
> 
> Wait a sec... how is that supposed to build with X86_5LEVEL?  Do you mean
> 
> #define LOAD_TASK_SIZE_MINUS_N(n) \
> 	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
> 		    __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
> 
> there?

Pushed out with the following folded in.

diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 94f7be4971ed..2f052bc96866 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -37,8 +37,8 @@
 
 #ifdef CONFIG_X86_5LEVEL
 #define LOAD_TASK_SIZE_MINUS_N(n) \
-	ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rdx", \
-		    "mov $((1 << 56) - 4096 - (n)),%rdx", X86_FEATURE_LA57
+	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
+		    __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
 #else
 #define LOAD_TASK_SIZE_MINUS_N(n) \
 	mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 445374885153..358239d77dff 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -33,8 +33,8 @@
 
 #ifdef CONFIG_X86_5LEVEL
 #define LOAD_TASK_SIZE_MINUS_N(n) \
-	ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rbx", \
-		    "mov $((1 << 56) - 4096 - (n)),%rbx", X86_FEATURE_LA57
+	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \
+		    __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57
 #else
 #define LOAD_TASK_SIZE_MINUS_N(n) \
 	mov $(TASK_SIZE_MAX - (n)),%_ASM_BX

^ permalink raw reply related

* Re: [PATCH v4 06/13] mm/debug_vm_pgtable/THP: Mark the pte entry huge before using set_pmd/pud_at
From: Anshuman Khandual @ 2020-09-04  5:21 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-7-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> kernel expects entries to be marked huge before we use
> set_pmd_at()/set_pud_at().
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 20 +++++++++++---------
>  1 file changed, 11 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 8704901f6bd8..9cafed39c236 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -155,7 +155,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  				      unsigned long pfn, unsigned long vaddr,
>  				      pgprot_t prot)
>  {
> -	pmd_t pmd = pfn_pmd(pfn, prot);
> +	pmd_t pmd;
>  
>  	if (!has_transparent_hugepage())
>  		return;
> @@ -164,19 +164,19 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	/* Align the address wrt HPAGE_PMD_SIZE */
>  	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
>  
> -	pmd = pfn_pmd(pfn, prot);
> +	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_set_wrprotect(mm, vaddr, pmdp);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(pmd_write(pmd));
>  
> -	pmd = pfn_pmd(pfn, prot);
> +	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(!pmd_none(pmd));
>  
> -	pmd = pfn_pmd(pfn, prot);
> +	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  	pmd = pmd_wrprotect(pmd);
>  	pmd = pmd_mkclean(pmd);
>  	set_pmd_at(mm, vaddr, pmdp, pmd);
> @@ -236,7 +236,7 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
>  
>  static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
>  {
> -	pmd_t pmd = pfn_pmd(pfn, prot);
> +	pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  
>  	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
>  		return;
> @@ -276,7 +276,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
>  				      unsigned long pfn, unsigned long vaddr,
>  				      pgprot_t prot)
>  {
> -	pud_t pud = pfn_pud(pfn, prot);
> +	pud_t pud;
>  
>  	if (!has_transparent_hugepage())
>  		return;
> @@ -285,25 +285,27 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
>  	/* Align the address wrt HPAGE_PUD_SIZE */
>  	vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
>  
> +	pud = pud_mkhuge(pfn_pud(pfn, prot));
>  	set_pud_at(mm, vaddr, pudp, pud);
>  	pudp_set_wrprotect(mm, vaddr, pudp);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(pud_write(pud));
>  
>  #ifndef __PAGETABLE_PMD_FOLDED
> -	pud = pfn_pud(pfn, prot);
> +	pud = pud_mkhuge(pfn_pud(pfn, prot));
>  	set_pud_at(mm, vaddr, pudp, pud);
>  	pudp_huge_get_and_clear(mm, vaddr, pudp);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(!pud_none(pud));
>  
> -	pud = pfn_pud(pfn, prot);
> +	pud = pud_mkhuge(pfn_pud(pfn, prot));
>  	set_pud_at(mm, vaddr, pudp, pud);
>  	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(!pud_none(pud));
>  #endif /* __PAGETABLE_PMD_FOLDED */
> -	pud = pfn_pud(pfn, prot);
> +
> +	pud = pud_mkhuge(pfn_pud(pfn, prot));
>  	pud = pud_wrprotect(pud);
>  	pud = pud_mkclean(pud);
>  	set_pud_at(mm, vaddr, pudp, pud);
> 

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

^ permalink raw reply

* Re: [PATCH v4 07/13] mm/debug_vm_pgtable/set_pte/pmd/pud: Don't use set_*_at to update an existing pte entry
From: Anshuman Khandual @ 2020-09-04  5:34 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-8-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> set_pte_at() should not be used to set a pte entry at locations that
> already holds a valid pte entry. Architectures like ppc64 don't do TLB
> invalidate in set_pte_at() and hence expect it to be used to set locations
> that are not a valid PTE.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 35 +++++++++++++++--------------------
>  1 file changed, 15 insertions(+), 20 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 9cafed39c236..de333871f407 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -79,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
>  {
>  	pte_t pte = pfn_pte(pfn, prot);
>  
> +	/*
> +	 * Architectures optimize set_pte_at by avoiding TLB flush.
> +	 * This requires set_pte_at to be not used to update an
> +	 * existing pte entry. Clear pte before we do set_pte_at
> +	 */
> +
>  	pr_debug("Validating PTE advanced\n");
>  	pte = pfn_pte(pfn, prot);
>  	set_pte_at(mm, vaddr, ptep, pte);
>  	ptep_set_wrprotect(mm, vaddr, ptep);
>  	pte = ptep_get(ptep);
>  	WARN_ON(pte_write(pte));
> -
> -	pte = pfn_pte(pfn, prot);
> -	set_pte_at(mm, vaddr, ptep, pte);
>  	ptep_get_and_clear(mm, vaddr, ptep);
>  	pte = ptep_get(ptep);
>  	WARN_ON(!pte_none(pte));
> @@ -101,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
>  	ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
>  	pte = ptep_get(ptep);
>  	WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
> -
> -	pte = pfn_pte(pfn, prot);
> -	set_pte_at(mm, vaddr, ptep, pte);
>  	ptep_get_and_clear_full(mm, vaddr, ptep, 1);
>  	pte = ptep_get(ptep);
>  	WARN_ON(!pte_none(pte));
>  
> +	pte = pfn_pte(pfn, prot);
>  	pte = pte_mkyoung(pte);
>  	set_pte_at(mm, vaddr, ptep, pte);
>  	ptep_test_and_clear_young(vma, vaddr, ptep);
> @@ -169,9 +170,6 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	pmdp_set_wrprotect(mm, vaddr, pmdp);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(pmd_write(pmd));
> -
> -	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
> -	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(!pmd_none(pmd));
> @@ -185,13 +183,11 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
> -
> -	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
> -	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(!pmd_none(pmd));
>  
> +	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
>  	pmd = pmd_mkyoung(pmd);
>  	set_pmd_at(mm, vaddr, pmdp, pmd);
>  	pmdp_test_and_clear_young(vma, vaddr, pmdp);
> @@ -292,17 +288,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
>  	WARN_ON(pud_write(pud));
>  
>  #ifndef __PAGETABLE_PMD_FOLDED
> -	pud = pud_mkhuge(pfn_pud(pfn, prot));
> -	set_pud_at(mm, vaddr, pudp, pud);
>  	pudp_huge_get_and_clear(mm, vaddr, pudp);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(!pud_none(pud));
> -
> -	pud = pud_mkhuge(pfn_pud(pfn, prot));
> -	set_pud_at(mm, vaddr, pudp, pud);
> -	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
> -	pud = READ_ONCE(*pudp);
> -	WARN_ON(!pud_none(pud));
>  #endif /* __PAGETABLE_PMD_FOLDED */
>  
>  	pud = pud_mkhuge(pfn_pud(pfn, prot));
> @@ -315,6 +303,13 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
>  
> +#ifndef __PAGETABLE_PMD_FOLDED
> +	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
> +	pud = READ_ONCE(*pudp);
> +	WARN_ON(!pud_none(pud));
> +#endif /* __PAGETABLE_PMD_FOLDED */
> +
> +	pud = pud_mkhuge(pfn_pud(pfn, prot));
>  	pud = pud_mkyoung(pud);
>  	set_pud_at(mm, vaddr, pudp, pud);
>  	pudp_test_and_clear_young(vma, vaddr, pudp);
> 

Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>

^ permalink raw reply

* Re: [PATCH v4 09/13] mm/debug_vm_pgtable/locks: Take correct page table lock
From: Anshuman Khandual @ 2020-09-04  5:39 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-10-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> Make sure we call pte accessors with correct lock held.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 35 ++++++++++++++++++++++-------------
>  1 file changed, 22 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index f59cf6a9b05e..2bc1952e5f83 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -1035,30 +1035,39 @@ static int __init debug_vm_pgtable(void)
>  
>  	hugetlb_basic_tests(pte_aligned, prot);
>  
> -	pte_clear_tests(mm, ptep, vaddr);
> -	pmd_clear_tests(mm, pmdp);
> -	pud_clear_tests(mm, pudp);
> -	p4d_clear_tests(mm, p4dp);
> -	pgd_clear_tests(mm, pgdp);
> +	/*
> +	 * Page table modifying tests. They need to hold
> +	 * proper page table lock.
> +	 */
>  
>  	ptl = pte_lockptr(mm, pmdp);
>  	spin_lock(ptl);
> -
> +	pte_clear_tests(mm, ptep, vaddr);
>  	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> -	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
> -	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
> -	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> -
> +	pte_unmap_unlock(ptep, ptl);
>  
> +	ptl = pmd_lock(mm, pmdp);
> +	pmd_clear_tests(mm, pmdp);
> +	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
>  	pmd_huge_tests(pmdp, pmd_aligned, prot);
> +	pmd_populate_tests(mm, pmdp, saved_ptep);
> +	spin_unlock(ptl);
> +
> +	ptl = pud_lock(mm, pudp);
> +	pud_clear_tests(mm, pudp);
> +	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
>  	pud_huge_tests(pudp, pud_aligned, prot);
> +	pud_populate_tests(mm, pudp, saved_pmdp);
> +	spin_unlock(ptl);
>  
> -	pte_unmap_unlock(ptep, ptl);
> +	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
>  
> -	pmd_populate_tests(mm, pmdp, saved_ptep);
> -	pud_populate_tests(mm, pudp, saved_pmdp);
> +	spin_lock(&mm->page_table_lock);
> +	p4d_clear_tests(mm, p4dp);
> +	pgd_clear_tests(mm, pgdp);
>  	p4d_populate_tests(mm, p4dp, saved_pudp);
>  	pgd_populate_tests(mm, pgdp, saved_p4dp);
> +	spin_unlock(&mm->page_table_lock);
>  
>  	p4d_free(mm, saved_p4dp);
>  	pud_free(mm, saved_pudp);
> 

This patch, in itself looks good but will probably require folding with
the previous one to prevent the git bisect problem on arm64.

^ permalink raw reply

* Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
From: Ingo Molnar @ 2020-09-04  6:00 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel,
	Luis Chamberlain, Al Viro, linux-fsdevel, Linus Torvalds,
	Alexey Dobriyan
In-Reply-To: <20200903142242.925828-1-hch@lst.de>


* Christoph Hellwig <hch@lst.de> wrote:

> Hi all,
> 
> this series removes the last set_fs() used to force a kernel address
> space for the uaccess code in the kernel read/write/splice code, and then
> stops implementing the address space overrides entirely for x86 and
> powerpc.

Cool! For the x86 bits:

  Acked-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo

^ permalink raw reply

* Re: [PATCH v4 11/13] mm/debug_vm_pgtable/pmd_clear: Don't use pmd/pud_clear on pte entries
From: Anshuman Khandual @ 2020-09-04  6:03 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-12-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> pmd_clear() should not be used to clear pmd level pte entries.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index 26023d990bd0..b53903fdee85 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -196,6 +196,8 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(pmd_young(pmd));
>  
> +	/*  Clear the pte entries  */
> +	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
>  	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
>  }
>  
> @@ -319,6 +321,8 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
>  	pudp_test_and_clear_young(vma, vaddr, pudp);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(pud_young(pud));
> +
> +	pudp_huge_get_and_clear(mm, vaddr, pudp);
>  }
>  
>  static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
> @@ -442,8 +446,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
>  	 * This entry points to next level page table page.
>  	 * Hence this must not qualify as pud_bad().
>  	 */
> -	pmd_clear(pmdp);
> -	pud_clear(pudp);
>  	pud_populate(mm, pudp, pmdp);
>  	pud = READ_ONCE(*pudp);
>  	WARN_ON(pud_bad(pud));
> @@ -575,7 +577,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
>  	 * This entry points to next level page table page.
>  	 * Hence this must not qualify as pmd_bad().
>  	 */
> -	pmd_clear(pmdp);
>  	pmd_populate(mm, pmdp, pgtable);
>  	pmd = READ_ONCE(*pmdp);
>  	WARN_ON(pmd_bad(pmd));
> 

Why pxxp_huge_get_and_clear() cannot be called inside pxx_populate_tests()
functions itself ? Nonetheless, this does not seem to cause any problem.

^ permalink raw reply

* Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
From: Leonardo Bras @ 2020-09-04  6:04 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <8f569f68-5145-676e-50a1-b13f3fbd69cc@ozlabs.ru>

On Thu, 2020-09-03 at 14:41 +1000, Alexey Kardashevskiy wrote:
> I am new to this, so I am trying to understand how a memory page mapped
> > as DMA, and used for something else could be a problem.
> 
>  From the device prospective, there is PCI space and everything from 0 
> till 1<<64 is accessible and what is that mapped to - the device does 
> not know. PHB's IOMMU is the thing to notice invalid access and raise 
> EEH but PHB only knows about PCI->physical memory mapping (with IOMMU 
> pages) but nothing about the host kernel pages. Does this help? Thanks,

According to our conversation on Slack:
1- There is a problem if a hypervisor gives to it's VMs contiguous
memory blocks that are not aligned to IOMMU pages, because then an 
iommu_map_page() could map some memory in this VM and some memory in
other VM / process.
2- To guarantee this, we should have system pagesize >= iommu_pagesize 

One way to get (2) is by doing this in enable_ddw():
	if ((query.page_size & 4) && PAGE_SHIFT >= 24) {
		page_shift = 24; /* 16MB */
	} else if ((query.page_size & 2) &&  PAGE_SHIFT >= 16 ) {
		page_shift = 16; /* 64kB */
	} else if (query.page_size & 1 &&  PAGE_SHIFT >= 12) {
		page_shift = 12; /* 4kB */
	[...]

Another way of solving this, would be adding in LoPAR documentation
that the blocksize of contiguous memory the hypervisor gives a VM
should always be aligned to IOMMU pagesize offered.

I think the best approach would be first sending the above patch, which
is faster, and then get working into adding that to documentation, so
hypervisors guarantee this.

If this gets into the docs, we can revert the patch.

What do you think?

Best regards!


^ permalink raw reply

* Re: [PATCH v4 12/13] mm/debug_vm_pgtable/hugetlb: Disable hugetlb test on ppc64
From: Anshuman Khandual @ 2020-09-04  6:19 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm; +Cc: linuxppc-dev
In-Reply-To: <20200902114222.181353-13-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> The seems to be missing quite a lot of details w.r.t allocating
> the correct pgtable_t page (huge_pte_alloc()), holding the right
> lock (huge_pte_lock()) etc. The vma used is also not a hugetlb VMA.
> 
> ppc64 do have runtime checks within CONFIG_DEBUG_VM for most of these.
> Hence disable the test on ppc64.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  mm/debug_vm_pgtable.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
> index b53903fdee85..9afa1354326b 100644
> --- a/mm/debug_vm_pgtable.c
> +++ b/mm/debug_vm_pgtable.c
> @@ -811,6 +811,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
>  #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
>  }
>  
> +#ifndef CONFIG_PPC_BOOK3S_64
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
>  					  struct vm_area_struct *vma,
>  					  pte_t *ptep, unsigned long pfn,
> @@ -853,6 +854,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct *mm,
>  	pte = huge_ptep_get(ptep);
>  	WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
>  }
> +#endif
>  #else  /* !CONFIG_HUGETLB_PAGE */
>  static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
>  static void __init hugetlb_advanced_tests(struct mm_struct *mm,
> @@ -1065,7 +1067,9 @@ static int __init debug_vm_pgtable(void)
>  	pud_populate_tests(mm, pudp, saved_pmdp);
>  	spin_unlock(ptl);
>  
> +#ifndef CONFIG_PPC_BOOK3S_64
>  	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
> +#endif
>  
>  	spin_lock(&mm->page_table_lock);
>  	p4d_clear_tests(mm, p4dp);
> 

Is it still required now that DEBUG_VM_PGTABLE has been dropped from powerpc
or you would like to re-enabled it back ?

https://lore.kernel.org/linuxppc-dev/159913592797.5893.5829441560236719450.b4-ty@ellerman.id.au/T/#m6d890e2fe84cf180cb875fae5f791e9c83db8d30

^ permalink raw reply

* Re: [PATCH 12/14] x86: remove address space overrides using set_fs()
From: Christoph Hellwig @ 2020-09-04  6:38 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel,
	Alexey Dobriyan, Luis Chamberlain, linux-fsdevel, Linus Torvalds,
	Christoph Hellwig
In-Reply-To: <20200904025510.GO1236603@ZenIV.linux.org.uk>

On Fri, Sep 04, 2020 at 03:55:10AM +0100, Al Viro wrote:
> On Thu, Sep 03, 2020 at 04:22:40PM +0200, Christoph Hellwig wrote:
> 
> > diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
> > index c8a85b512796e1..94f7be4971ed04 100644
> > --- a/arch/x86/lib/getuser.S
> > +++ b/arch/x86/lib/getuser.S
> > @@ -35,10 +35,19 @@
> >  #include <asm/smap.h>
> >  #include <asm/export.h>
> >  
> > +#ifdef CONFIG_X86_5LEVEL
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +	ALTERNATIVE "mov $((1 << 47) - 4096 - (n)),%rdx", \
> > +		    "mov $((1 << 56) - 4096 - (n)),%rdx", X86_FEATURE_LA57
> > +#else
> > +#define LOAD_TASK_SIZE_MINUS_N(n) \
> > +	mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
> > +#endif
> 
> Wait a sec... how is that supposed to build with X86_5LEVEL?  Do you mean
> 
> #define LOAD_TASK_SIZE_MINUS_N(n) \
> 	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
> 		    __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
> 
> there?

Don't ask me about the how, but it builds and works with X86_5LEVEL,
and the style is copied from elsewhere..

^ permalink raw reply

* Re: [PATCH v4 00/13] mm/debug_vm_pgtable fixes
From: Anshuman Khandual @ 2020-09-04  6:48 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linux-mm, akpm
  Cc: linux-s390@vger.kernel.org, Vineet Gupta, linux-riscv,
	linux-snps-arc@lists.infradead.org, linuxppc-dev, Gerald Schaefer
In-Reply-To: <20200902114222.181353-1-aneesh.kumar@linux.ibm.com>



On 09/02/2020 05:12 PM, Aneesh Kumar K.V wrote:
> This patch series includes fixes for debug_vm_pgtable test code so that
> they follow page table updates rules correctly. The first two patches introduce
> changes w.r.t ppc64. The patches are included in this series for completeness. We can
> merge them via ppc64 tree if required.
> 
> Hugetlb test is disabled on ppc64 because that needs larger change to satisfy
> page table update rules.
> 
> These tests are broken w.r.t page table update rules and results in kernel
> crash as below. 
> 
> [   21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
> cpu 0x0: Vector: 700 (Program Check) at [c000000c6d1e76c0]
>     pc: c00000000009a5ec: assert_pte_locked+0x14c/0x380
>     lr: c0000000005eeeec: pte_update+0x11c/0x190
>     sp: c000000c6d1e7950
>    msr: 8000000002029033
>   current = 0xc000000c6d172c80
>   paca    = 0xc000000003ba0000   irqmask: 0x03   irq_happened: 0x01
>     pid   = 1, comm = swapper/0
> kernel BUG at arch/powerpc/mm/pgtable.c:304!
> [link register   ] c0000000005eeeec pte_update+0x11c/0x190
> [c000000c6d1e7950] 0000000000000001 (unreliable)
> [c000000c6d1e79b0] c0000000005eee14 pte_update+0x44/0x190
> [c000000c6d1e7a10] c000000001a2ca9c pte_advanced_tests+0x160/0x3d8
> [c000000c6d1e7ab0] c000000001a2d4fc debug_vm_pgtable+0x7e8/0x1338
> [c000000c6d1e7ba0] c0000000000116ec do_one_initcall+0xac/0x5f0
> [c000000c6d1e7c80] c0000000019e4fac kernel_init_freeable+0x4dc/0x5a4
> [c000000c6d1e7db0] c000000000012474 kernel_init+0x24/0x160
> [c000000c6d1e7e20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c
> 
> With DEBUG_VM disabled
> 
> [   20.530152] BUG: Kernel NULL pointer dereference on read at 0x00000000
> [   20.530183] Faulting instruction address: 0xc0000000000df330
> cpu 0x33: Vector: 380 (Data SLB Access) at [c000000c6d19f700]
>     pc: c0000000000df330: memset+0x68/0x104
>     lr: c00000000009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
>     sp: c000000c6d19f990
>    msr: 8000000002009033
>    dar: 0
>   current = 0xc000000c6d177480
>   paca    = 0xc00000001ec4f400   irqmask: 0x03   irq_happened: 0x01
>     pid   = 1, comm = swapper/0
> [link register   ] c00000000009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
> [c000000c6d19f990] c00000000009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
> [c000000c6d19fa10] c0000000019ebf30 pmd_advanced_tests+0x1f0/0x378
> [c000000c6d19fab0] c0000000019ed088 debug_vm_pgtable+0x79c/0x1244
> [c000000c6d19fba0] c0000000000116ec do_one_initcall+0xac/0x5f0
> [c000000c6d19fc80] c0000000019a4fac kernel_init_freeable+0x4dc/0x5a4
> [c000000c6d19fdb0] c000000000012474 kernel_init+0x24/0x160
> [c000000c6d19fe20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c
> 
> Changes from v3:
> * Address review feedback
> * Move page table depost and withdraw patch after adding pmdlock to avoid bisect failure.

This version

- Builds on x86, arm64, s390, arc, powerpc and riscv (defconfig with DEBUG_VM_PGTABLE)
- Runs on arm64 and x86 without any regression, atleast nothing that I have noticed
- Will be great if this could get tested on s390, arc, riscv, ppc32 platforms as well

+ linux-riscv <linux-riscv@lists.infradead.org>
+ linux-snps-arc@lists.infradead.org <linux-snps-arc@lists.infradead.org>
+ linux-s390@vger.kernel.org
+ Gerald Schaefer <gerald.schaefer@de.ibm.com>
+ Vineet Gupta <vgupta@synopsys.com>

There is still an open git bisect issue on arm64 platform which ideally should be fixed.

- Anshuman

^ permalink raw reply

* Re: [PATCH 12/14] x86: remove address space overrides using set_fs()
From: Christoph Hellwig @ 2020-09-04  7:47 UTC (permalink / raw)
  To: Al Viro
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel,
	Alexey Dobriyan, Luis Chamberlain, linux-fsdevel, Linus Torvalds,
	Christoph Hellwig
In-Reply-To: <20200904063813.GA12204@lst.de>

On Fri, Sep 04, 2020 at 08:38:13AM +0200, Christoph Hellwig wrote:
> > Wait a sec... how is that supposed to build with X86_5LEVEL?  Do you mean
> > 
> > #define LOAD_TASK_SIZE_MINUS_N(n) \
> > 	ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
> > 		    __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
> > 
> > there?
> 
> Don't ask me about the how, but it builds and works with X86_5LEVEL,
> and the style is copied from elsewhere..

Actually, it doesn't any more.  Looks like the change to pass the n
parameter as suggested by Linus broke the previously working version.

^ permalink raw reply

* RE: [PATCH 12/14] x86: remove address space overrides using set_fs()
From: David Laight @ 2020-09-04  7:59 UTC (permalink / raw)
  To: 'Linus Torvalds'
  Cc: linux-arch@vger.kernel.org, Kees Cook, x86@kernel.org,
	linux-kernel@vger.kernel.org, Christoph Hellwig, Luis Chamberlain,
	Al Viro, linux-fsdevel@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, Alexey Dobriyan
In-Reply-To: <CAHk-=whDtnudkbZ8-hR8HiDE7zog0dv+Gu9Sx5i6SPakrDtajQ@mail.gmail.com>

From: Linus Torvalds
> Sent: 04 September 2020 00:26
> 
> On Thu, Sep 3, 2020 at 2:30 PM David Laight <David.Laight@aculab.com> wrote:
> >
> > A non-canonical (is that the right term) address between the highest
> > valid user address and the lowest valid kernel address (7ffe to fffe?)
> > will fault anyway.
> 
> Yes.
> 
> But we actually warn against that fault, because it's been a good way
> to catch places that didn't use the proper "access_ok()" pattern.
> 
> See ex_handler_uaccess() and the
> 
>         WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in
> user access. Non-canonical address?");
> 
> warning. It's been good for randomized testing - a missing range check
> on a user address will often hit this.
> 
> Of course, you should never see it in real life (and hopefully not in
> testing either any more). But belt-and-suspenders..

That could still be effective, just pick an address limit that is
appropriate for the one access_ok() is using.

Even if access_ok() uses 1<<63 there are plenty of addresses above it that fault.
But the upper limit for 5-level page tables could be used all the time.

One option is to test '(address | length) < (3<<62)' in access_ok().
That is also moderately suitable for masking invalid addresses to 0.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply

* Re: ptrace_syscall_32 is failing
From: Thomas Gleixner @ 2020-09-04 10:13 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-s390, linuxppc-dev, Vasily Gorbik, Brian Gerst,
	Heiko Carstens, X86 ML, LKML, Christian Borntraeger,
	Paul Mackerras, Catalin Marinas, Andy Lutomirski, Will Deacon,
	linux-arm-kernel
In-Reply-To: <CALCETrUuyXpG0Vhrb-9m-G8J94+2bGqdrJkKfz+-5z7dsGLK8Q@mail.gmail.com>

Andy,

On Wed, Sep 02 2020 at 09:49, Andy Lutomirski wrote:
> On Wed, Sep 2, 2020 at 1:29 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>>
>> But you might tell me where exactly you want to inject the SIGTRAP in
>> the syscall exit code flow.
>
> It would be a bit complicated.  Definitely after any signals from the
> syscall are delivered.  Right now, I think that we don't deliver a
> SIGTRAP on the instruction boundary after SYSCALL while
> single-stepping.  (I think we used to, but only sometimes, and now we
> are at least consistent.)  This is because IRET will not trap if it
> starts with TF clear and ends up setting it.  (I asked Intel to
> document this, and I think they finally did, although I haven't gotten
> around to reading the new docs.  Certainly the old docs as of a year
> or two ago had no description whatsoever of how TF changes worked.)
>
> Deciding exactly *when* a trap should occur would be nontrivial -- we
> can't trap on sigreturn() from a SIGTRAP, for example.
>
> So this isn't fully worked out.

Oh well.

>> >> I don't think we want that in general. The current variant is perfectly
>> >> fine for everything except the 32bit fast syscall nonsense. Also
>> >> irqentry_entry/exit is not equivalent to the syscall_enter/exit
>> >> counterparts.
>> >
>> > If there are any architectures in which actual work is needed to
>> > figure out whether something is a syscall in the first place, they'll
>> > want to do the usual kernel entry work before the syscall entry work.
>>
>> That's low level entry code which does not require RCU, lockdep, tracing
>> or whatever muck we setup before actual work can be done.
>>
>> arch_asm_entry()
>>   ...
>>   arch_c_entry(cause) {
>>     switch(cause) {
>>       case EXCEPTION: arch_c_exception(...);
>>       case SYSCALL: arch_c_syscall(...);
>>       ...
>>     }
>
> You're assuming that figuring out the cause doesn't need the kernel
> entry code to run first.  In the case of the 32-bit vDSO fast
> syscalls, we arguably don't know whether an entry is a syscall until
> we have done a user memory access.  Logically, we're doing:
>
> if (get_user() < 0) {
>   /* Not a syscall.  This is actually a silly operation that sets AX =
> -EFAULT and returns.  Do not audit or invoke ptrace. */
> } else {
>   /* This actually is a syscall. */
> }

Yes, that's what I've addressed with providing split interfaces.

>> You really want to differentiate between exception and syscall
>> entry/exit.
>>
>
> Why do we want to distinguish between exception and syscall
> entry/exit?  For the enter part, AFAICS the exception case boils down
> to enter_from_user_mode() and the syscall case is:
>
>         enter_from_user_mode(regs);
>         instrumentation_begin();
>
>         local_irq_enable();
>         ti_work = READ_ONCE(current_thread_info()->flags);
>         if (ti_work & SYSCALL_ENTER_WORK)
>                 syscall = syscall_trace_enter(regs, syscall, ti_work);
>         instrumentation_end();
>
> Which would decompose quite nicely as a regular (non-syscall) entry
> plus the syscall part later.

There is a difference between syscall entry and exception entry at least
in my view:

syscall:
                enter_from_user_mode(regs);
                local_irq_enable();

exception:
                enter_from_user_mode(regs);

>> we'd have:
>>
>>   arch_c_entry()
>>      irqentry_enter();
>>      local_irq_enble();
>>      nr = syscall_enter_from_user_mode_work();
>>      ...
>>
>> which enforces two calls for sane entries and more code in arch/....
>
> This is why I still like my:
>
> arch_c_entry()
>   irqentry_enter_from_user_mode();
>   generic_syscall();
>   exit...

So what we have now (with my patch applied) is either:

1) arch_c_entry()
        nr = syscall_enter_from_user_mode();
        arch_handle_syscall(nr);
        syscall_exit_to_user_mode();

or for that extra 32bit fast syscall thing:

2) arch_c_entry()
        syscall_enter_from_user_mode_prepare();
        arch_do_stuff();
        nr = syscall_enter_from_user_mode_work();
        arch_handle_syscall(nr);
        syscall_exit_to_user_mode();

So for sane cases you just use #1.

Ideally we'd not need arch_handle_syscall(nr) at all, but that does not
work with multiple ABIs supported, i.e. the compat muck.

The only way we could make that work is to have:

    syscall_enter_exit(regs, mode)
      nr = syscall_enter_from_user_mode();
      arch_handle_syscall(mode, nr);
      syscall_exit_to_user_mode();

and then arch_c_entry() becomes:

    syscall_enter_exit(regs, mode);

which means that arch_handle_syscall() would have to evaluate the mode
and chose the appropriate syscall table. Not sure whether that's a win.

Thanks,

        tglx



^ permalink raw reply

* [PATCH] powerpc/uaccess: Add pre-update addressing to __put_user_asm_goto()
From: Christophe Leroy @ 2020-09-04 10:46 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

Enable pre-update addressing mode in __put_user_asm_goto()

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/uaccess.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 7c2427f237e1..a5cfe867fbdc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -254,7 +254,7 @@ do {								\
 		"1:	" op "%U1%X1 %0,%1	# put_user\n"	\
 		EX_TABLE(1b, %l2)				\
 		:						\
-		: "r" (x), "m" (*addr)				\
+		: "r" (x), "m<>" (*addr)				\
 		:						\
 		: label)
 
-- 
2.25.0


^ permalink raw reply related

* [PATCH 1/3] powerpc/uaccess: Switch __put_user_size_allowed() to __put_user_asm_goto()
From: Christophe Leroy @ 2020-09-04 11:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

__put_user_asm_goto() provides more flexibility to GCC and avoids using
a local variable to tell if the write succeeded or not.
GCC can then avoid implementing a cmp in the fast path.

See the difference for a small function like the PPC64 version of
save_general_regs() in arch/powerpc/kernel/signal_32.c:

Before the patch (unreachable nop removed):

0000000000000c10 <.save_general_regs>:
     c10:	39 20 00 2c 	li      r9,44
     c14:	39 40 00 00 	li      r10,0
     c18:	7d 29 03 a6 	mtctr   r9
     c1c:	38 c0 00 00 	li      r6,0
     c20:	48 00 00 14 	b       c34 <.save_general_regs+0x24>
     c30:	42 40 00 40 	bdz     c70 <.save_general_regs+0x60>
     c34:	28 2a 00 27 	cmpldi  r10,39
     c38:	7c c8 33 78 	mr      r8,r6
     c3c:	79 47 1f 24 	rldicr  r7,r10,3,60
     c40:	39 20 00 01 	li      r9,1
     c44:	41 82 00 0c 	beq     c50 <.save_general_regs+0x40>
     c48:	7d 23 38 2a 	ldx     r9,r3,r7
     c4c:	79 29 00 20 	clrldi  r9,r9,32
     c50:	91 24 00 00 	stw     r9,0(r4)
     c54:	2c 28 00 00 	cmpdi   r8,0
     c58:	39 4a 00 01 	addi    r10,r10,1
     c5c:	38 84 00 04 	addi    r4,r4,4
     c60:	41 82 ff d0 	beq     c30 <.save_general_regs+0x20>
     c64:	38 60 ff f2 	li      r3,-14
     c68:	4e 80 00 20 	blr
     c70:	38 60 00 00 	li      r3,0
     c74:	4e 80 00 20 	blr

0000000000000000 <.fixup>:
  cc:	39 00 ff f2 	li      r8,-14
  d0:	48 00 00 00 	b       d0 <.fixup+0xd0>
			d0: R_PPC64_REL24	.text+0xc54

After the patch:

0000000000001490 <.save_general_regs>:
    1490:	39 20 00 2c 	li      r9,44
    1494:	39 40 00 00 	li      r10,0
    1498:	7d 29 03 a6 	mtctr   r9
    149c:	60 00 00 00 	nop
    14a0:	28 2a 00 27 	cmpldi  r10,39
    14a4:	79 48 1f 24 	rldicr  r8,r10,3,60
    14a8:	39 20 00 01 	li      r9,1
    14ac:	41 82 00 0c 	beq     14b8 <.save_general_regs+0x28>
    14b0:	7d 23 40 2a 	ldx     r9,r3,r8
    14b4:	79 29 00 20 	clrldi  r9,r9,32
    14b8:	91 24 00 00 	stw     r9,0(r4)
    14bc:	39 4a 00 01 	addi    r10,r10,1
    14c0:	38 84 00 04 	addi    r4,r4,4
    14c4:	42 00 ff dc 	bdnz    14a0 <.save_general_regs+0x10>
    14c8:	38 60 00 00 	li      r3,0
    14cc:	4e 80 00 20 	blr
    14d0:	38 60 ff f2 	li      r3,-14
    14d4:	4e 80 00 20 	blr

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/uaccess.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index a5cfe867fbdc..96d1c144f92b 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -189,14 +189,14 @@ extern long __put_user_bad(void);
 
 #define __put_user_size_allowed(x, ptr, size, retval)		\
 do {								\
+	__label__ __pu_failed;					\
+								\
 	retval = 0;						\
-	switch (size) {						\
-	  case 1: __put_user_asm(x, ptr, retval, "stb"); break;	\
-	  case 2: __put_user_asm(x, ptr, retval, "sth"); break;	\
-	  case 4: __put_user_asm(x, ptr, retval, "stw"); break;	\
-	  case 8: __put_user_asm2(x, ptr, retval); break;	\
-	  default: __put_user_bad();				\
-	}							\
+	__put_user_size_goto(x, ptr, size, __pu_failed);	\
+	break;							\
+								\
+__pu_failed:							\
+	retval = -EFAULT;					\
 } while (0)
 
 #define __put_user_size(x, ptr, size, retval)			\
-- 
2.25.0


^ permalink raw reply related

* [PATCH 2/3] powerpc/uaccess: Switch __patch_instruction() to __put_user_asm_goto()
From: Christophe Leroy @ 2020-09-04 11:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <94ba5a5138f99522e1562dbcdb38d31aa790dc89.1599216721.git.christophe.leroy@csgroup.eu>

__patch_instruction() is the only user of __put_user_asm() outside
of asm/uaccess.h

Switch to the new __put_user_asm_goto() to enable retirement of
__put_user_asm() in a later patch.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/lib/code-patching.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 8c3934ea6220..2333625b5e31 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -21,21 +21,18 @@
 static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr,
 			       struct ppc_inst *patch_addr)
 {
-	int err = 0;
-
-	if (!ppc_inst_prefixed(instr)) {
-		__put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw");
-	} else {
-		__put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std");
-	}
-
-	if (err)
-		return err;
+	if (!ppc_inst_prefixed(instr))
+		__put_user_asm_goto(ppc_inst_val(instr), patch_addr, failed, "stw");
+	else
+		__put_user_asm_goto(ppc_inst_as_u64(instr), patch_addr, failed, "std");
 
 	asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr),
 							    "r" (exec_addr));
 
 	return 0;
+
+failed:
+	return -EFAULT;
 }
 
 int raw_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
-- 
2.25.0


^ permalink raw reply related

* [PATCH 3/3] powerpc/uaccess: Remove __put_user_asm() and __put_user_asm2()
From: Christophe Leroy @ 2020-09-04 11:01 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <94ba5a5138f99522e1562dbcdb38d31aa790dc89.1599216721.git.christophe.leroy@csgroup.eu>

__put_user_asm() and __put_user_asm2() are not used anymore.

Remove them.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/include/asm/uaccess.h | 41 ++++--------------------------
 1 file changed, 5 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 96d1c144f92b..26781b044932 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -151,42 +151,6 @@ static inline int __access_ok(unsigned long addr, unsigned long size,
 
 extern long __put_user_bad(void);
 
-/*
- * We don't tell gcc that we are accessing memory, but this is OK
- * because we do not write to any memory gcc knows about, so there
- * are no aliasing issues.
- */
-#define __put_user_asm(x, addr, err, op)			\
-	__asm__ __volatile__(					\
-		"1:	" op "%U2%X2 %1,%2	# put_user\n"	\
-		"2:\n"						\
-		".section .fixup,\"ax\"\n"			\
-		"3:	li %0,%3\n"				\
-		"	b 2b\n"					\
-		".previous\n"					\
-		EX_TABLE(1b, 3b)				\
-		: "=r" (err)					\
-		: "r" (x), "m<>" (*addr), "i" (-EFAULT), "0" (err))
-
-#ifdef __powerpc64__
-#define __put_user_asm2(x, ptr, retval)				\
-	  __put_user_asm(x, ptr, retval, "std")
-#else /* __powerpc64__ */
-#define __put_user_asm2(x, addr, err)				\
-	__asm__ __volatile__(					\
-		"1:	stw%X2 %1,%2\n"			\
-		"2:	stw%X2 %L1,%L2\n"			\
-		"3:\n"						\
-		".section .fixup,\"ax\"\n"			\
-		"4:	li %0,%3\n"				\
-		"	b 3b\n"					\
-		".previous\n"					\
-		EX_TABLE(1b, 4b)				\
-		EX_TABLE(2b, 4b)				\
-		: "=r" (err)					\
-		: "r" (x), "m" (*addr), "i" (-EFAULT), "0" (err))
-#endif /* __powerpc64__ */
-
 #define __put_user_size_allowed(x, ptr, size, retval)		\
 do {								\
 	__label__ __pu_failed;					\
@@ -249,6 +213,11 @@ do {								\
 })
 
 
+/*
+ * We don't tell gcc that we are accessing memory, but this is OK
+ * because we do not write to any memory gcc knows about, so there
+ * are no aliasing issues.
+ */
 #define __put_user_asm_goto(x, addr, label, op)			\
 	asm volatile goto(					\
 		"1:	" op "%U1%X1 %0,%1	# put_user\n"	\
-- 
2.25.0


^ permalink raw reply related

* [PATCH] kbuild: preprocess module linker script
From: Masahiro Yamada @ 2020-09-04 13:31 UTC (permalink / raw)
  To: linux-kbuild
  Cc: linux-ia64, Peter Zijlstra, Catalin Marinas, Paul Mackerras,
	linux-riscv, Will Deacon, Ard Biesheuvel, Anton Ivanov,
	linux-arch, Mauro Carvalho Chehab, Richard Weinberger,
	Masahiro Yamada, Russell King, Ingo Molnar, Geert Uytterhoeven,
	Fenghua Yu, Albert Ou, Kees Cook, Arnd Bergmann, Jeff Dike,
	Jessica Yu, linux-um, linux-m68k, Tony Luck, Paul Walmsley,
	linux-arm-kernel, Michal Marek, linux-kernel, Palmer Dabbelt,
	linuxppc-dev

There was a request to preprocess the module linker script like we do
for the vmlinux one (https://lkml.org/lkml/2020/8/21/512).

The difference between vmlinux.lds and module.lds is that the latter
is needed for external module builds, thus must be cleaned up by
'make mrproper' instead of 'make clean' (also, it must be created by
'make modules_prepare').

You cannot put it in arch/*/kernel/ because 'make clean' descends into
it. I moved arch/*/kernel/module.lds to arch/*/include/asm/module.lds.h,
which is included from scripts/module.lds.S.

scripts/module.lds is fine because 'make clean' keeps all the build
artifacts under scripts/.

You can add arch-specific sections in <asm/module.lds.h>.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Tested-by: Jessica Yu <jeyu@kernel.org>
---

 Makefile                                               |  1 -
 arch/arm/Makefile                                      |  4 ----
 .../{kernel/module.lds => include/asm/module.lds.h}    |  2 ++
 arch/arm64/Makefile                                    |  4 ----
 .../{kernel/module.lds => include/asm/module.lds.h}    |  2 ++
 arch/ia64/Makefile                                     |  1 -
 arch/ia64/{module.lds => include/asm/module.lds.h}     |  0
 arch/m68k/Makefile                                     |  1 -
 .../{kernel/module.lds => include/asm/module.lds.h}    |  0
 arch/powerpc/Makefile                                  |  1 -
 .../{kernel/module.lds => include/asm/module.lds.h}    |  0
 arch/riscv/Makefile                                    |  3 ---
 .../{kernel/module.lds => include/asm/module.lds.h}    |  3 ++-
 arch/um/include/asm/Kbuild                             |  1 +
 include/asm-generic/Kbuild                             |  1 +
 include/asm-generic/module.lds.h                       | 10 ++++++++++
 scripts/.gitignore                                     |  1 +
 scripts/Makefile                                       |  2 ++
 scripts/Makefile.modfinal                              |  5 ++---
 scripts/{module-common.lds => module.lds.S}            |  3 +++
 scripts/package/builddeb                               |  2 +-
 21 files changed, 27 insertions(+), 20 deletions(-)
 rename arch/arm/{kernel/module.lds => include/asm/module.lds.h} (72%)
 rename arch/arm64/{kernel/module.lds => include/asm/module.lds.h} (76%)
 rename arch/ia64/{module.lds => include/asm/module.lds.h} (100%)
 rename arch/m68k/{kernel/module.lds => include/asm/module.lds.h} (100%)
 rename arch/powerpc/{kernel/module.lds => include/asm/module.lds.h} (100%)
 rename arch/riscv/{kernel/module.lds => include/asm/module.lds.h} (84%)
 create mode 100644 include/asm-generic/module.lds.h
 rename scripts/{module-common.lds => module.lds.S} (93%)

diff --git a/Makefile b/Makefile
index 9cac6fde3479..3d9b56c6b47e 100644
--- a/Makefile
+++ b/Makefile
@@ -506,7 +506,6 @@ KBUILD_CFLAGS_KERNEL :=
 KBUILD_AFLAGS_MODULE  := -DMODULE
 KBUILD_CFLAGS_MODULE  := -DMODULE
 KBUILD_LDFLAGS_MODULE :=
-export KBUILD_LDS_MODULE := $(srctree)/scripts/module-common.lds
 KBUILD_LDFLAGS :=
 CLANG_FLAGS :=
 
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 4e877354515f..a0cb15de9677 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -16,10 +16,6 @@ LDFLAGS_vmlinux	+= --be8
 KBUILD_LDFLAGS_MODULE	+= --be8
 endif
 
-ifeq ($(CONFIG_ARM_MODULE_PLTS),y)
-KBUILD_LDS_MODULE	+= $(srctree)/arch/arm/kernel/module.lds
-endif
-
 GZFLAGS		:=-9
 #KBUILD_CFLAGS	+=-pipe
 
diff --git a/arch/arm/kernel/module.lds b/arch/arm/include/asm/module.lds.h
similarity index 72%
rename from arch/arm/kernel/module.lds
rename to arch/arm/include/asm/module.lds.h
index 79cb6af565e5..0e7cb4e314b4 100644
--- a/arch/arm/kernel/module.lds
+++ b/arch/arm/include/asm/module.lds.h
@@ -1,5 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifdef CONFIG_ARM_MODULE_PLTS
 SECTIONS {
 	.plt : { BYTE(0) }
 	.init.plt : { BYTE(0) }
 }
+#endif
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 55bc8546d9c7..232547ec07d8 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -115,10 +115,6 @@ endif
 
 CHECKFLAGS	+= -D__aarch64__
 
-ifeq ($(CONFIG_ARM64_MODULE_PLTS),y)
-KBUILD_LDS_MODULE	+= $(srctree)/arch/arm64/kernel/module.lds
-endif
-
 ifeq ($(CONFIG_DYNAMIC_FTRACE_WITH_REGS),y)
   KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
   CC_FLAGS_FTRACE := -fpatchable-function-entry=2
diff --git a/arch/arm64/kernel/module.lds b/arch/arm64/include/asm/module.lds.h
similarity index 76%
rename from arch/arm64/kernel/module.lds
rename to arch/arm64/include/asm/module.lds.h
index 22e36a21c113..691f15af788e 100644
--- a/arch/arm64/kernel/module.lds
+++ b/arch/arm64/include/asm/module.lds.h
@@ -1,5 +1,7 @@
+#ifdef CONFIG_ARM64_MODULE_PLTS
 SECTIONS {
 	.plt (NOLOAD) : { BYTE(0) }
 	.init.plt (NOLOAD) : { BYTE(0) }
 	.text.ftrace_trampoline (NOLOAD) : { BYTE(0) }
 }
+#endif
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index 2876a7df1b0a..703b1c4f6d12 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -20,7 +20,6 @@ CHECKFLAGS	+= -D__ia64=1 -D__ia64__=1 -D_LP64 -D__LP64__
 
 OBJCOPYFLAGS	:= --strip-all
 LDFLAGS_vmlinux	:= -static
-KBUILD_LDS_MODULE += $(srctree)/arch/ia64/module.lds
 KBUILD_AFLAGS_KERNEL := -mconstant-gp
 EXTRA		:=
 
diff --git a/arch/ia64/module.lds b/arch/ia64/include/asm/module.lds.h
similarity index 100%
rename from arch/ia64/module.lds
rename to arch/ia64/include/asm/module.lds.h
diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 4438ffb4bbe1..ea14f2046fb4 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -75,7 +75,6 @@ KBUILD_CPPFLAGS += -D__uClinux__
 endif
 
 KBUILD_LDFLAGS := -m m68kelf
-KBUILD_LDS_MODULE += $(srctree)/arch/m68k/kernel/module.lds
 
 ifdef CONFIG_SUN3
 LDFLAGS_vmlinux = -N
diff --git a/arch/m68k/kernel/module.lds b/arch/m68k/include/asm/module.lds.h
similarity index 100%
rename from arch/m68k/kernel/module.lds
rename to arch/m68k/include/asm/module.lds.h
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 3e8da9cf2eb9..8935658fcd06 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -65,7 +65,6 @@ UTS_MACHINE := $(subst $(space),,$(machine-y))
 ifdef CONFIG_PPC32
 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
 else
-KBUILD_LDS_MODULE += $(srctree)/arch/powerpc/kernel/module.lds
 ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
 # Have the linker provide sfpr if possible.
 # There is a corresponding test in arch/powerpc/lib/Makefile
diff --git a/arch/powerpc/kernel/module.lds b/arch/powerpc/include/asm/module.lds.h
similarity index 100%
rename from arch/powerpc/kernel/module.lds
rename to arch/powerpc/include/asm/module.lds.h
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index fb6e37db836d..8edaa8bd86d6 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -53,9 +53,6 @@ endif
 ifeq ($(CONFIG_CMODEL_MEDANY),y)
 	KBUILD_CFLAGS += -mcmodel=medany
 endif
-ifeq ($(CONFIG_MODULE_SECTIONS),y)
-	KBUILD_LDS_MODULE += $(srctree)/arch/riscv/kernel/module.lds
-endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
         KBUILD_CFLAGS += -fno-omit-frame-pointer
 endif
diff --git a/arch/riscv/kernel/module.lds b/arch/riscv/include/asm/module.lds.h
similarity index 84%
rename from arch/riscv/kernel/module.lds
rename to arch/riscv/include/asm/module.lds.h
index 295ecfb341a2..4254ff2ff049 100644
--- a/arch/riscv/kernel/module.lds
+++ b/arch/riscv/include/asm/module.lds.h
@@ -1,8 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /* Copyright (C) 2017 Andes Technology Corporation */
-
+#ifdef CONFIG_MODULE_SECTIONS
 SECTIONS {
 	.plt (NOLOAD) : { BYTE(0) }
 	.got (NOLOAD) : { BYTE(0) }
 	.got.plt (NOLOAD) : { BYTE(0) }
 }
+#endif
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index 8d435f8a6dec..1c63b260ecc4 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -16,6 +16,7 @@ generic-y += kdebug.h
 generic-y += mcs_spinlock.h
 generic-y += mm-arch-hooks.h
 generic-y += mmiowb.h
+generic-y += module.lds.h
 generic-y += param.h
 generic-y += pci.h
 generic-y += percpu.h
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 74b0612601dd..7cd4e627e00e 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -40,6 +40,7 @@ mandatory-y += mmiowb.h
 mandatory-y += mmu.h
 mandatory-y += mmu_context.h
 mandatory-y += module.h
+mandatory-y += module.lds.h
 mandatory-y += msi.h
 mandatory-y += pci.h
 mandatory-y += percpu.h
diff --git a/include/asm-generic/module.lds.h b/include/asm-generic/module.lds.h
new file mode 100644
index 000000000000..f210d5c1b78b
--- /dev/null
+++ b/include/asm-generic/module.lds.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_MODULE_LDS_H
+#define __ASM_GENERIC_MODULE_LDS_H
+
+/*
+ * <asm/module.lds.h> can specify arch-specific sections for linking modules.
+ * Empty for the asm-generic header.
+ */
+
+#endif /* __ASM_GENERIC_MODULE_LDS_H */
diff --git a/scripts/.gitignore b/scripts/.gitignore
index 0d1c8e217cd7..a6c11316c969 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -8,3 +8,4 @@ asn1_compiler
 extract-cert
 sign-file
 insert-sys-cert
+/module.lds
diff --git a/scripts/Makefile b/scripts/Makefile
index bc018e4b733e..a5058bfdd0f6 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -29,6 +29,8 @@ endif
 # The following programs are only built on demand
 hostprogs += unifdef
 
+always-$(CONFIG_MODULES)	+= module.lds
+
 subdir-$(CONFIG_GCC_PLUGINS) += gcc-plugins
 subdir-$(CONFIG_MODVERSIONS) += genksyms
 subdir-$(CONFIG_SECURITY_SELINUX) += selinux
diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal
index 411c1e600e7d..ae01baf96f4e 100644
--- a/scripts/Makefile.modfinal
+++ b/scripts/Makefile.modfinal
@@ -33,11 +33,10 @@ quiet_cmd_ld_ko_o = LD [M]  $@
       cmd_ld_ko_o =                                                     \
 	$(LD) -r $(KBUILD_LDFLAGS)					\
 		$(KBUILD_LDFLAGS_MODULE) $(LDFLAGS_MODULE)		\
-		$(addprefix -T , $(KBUILD_LDS_MODULE))			\
-		-o $@ $(filter %.o, $^);				\
+		-T scripts/module.lds -o $@ $(filter %.o, $^);		\
 	$(if $(ARCH_POSTLINK), $(MAKE) -f $(ARCH_POSTLINK) $@, true)
 
-$(modules): %.ko: %.o %.mod.o $(KBUILD_LDS_MODULE) FORCE
+$(modules): %.ko: %.o %.mod.o scripts/module.lds FORCE
 	+$(call if_changed,ld_ko_o)
 
 targets += $(modules) $(modules:.ko=.mod.o)
diff --git a/scripts/module-common.lds b/scripts/module.lds.S
similarity index 93%
rename from scripts/module-common.lds
rename to scripts/module.lds.S
index d61b9e8678e8..69b9b71a6a47 100644
--- a/scripts/module-common.lds
+++ b/scripts/module.lds.S
@@ -24,3 +24,6 @@ SECTIONS {
 
 	__jump_table		0 : ALIGN(8) { KEEP(*(__jump_table)) }
 }
+
+/* bring in arch-specific sections */
+#include <asm/module.lds.h>
diff --git a/scripts/package/builddeb b/scripts/package/builddeb
index 6df3c9f8b2da..44f212e37935 100755
--- a/scripts/package/builddeb
+++ b/scripts/package/builddeb
@@ -55,7 +55,7 @@ deploy_kernel_headers () {
 		cd $srctree
 		find . arch/$SRCARCH -maxdepth 1 -name Makefile\*
 		find include scripts -type f -o -type l
-		find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform
+		find arch/$SRCARCH -name Kbuild.platforms -o -name Platform
 		find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f
 	) > debian/hdrsrcfiles
 
-- 
2.25.1


^ permalink raw reply related

* Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
From: Alexey Dobriyan @ 2020-09-04 17:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-arch, linuxppc-dev, Kees Cook, x86, linux-kernel,
	Luis Chamberlain, Al Viro, linux-fsdevel, Linus Torvalds,
	Christoph Hellwig
In-Reply-To: <20200904060024.GA2779810@gmail.com>

On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote:
> * Christoph Hellwig <hch@lst.de> wrote:
> > this series removes the last set_fs() used to force a kernel address
> > space for the uaccess code in the kernel read/write/splice code, and then
> > stops implementing the address space overrides entirely for x86 and
> > powerpc.
> 
> Cool! For the x86 bits:
> 
>   Acked-by: Ingo Molnar <mingo@kernel.org>

set_fs() is older than some kernel hackers!

	$ cd linux-0.11/
	$ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3
	./include/asm/segment.h:61:extern inline void set_fs(unsigned long val)
	./include/asm/segment.h-62-{
	./include/asm/segment.h-63-     __asm__("mov %0,%%fs"::"a" ((unsigned short) val));
	./include/asm/segment.h-64-}

^ permalink raw reply

* Re: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
From: Linus Torvalds @ 2020-09-04 18:42 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: linux-arch, Kees Cook, the arch/x86 maintainers,
	Linux Kernel Mailing List, Christoph Hellwig, Luis Chamberlain,
	Al Viro, linux-fsdevel, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200904175823.GA500051@localhost.localdomain>

On Fri, Sep 4, 2020 at 10:58 AM Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
> set_fs() is older than some kernel hackers!
>
>         $ cd linux-0.11/
>         $ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3

Oh, it's older than that. It was there (as set_fs) in 0.10, and may
even predate that. But sadly, I don't have tar-balls for 0.02 and
0.03, so can't check.

The actual use of %fs as the user space segment is already there in
0.01, but there was no 'set_fs()'. That was a simpler and more direct
time, and "get_fs()" looked like this back then:

  #define _fs() ({ \
  register unsigned short __res; \
  __asm__("mov %%fs,%%ax":"=a" (__res):); \
  __res;})

and all the setting was basically part of the kernel entry asm and. Lovely.

                 Linus

^ permalink raw reply

* Re: [PATCH 0/1] powerpc/numa: do not skip node 0 in lookup table
From: Daniel Henrique Barboza @ 2020-09-04 20:06 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <20200814203413.542050-1-danielhb413@gmail.com>

I discussed this a bit with Aneesh Kumar in IBM internal Slack, a few weeks
ago, and he informed me that that this patch does not make sense with the
design used by the kernel. The kernel will assume that, for node 0, all
associativity domains must also be zeroed. This is why node 0 is skipped
when creating the distance table.

This of course has consequences for QEMU, so based on that, I've adapted
the QEMU implementation to not touch node 0.



Daniel

On 8/14/20 5:34 PM, Daniel Henrique Barboza wrote:
> Hi,
> 
> This is a simple fix that I made while testing NUMA changes
> I'm making in QEMU [1]. Setting any non-zero value to the
> associativity of NUMA node 0 has no impact in the output
> of 'numactl' because the distance_lookup_table is never
> initialized for node 0.
> 
> Seeing through the LOPAPR spec and git history I found no
> technical reason to skip node 0, which makes me believe this is
> a bug that got under the radar up until now because no one
> attempted to set node 0 associativity like I'm doing now.
> 
> For anyone wishing to give it a spin, using the QEMU build
> in [1] and experimenting with NUMA distances, such as:
> 
> sudo ./qemu-system-ppc64 -machine pseries-5.2,accel=kvm,usb=off,dump-guest-core=off -m 65536 -overcommit mem-lock=off -smp 4,sockets=4,cores=1,threads=1 -rtc base=utc -display none -vga none -nographic -boot menu=on -device spapr-pci-host-bridge,index=1,id=pci.1 -device spapr-pci-host-bridge,index=2,id=pci.2 -device spapr-pci-host-bridge,index=3,id=pci.3 -device spapr-pci-host-bridge,index=4,id=pci.4 -device qemu-xhci,id=usb,bus=pci.0,addr=0x2 -drive file=/home/danielhb/f32.qcow2,format=qcow2,if=none,id=drive-virtio-disk0 -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x3,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1 -device usb-kbd,id=input0,bus=usb.0,port=1 -device usb-mouse,id=input1,bus=usb.0,port=2 -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on \
> -numa node,nodeid=0,cpus=0 -numa node,nodeid=1,cpus=1 \
> -numa node,nodeid=2,cpus=2 -numa node,nodeid=3,cpus=3 \
> -numa dist,src=0,dst=1,val=80 -numa dist,src=0,dst=2,val=80 \
> -numa dist,src=0,dst=3,val=80 -numa dist,src=1,dst=2,val=80 \
> -numa dist,src=1,dst=3,val=80 -numa dist,src=2,dst=3,val=80
> 
> The current kernel code will ignore the associativity of
> node 0, and numactl will output this:
> 
> node distances:
> node   0   1   2   3
>    0:  10  160  160  160
>    1:  160  10  80  80
>    2:  160  80  10  80
>    3:  160  80  80  10
> 
> With this patch:
> 
> node distances:
> node   0   1   2   3
>    0:  10  160  160  160
>    1:  160  10  80  40
>    2:  160  80  10  20
>    3:  160  40  20  10
> 
> 
> If anyone wonders, this patch has no conflict with the proposed
> NUMA changes in [2] because Aneesh isn't changing this line.
> 
> 
> [1] https://github.com/danielhb/qemu/tree/spapr_numa_v1
> [2] https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200731111916.243569-1-aneesh.kumar@linux.ibm.com/
> 
> 
> Daniel Henrique Barboza (1):
>    powerpc/numa: do not skip node 0 when init lookup table
> 
>   arch/powerpc/mm/numa.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 

^ permalink raw reply

* RE: remove the last set_fs() in common code, and remove it for x86 and powerpc v3
From: David Laight @ 2020-09-04 21:01 UTC (permalink / raw)
  To: 'Alexey Dobriyan', Ingo Molnar
  Cc: linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	Kees Cook, x86@kernel.org, linux-kernel@vger.kernel.org,
	Luis Chamberlain, Al Viro, linux-fsdevel@vger.kernel.org,
	Linus Torvalds, Christoph Hellwig
In-Reply-To: <20200904175823.GA500051@localhost.localdomain>

From: Alexey Dobriyan
> Sent: 04 September 2020 18:58
> 
> On Fri, Sep 04, 2020 at 08:00:24AM +0200, Ingo Molnar wrote:
> > * Christoph Hellwig <hch@lst.de> wrote:
> > > this series removes the last set_fs() used to force a kernel address
> > > space for the uaccess code in the kernel read/write/splice code, and then
> > > stops implementing the address space overrides entirely for x86 and
> > > powerpc.
> >
> > Cool! For the x86 bits:
> >
> >   Acked-by: Ingo Molnar <mingo@kernel.org>
> 
> set_fs() is older than some kernel hackers!
> 
> 	$ cd linux-0.11/
> 	$ find . -type f -name '*.h' | xargs grep -e set_fs -w -n -A3
> 	./include/asm/segment.h:61:extern inline void set_fs(unsigned long val)
> 	./include/asm/segment.h-62-{
> 	./include/asm/segment.h-63-     __asm__("mov %0,%%fs"::"a" ((unsigned short) val));
> 	./include/asm/segment.h-64-}

What is this strange %fs register you are talking about.
Figure 2-4 only has CS, DS, SS and ES.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply

* [Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
From: bugzilla-daemon @ 2020-09-04 22:19 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208181-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208181

--- Comment #17 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 292337
  --> https://bugzilla.kernel.org/attachment.cgi?id=292337&action=edit
dmesg (5.9-rc3, INLINE KASAN, PowerMac G4 DP)

Re-tried with 5.9-rc3 (inline KASAN). The original problem (stack-out-of-bounds
in strcmp+0x58/0xd8) is gone, but still problems with stack usage when doing
larger build jobs:

[...]
[ 1929.683510] do_IRQ: stack overflow: 1696
[ 1929.690727] CPU: 1 PID: 735 Comm: mount.nfs Tainted: G        W        
5.9.0-rc3-PowerMacG4 #1
[ 1929.697847] Call Trace:
[ 1929.704633] [d0ca4670] [c0a75518] dump_stack+0xfc/0x130 (unreliable)
[ 1929.711507] [d0ca46a0] [c000b094] do_IRQ+0x128/0x180
[ 1929.717998] [d0ca46d0] [c002e560] ret_from_except+0x0/0x14
[ 1929.724652] --- interrupt: 501 at _raw_spin_unlock_irqrestore+0x3c/0xa4
                   LR = _raw_spin_unlock_irqrestore+0x38/0xa4
[ 1929.738722] [d0ca47b8] [c0a6dc90] stack_depot_save+0x20c/0x390
[ 1929.746132] [d0ca4818] [c04d4b70] kasan_save_stack+0x40/0x48
[ 1929.753675] [d0ca4928] [c04d4b9c] kasan_set_track+0x24/0x30
[ 1929.761298] [d0ca4938] [c04d710c] kasan_set_free_info+0x28/0x3c
[ 1929.769073] [d0ca4948] [c04d4f74] __kasan_slab_free+0x104/0x118
[ 1929.776983] [d0ca4968] [c04ce800] slab_free_freelist_hook+0xec/0x17c
[ 1929.785111] [d0ca49a8] [c04d3468] kmem_cache_free+0x58/0x2a0
[ 1929.793391] [d0ca49f8] [c11b251c] packet_rcv+0xb9c/0xbb4
[ 1929.801797] [d0ca4a48] [c0dbfd98] dev_queue_xmit_nit+0x6e4/0x748
[ 1929.810434] [d0ca4ab8] [c0dcaf80] dev_hard_start_xmit+0xec/0x880
[ 1929.819207] [d0ca4b18] [c0ea4814] sch_direct_xmit+0x1f8/0x818
[ 1929.828111] [d0ca4bf8] [c0dcc884] __dev_queue_xmit+0xed4/0x136c
[ 1929.837202] [d0ca4d28] [c0f256dc] ip_finish_output2+0xfcc/0x1028
[ 1929.846472] [d0ca4d88] [c0f2d848] __ip_queue_xmit+0xde0/0x1018
[ 1929.855892] [d0ca4df8] [c0f929d8] __tcp_transmit_skb+0x2550/0x2cb8
[ 1929.865486] [d0ca4ee8] [c0f98470] tcp_write_xmit+0x1d28/0x3498
[ 1929.875216] [d0ca4f78] [c0f99c8c] __tcp_push_pending_frames+0xac/0x1c4
[ 1929.885189] [d0ca4f98] [c0f5a970] tcp_sendmsg_locked+0x1c50/0x2294
[ 1929.895338] [d0ca5098] [c0f5afe4] tcp_sendmsg+0x30/0x48
[ 1929.905564] [d0ca50b8] [c0d598b0] sock_sendmsg_nosec+0xf4/0x10c
[ 1929.916463] [d0ca50d8] [b0a31840] xprt_sock_sendmsg+0x2c0/0x6e8 [sunrpc]
[ 1929.927494] [d0ca51b8] [b0a34ce8] xs_tcp_send_request+0x360/0x580 [sunrpc]
[ 1929.938699] [d0ca52e8] [b0a2eae8] xprt_transmit+0x4f8/0xe30 [sunrpc]
[ 1929.950044] [d0ca5368] [b0a1dcd8] call_transmit+0x238/0x25c [sunrpc]
[ 1929.961450] [d0ca5388] [b0a6641c] __rpc_execute+0x35c/0xbf8 [sunrpc]
[ 1929.972996] [d0ca5448] [b0a21d18] rpc_run_task+0x790/0x79c [sunrpc]
[ 1929.984850] [d0ca5498] [b1282e50] nfs4_call_sync_custom+0x14/0x80 [nfsv4]
[ 1929.996821] [d0ca54b8] [b128302c] nfs4_do_call_sync+0x170/0x1a8 [nfsv4]
[ 1930.008922] [d0ca55a8] [b12b3570] nfs4_proc_lookup_common+0x314/0xc54
[nfsv4]
[ 1930.020820] [d0ca5758] [b12b4244] nfs4_proc_lookup+0x158/0x2f0 [nfsv4]
[ 1930.032753] [d0ca57f8] [b0b49544] nfs_lookup+0x2ac/0x9ac [nfs]
[ 1930.044062] [d0ca5838] [c052c984] __lookup_slow+0x278/0x2a8
[ 1930.055461] [d0ca5958] [c05340a0] walk_component+0x288/0x30c
[ 1930.066816] [d0ca5a08] [c0534e5c] path_lookupat.isra.0+0x1b8/0x438
[ 1930.078282] [d0ca5a48] [c05372a0] filename_lookup+0x144/0x1c4
[ 1930.089834] [d0ca5b98] [c05373fc] vfs_path_lookup+0x94/0xc0
[ 1930.101389] [d0ca5c18] [c05714b8] mount_subtree+0x1c4/0x250
[ 1930.113267] [d0ca5ca8] [b12e1b2c] do_nfs4_mount+0x570/0x7fc [nfsv4]
[ 1930.125298] [d0ca5d68] [b12e202c] nfs4_try_get_tree+0xfc/0x16c [nfsv4]
[ 1930.137200] [d0ca5d88] [c050e434] vfs_get_tree+0xf8/0x398
[ 1930.149133] [d0ca5db8] [c056f968] path_mount+0x1074/0x113c
[ 1930.161107] [d0ca5e78] [c056fad8] do_mount+0xa8/0xe4
[ 1930.173109] [d0ca5f08] [c0570054] sys_mount+0xa8/0xb8
[ 1930.185160] [d0ca5f38] [c002e1cc] ret_from_syscall+0x0/0x34
[ 1930.197313] --- interrupt: c01 at 0x8b5754
                   LR = 0xac0be0
[ 1930.222896] Kernel panic - not syncing: corrupted stack end detected inside
scheduler


But feel free to close this bug if appropriate as the original issue is solved.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox