LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v2 09/10] tools/perf: Add perf tools support for extended register capability in powerpc
From: Athira Rajeev @ 2020-07-13  2:36 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Michael Neuling, maddy, linuxppc-dev
In-Reply-To: <87pn962owo.fsf@mpe.ellerman.id.au>

[-- Attachment #1: Type: text/plain, Size: 6296 bytes --]



> On 08-Jul-2020, at 5:34 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com <mailto:atrajeev@linux.vnet.ibm.com>> writes:
>> From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>> 
>> Add extended regs to sample_reg_mask in the tool side to use
>> with `-I?` option. Perf tools side uses extended mask to display
>> the platform supported register names (with -I? option) to the user
>> and also send this mask to the kernel to capture the extended registers
>> in each sample. Hence decide the mask value based on the processor
>> version.
>> 
>> Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
>> [Decide extended mask at run time based on platform]
>> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>> Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
> 
> Will need an ack from perf tools folks, who are not on Cc by the looks.
> 
>> diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> index f599064..485b1d5 100644
>> --- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> +++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
>> @@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
>> 	PERF_REG_POWERPC_DSISR,
>> 	PERF_REG_POWERPC_SIER,
>> 	PERF_REG_POWERPC_MMCRA,
>> -	PERF_REG_POWERPC_MAX,
>> +	/* Extended registers */
>> +	PERF_REG_POWERPC_MMCR0,
>> +	PERF_REG_POWERPC_MMCR1,
>> +	PERF_REG_POWERPC_MMCR2,
>> +	/* Max regs without the extended regs */
>> +	PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
> 
> I don't really understand this idea of a max that's not the max.

Hi Michael

This is the MAX without extended regs. It is mainly used in `arch/powerpc/perf/perf_regs.c` to define pt_regs_offset (to get the index
for other regs) and is also used to determine whether the requested register is an extended reg while capturing data in a sample
(in `perf_reg_value`).

Thanks
Athira

> 
>> };
>> +
>> +#define PERF_REG_PMU_MASK	((1ULL << PERF_REG_POWERPC_MAX) - 1)
>> +
>> +/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
>> +#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
>> +				- PERF_REG_PMU_MASK)
>> +
>> #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
>> diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
>> index e18a355..46ed00d 100644
>> --- a/tools/perf/arch/powerpc/include/perf_regs.h
>> +++ b/tools/perf/arch/powerpc/include/perf_regs.h
>> @@ -64,7 +64,10 @@
>> 	[PERF_REG_POWERPC_DAR] = "dar",
>> 	[PERF_REG_POWERPC_DSISR] = "dsisr",
>> 	[PERF_REG_POWERPC_SIER] = "sier",
>> -	[PERF_REG_POWERPC_MMCRA] = "mmcra"
>> +	[PERF_REG_POWERPC_MMCRA] = "mmcra",
>> +	[PERF_REG_POWERPC_MMCR0] = "mmcr0",
>> +	[PERF_REG_POWERPC_MMCR1] = "mmcr1",
>> +	[PERF_REG_POWERPC_MMCR2] = "mmcr2",
>> };
>> 
>> static inline const char *perf_reg_name(int id)
>> diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
>> index 0a52429..9179230 100644
>> --- a/tools/perf/arch/powerpc/util/perf_regs.c
>> +++ b/tools/perf/arch/powerpc/util/perf_regs.c
>> @@ -6,9 +6,14 @@
>> 
>> #include "../../../util/perf_regs.h"
>> #include "../../../util/debug.h"
>> +#include "../../../util/event.h"
>> +#include "../../../util/header.h"
>> +#include "../../../perf-sys.h"
>> 
>> #include <linux/kernel.h>
>> 
>> +#define PVR_POWER9		0x004E
>> +
>> const struct sample_reg sample_reg_masks[] = {
>> 	SMPL_REG(r0, PERF_REG_POWERPC_R0),
>> 	SMPL_REG(r1, PERF_REG_POWERPC_R1),
>> @@ -55,6 +60,9 @@
>> 	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
>> 	SMPL_REG(sier, PERF_REG_POWERPC_SIER),
>> 	SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
>> +	SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
>> +	SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
>> +	SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
>> 	SMPL_REG_END
>> };
>> 
>> @@ -163,3 +171,50 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
>> 
>> 	return SDT_ARG_VALID;
>> }
>> +
>> +uint64_t arch__intr_reg_mask(void)
>> +{
>> +	struct perf_event_attr attr = {
>> +		.type                   = PERF_TYPE_HARDWARE,
>> +		.config                 = PERF_COUNT_HW_CPU_CYCLES,
>> +		.sample_type            = PERF_SAMPLE_REGS_INTR,
>> +		.precise_ip             = 1,
>> +		.disabled               = 1,
>> +		.exclude_kernel         = 1,
>> +	};
>> +	int fd, ret;
>> +	char buffer[64];
>> +	u32 version;
>> +	u64 extended_mask = 0;
>> +
>> +	/* Get the PVR value to set the extended
>> +	 * mask specific to platform
> 
> Comment format is wrong, and punctuation please.
> 
>> +	 */
>> +	get_cpuid(buffer, sizeof(buffer));
>> +	ret = sscanf(buffer, "%u,", &version);
> 
> This is powerpc specific code, why not just use mfspr(SPRN_PVR), rather
> than redirecting via printf/sscanf.
> 
>> +
>> +	if (ret != 1) {
>> +		pr_debug("Failed to get the processor version, unable to output extended registers\n");
>> +		return PERF_REGS_MASK;
>> +	}
>> +
>> +	if (version == PVR_POWER9)
>> +		extended_mask = PERF_REG_PMU_MASK_300;
>> +	else
>> +		return PERF_REGS_MASK;
>> +
>> +	attr.sample_regs_intr = extended_mask;
>> +	attr.sample_period = 1;
>> +	event_attr_init(&attr);
>> +
>> +	/*
>> +	 * check if the pmu supports perf extended regs, before
>> +	 * returning the register mask to sample.
>> +	 */
>> +	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
>> +	if (fd != -1) {
>> +		close(fd);
>> +		return (extended_mask | PERF_REGS_MASK);
>> +	}
>> +	return PERF_REGS_MASK;
> 
> I think this would read a bit better like:
> 
> 	mask = PERF_REGS_MASK;
> 
> 	if (version == PVR_POWER9)
> 		extended_mask = PERF_REG_PMU_MASK_300;
>        else
>        	return mask;
> 
>        attr.sample_regs_intr = extended_mask;
>        attr.sample_period = 1;
>        event_attr_init(&attr);
> 
>        /*
>          * check if the pmu supports perf extended regs, before
>          * returning the register mask to sample.
>          */
>        fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
>        if (fd != -1) {
>                close(fd);
>                mask |= extended_mask;
>        }
> 
> 	return mask;
> 
> 
> cheers


[-- Attachment #2: Type: text/html, Size: 43626 bytes --]

^ permalink raw reply

* Re: [PATCH v4 5/7] KVM: PPC: clean up redundant kvm_run parameters in assembly
From: Tianjia Zhang @ 2020-07-13  3:07 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: wanpengli, kvm, david, heiko.carstens, peterx, linux-mips, hpa,
	kvmarm, linux-s390, frankja, chenhuacai, maz, joro, x86,
	borntraeger, mingo, julien.thierry.kdev, thuth, gor,
	suzuki.poulose, kvm-ppc, bp, tglx, linux-arm-kernel, jmattson,
	tsbogend, cohuck, christoffer.dall, sean.j.christopherson,
	linux-kernel, james.morse, pbonzini, vkuznets, linuxppc-dev
In-Reply-To: <20200526055924.GD282305@thinks.paulus.ozlabs.org>



On 2020/5/26 13:59, Paul Mackerras wrote:
> On Mon, Apr 27, 2020 at 12:35:12PM +0800, Tianjia Zhang wrote:
>> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
>> structure. For historical reasons, many kvm-related function parameters
>> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
>> patch does a unified cleanup of these remaining redundant parameters.
> 
> Some of these changes don't look completely correct to me, see below.
> If you're expecting these patches to go through my tree, I can fix up
> the patch and commit it (with you as author), noting the changes I
> made in the commit message.  Do you want me to do that?
> 

I would be very glad for you to do so. Although I have submitted a new version
of the patch, I would still prefer that you fix it up and commit it.

Thanks and best,
Tianjia

>> diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
>> index f7ad99d972ce..0eff749d8027 100644
>> --- a/arch/powerpc/kvm/book3s_interrupts.S
>> +++ b/arch/powerpc/kvm/book3s_interrupts.S
>> @@ -55,8 +55,7 @@
>>    ****************************************************************************/
>>   
>>   /* Registers:
>> - *  r3: kvm_run pointer
>> - *  r4: vcpu pointer
>> + *  r3: vcpu pointer
>>    */
>>   _GLOBAL(__kvmppc_vcpu_run)
>>   
>> @@ -68,8 +67,8 @@ kvm_start_entry:
>>   	/* Save host state to the stack */
>>   	PPC_STLU r1, -SWITCH_FRAME_SIZE(r1)
>>   
>> -	/* Save r3 (kvm_run) and r4 (vcpu) */
>> -	SAVE_2GPRS(3, r1)
>> +	/* Save r3 (vcpu) */
>> +	SAVE_GPR(3, r1)
>>   
>>   	/* Save non-volatile registers (r14 - r31) */
>>   	SAVE_NVGPRS(r1)
>> @@ -82,11 +81,11 @@ kvm_start_entry:
>>   	PPC_STL	r0, _LINK(r1)
>>   
>>   	/* Load non-volatile guest state from the vcpu */
>> -	VCPU_LOAD_NVGPRS(r4)
>> +	VCPU_LOAD_NVGPRS(r3)
>>   
>>   kvm_start_lightweight:
>>   	/* Copy registers into shadow vcpu so we can access them in real mode */
>> -	mr	r3, r4
>> +	mr	r4, r3
> 
> This mr doesn't seem necessary.
> 
>>   	bl	FUNC(kvmppc_copy_to_svcpu)
>>   	nop
>>   	REST_GPR(4, r1)
> 
> This should be loading r4 from GPR3(r1), not GPR4(r1) - which is what
> REST_GPR(4, r1) will do.
> 
> Then, in the file but not in the patch context, there is this line:
> 
> 	PPC_LL	r3, GPR4(r1)		/* vcpu pointer */
> 
> where once again GPR4 needs to be GPR3.
> 
>> @@ -191,10 +190,10 @@ after_sprg3_load:
>>   	PPC_STL	r31, VCPU_GPR(R31)(r7)
>>   
>>   	/* Pass the exit number as 3rd argument to kvmppc_handle_exit */
> 
> The comment should be modified to say "2nd" instead of "3rd",
> otherwise it is confusing.
> 
> The rest of the patch looks OK.
> 
> Paul.
> 

^ permalink raw reply

* [PATCH V5 0/4] mm/debug_vm_pgtable: Add some more tests
From: Anshuman Khandual @ 2020-07-13  3:23 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-doc, Catalin Marinas, Heiko Carstens, Paul Mackerras,
	H. Peter Anvin, agordeev, Will Deacon, linux-riscv, linux-arch,
	linux-s390, Jonathan Corbet, x86, Mike Rapoport,
	Christian Borntraeger, Ingo Molnar, linux-arm-kernel, ziy,
	linux-snps-arc, Vasily Gorbik, Anshuman Khandual, cai,
	Paul Walmsley, Kirill A . Shutemov, Thomas Gleixner,
	gerald.schaefer, christophe.leroy, Vineet Gupta, linux-kernel,
	Palmer Dabbelt, aneesh.kumar, Borislav Petkov, Andrew Morton,
	linuxppc-dev, rppt

This series adds some more arch page table helper validation tests which
are related to core and advanced memory functions. It also adds
documentation listing the expected semantics for all page table helpers, as
suggested by Mike Rapoport previously (https://lkml.org/lkml/2020/1/30/40).

There are many TRANSPARENT_HUGEPAGE and ARCH_HAS_TRANSPARENT_HUGEPAGE_PUD
ifdefs scattered across the test. But consolidating all the fallback stubs
is not very straightforward because ARCH_HAS_TRANSPARENT_HUGEPAGE_PUD is
not explicitly dependent on ARCH_HAS_TRANSPARENT_HUGEPAGE.

Tested on arm64 and x86 platforms, but only build-tested on all other
platforms enabled through ARCH_HAS_DEBUG_VM_PGTABLE, i.e. powerpc, arc, s390.
The following failure on arm64, which was mentioned previously, still exists.
It will be fixed with the upcoming series enabling THP migration on arm64.

WARNING .... mm/debug_vm_pgtable.c:866 debug_vm_pgtable+0x940/0xa54
WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))))

This series is based on v5.8-rc5.

Changes in V5:

- Dropped RANDOM_ORVALUE from hugetlb_advanced_tests()
- Folded in Mike's patch for the rst document
- Fixed typos in the rst document

Changes in V4: (https://patchwork.kernel.org/project/linux-mm/list/?series=313173)

- Replaced READ_ONCE() with ptep_get() while accessing PTE pointers per Christophe
- Fixed function argument alignments per Christophe

Changes in V3: (https://patchwork.kernel.org/project/linux-mm/list/?series=302483)

- Replaced HAVE_ARCH_SOFT_DIRTY with MEM_SOFT_DIRTY
- Added HAVE_ARCH_HUGE_VMAP checks in pxx_huge_tests() per Gerald
- Updated documentation for pmd_thp_tests() per Zi Yan
- Replaced READ_ONCE() with huge_ptep_get() per Gerald
- Added pte_mkhuge() and masking with PMD_MASK per Gerald
- Replaced pte_same() with holding pfn check in pxx_swap_tests()
- Added documentation for all (#ifdef #else #endif) per Gerald
- Updated pmd_protnone_tests() per Gerald
- Updated HugeTLB PTE creation in hugetlb_advanced_tests() per Gerald
- Replaced [pmd|pud]_mknotpresent() with [pmd|pud]_mkinvalid()
- Added has_transparent_hugepage() check for PMD and PUD tests
- Added a patch which debug prints all individual tests being executed
- Updated documentation for renamed [pmd|pud]_mkinvalid() helpers

Changes in V2: (https://patchwork.kernel.org/project/linux-mm/list/?series=260573)

- Dropped CONFIG_ARCH_HAS_PTE_SPECIAL per Christophe
- Dropped CONFIG_NUMA_BALANCING per Christophe
- Dropped CONFIG_HAVE_ARCH_SOFT_DIRTY per Christophe
- Dropped CONFIG_MIGRATION per Christophe
- Replaced CONFIG_S390 with __HAVE_ARCH_PMDP_INVALIDATE
- Moved page allocation & free inside swap_migration_tests() per Christophe
- Added CONFIG_TRANSPARENT_HUGEPAGE to protect pfn_pmd()
- Added CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD to protect pfn_pud()
- Added a patch for other arch advanced page table helper tests
- Added a patch creating a documentation for page table helper semantics

Changes in V1: (https://patchwork.kernel.org/patch/11408253/)

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: linux-snps-arc@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Cc: linux-doc@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org

Anshuman Khandual (4):
  mm/debug_vm_pgtable: Add tests validating arch helpers for core MM features
  mm/debug_vm_pgtable: Add tests validating advanced arch page table helpers
  mm/debug_vm_pgtable: Add debug prints for individual tests
  Documentation/mm: Add descriptions for arch page table helpers

 Documentation/vm/arch_pgtable_helpers.rst | 258 +++++++++
 mm/debug_vm_pgtable.c                     | 666 +++++++++++++++++++++-
 2 files changed, 922 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/vm/arch_pgtable_helpers.rst

-- 
2.20.1


^ permalink raw reply

* [PATCH V5 1/4] mm/debug_vm_pgtable: Add tests validating arch helpers for core MM features
From: Anshuman Khandual @ 2020-07-13  3:23 UTC (permalink / raw)
  To: linux-mm
  Cc: Catalin Marinas, Heiko Carstens, Paul Mackerras, H. Peter Anvin,
	agordeev, Will Deacon, linux-riscv, linux-arch, linux-s390, x86,
	Mike Rapoport, Christian Borntraeger, Ingo Molnar,
	linux-arm-kernel, ziy, linux-snps-arc, Vasily Gorbik,
	Anshuman Khandual, cai, Paul Walmsley, Kirill A . Shutemov,
	Thomas Gleixner, gerald.schaefer, christophe.leroy, Vineet Gupta,
	linux-kernel, Palmer Dabbelt, aneesh.kumar, Borislav Petkov,
	Andrew Morton, linuxppc-dev, rppt
In-Reply-To: <1594610587-4172-1-git-send-email-anshuman.khandual@arm.com>

This adds new tests validating arch page table helpers for the following
core memory features. These tests create and test specific mapping types at
various page table levels.

1. SPECIAL mapping
2. PROTNONE mapping
3. DEVMAP mapping
4. SOFTDIRTY mapping
5. SWAP mapping
6. MIGRATION mapping
7. HUGETLB mapping
8. THP mapping

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: linux-snps-arc@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Tested-by: Vineet Gupta <vgupta@synopsys.com>	#arc
Reviewed-by: Zi Yan <ziy@nvidia.com>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 mm/debug_vm_pgtable.c | 302 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 301 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 61ab16fb2e36..2fac47db3eb7 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -282,6 +282,278 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
 	WARN_ON(pmd_bad(pmd));
 }
 
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
+		return;
+
+	WARN_ON(!pte_special(pte_mkspecial(pte)));
+}
+
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+		return;
+
+	WARN_ON(!pte_protnone(pte));
+	WARN_ON(!pte_present(pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+		return;
+
+	WARN_ON(!pmd_protnone(pmd));
+	WARN_ON(!pmd_present(pmd));
+}
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t pud = pfn_pud(pfn, prot);
+
+	WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
+}
+#else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+		return;
+
+	WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
+	WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
+}
+
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+		return;
+
+	WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
+	WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+		return;
+
+	WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
+	WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
+}
+
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
+		!IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+		return;
+
+	WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
+	WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
+}
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+	swp_entry_t swp;
+	pte_t pte;
+
+	pte = pfn_pte(pfn, prot);
+	swp = __pte_to_swp_entry(pte);
+	pte = __swp_entry_to_pte(swp);
+	WARN_ON(pfn != pte_pfn(pte));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+	swp_entry_t swp;
+	pmd_t pmd;
+
+	pmd = pfn_pmd(pfn, prot);
+	swp = __pmd_to_swp_entry(pmd);
+	pmd = __swp_entry_to_pmd(swp);
+	WARN_ON(pfn != pmd_pfn(pmd));
+}
+#else  /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static void __init swap_migration_tests(void)
+{
+	struct page *page;
+	swp_entry_t swp;
+
+	if (!IS_ENABLED(CONFIG_MIGRATION))
+		return;
+	/*
+	 * swap_migration_tests() requires a dedicated page as it needs to
+	 * be locked before creating a migration entry from it. Locking the
+	 * page that actually maps kernel text ('start_kernel') can be really
+	 * problematic. Let's allocate a dedicated page explicitly for this
+	 * purpose that will be freed subsequently.
+	 */
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		pr_err("page allocation failed\n");
+		return;
+	}
+
+	/*
+	 * make_migration_entry() expects given page to be
+	 * locked, otherwise it stumbles upon a BUG_ON().
+	 */
+	__SetPageLocked(page);
+	swp = make_migration_entry(page, 1);
+	WARN_ON(!is_migration_entry(swp));
+	WARN_ON(!is_write_migration_entry(swp));
+
+	make_migration_entry_read(&swp);
+	WARN_ON(!is_migration_entry(swp));
+	WARN_ON(is_write_migration_entry(swp));
+
+	swp = make_migration_entry(page, 0);
+	WARN_ON(!is_migration_entry(swp));
+	WARN_ON(is_write_migration_entry(swp));
+	__ClearPageLocked(page);
+	__free_page(page);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	struct page *page;
+	pte_t pte;
+
+	/*
+	 * Accessing the page associated with the pfn is safe here,
+	 * as it was previously derived from a real kernel symbol.
+	 */
+	page = pfn_to_page(pfn);
+	pte = mk_huge_pte(page, prot);
+
+	WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
+	WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
+	WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+	pte = pfn_pte(pfn, prot);
+
+	WARN_ON(!pte_huge(pte_mkhuge(pte)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+#else  /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	/*
+	 * pmd_trans_huge() and pmd_present() must return positive after
+	 * MMU invalidation with pmd_mkinvalid(). This behavior is an
+	 * optimization for transparent huge page. pmd_trans_huge() must
+	 * be true if pmd_page() returns a valid THP to avoid taking the
+	 * pmd_lock when others walk over non transhuge pmds (i.e. there
+	 * are no THP allocated). Especially when splitting a THP and
+	 * removing the present bit from the pmd, pmd_trans_huge() still
+	 * needs to return true. pmd_present() should be true whenever
+	 * pmd_trans_huge() returns true.
+	 */
+	pmd = pfn_pmd(pfn, prot);
+	WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+	WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd))));
+	WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t pud;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pud = pfn_pud(pfn, prot);
+	WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
+
+	/*
+	 * pud_mkinvalid() has been dropped for now. Enable back
+	 * these tests when it comes back with a modified pud_present().
+	 *
+	 * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+	 * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+	 */
+}
+#else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 static unsigned long __init get_random_vaddr(void)
 {
 	unsigned long random_vaddr, random_pages, total_user_pages;
@@ -303,7 +575,7 @@ static int __init debug_vm_pgtable(void)
 	pmd_t *pmdp, *saved_pmdp, pmd;
 	pte_t *ptep;
 	pgtable_t saved_ptep;
-	pgprot_t prot;
+	pgprot_t prot, protnone;
 	phys_addr_t paddr;
 	unsigned long vaddr, pte_aligned, pmd_aligned;
 	unsigned long pud_aligned, p4d_aligned, pgd_aligned;
@@ -318,6 +590,12 @@ static int __init debug_vm_pgtable(void)
 		return 1;
 	}
 
+	/*
+	 * __P000 (or even __S000) will help create page table entries with
+	 * PROT_NONE permission as required for pxx_protnone_tests().
+	 */
+	protnone = __P000;
+
 	/*
 	 * PFN for mapping at PTE level is determined from a standard kernel
 	 * text symbol. But pfns for higher page table levels are derived by
@@ -373,6 +651,28 @@ static int __init debug_vm_pgtable(void)
 	p4d_populate_tests(mm, p4dp, saved_pudp);
 	pgd_populate_tests(mm, pgdp, saved_p4dp);
 
+	pte_special_tests(pte_aligned, prot);
+	pte_protnone_tests(pte_aligned, protnone);
+	pmd_protnone_tests(pmd_aligned, protnone);
+
+	pte_devmap_tests(pte_aligned, prot);
+	pmd_devmap_tests(pmd_aligned, prot);
+	pud_devmap_tests(pud_aligned, prot);
+
+	pte_soft_dirty_tests(pte_aligned, prot);
+	pmd_soft_dirty_tests(pmd_aligned, prot);
+	pte_swap_soft_dirty_tests(pte_aligned, prot);
+	pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+	pte_swap_tests(pte_aligned, prot);
+	pmd_swap_tests(pmd_aligned, prot);
+
+	swap_migration_tests();
+	hugetlb_basic_tests(pte_aligned, prot);
+
+	pmd_thp_tests(pmd_aligned, prot);
+	pud_thp_tests(pud_aligned, prot);
+
 	p4d_free(mm, saved_p4dp);
 	pud_free(mm, saved_pudp);
 	pmd_free(mm, saved_pmdp);
-- 
2.20.1


^ permalink raw reply related

* [PATCH V5 2/4] mm/debug_vm_pgtable: Add tests validating advanced arch page table helpers
From: Anshuman Khandual @ 2020-07-13  3:23 UTC (permalink / raw)
  To: linux-mm
  Cc: Catalin Marinas, Heiko Carstens, Paul Mackerras, H. Peter Anvin,
	agordeev, Will Deacon, linux-riscv, linux-arch, linux-s390, x86,
	Mike Rapoport, Christian Borntraeger, Ingo Molnar,
	linux-arm-kernel, ziy, linux-snps-arc, Vasily Gorbik,
	Anshuman Khandual, cai, Paul Walmsley, Kirill A . Shutemov,
	Thomas Gleixner, gerald.schaefer, christophe.leroy, Vineet Gupta,
	linux-kernel, Palmer Dabbelt, aneesh.kumar, Borislav Petkov,
	Andrew Morton, linuxppc-dev, rppt
In-Reply-To: <1594610587-4172-1-git-send-email-anshuman.khandual@arm.com>

This adds new tests validating the following advanced arch page table
helpers. These tests create and test specific mapping types at various page
table levels.

1. pxxp_set_wrprotect()
2. pxxp_get_and_clear()
3. pxxp_set_access_flags()
4. pxxp_get_and_clear_full()
5. pxxp_test_and_clear_young()
6. pxx_leaf()
7. pxx_set_huge()
8. pxx_(clear|mk)_savedwrite()
9. huge_pxxp_xxx()

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: linux-snps-arc@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Tested-by: Vineet Gupta <vgupta@synopsys.com>	#arc
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 mm/debug_vm_pgtable.c | 312 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 312 insertions(+)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 2fac47db3eb7..9c7c11eecf17 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/pfn_t.h>
 #include <linux/printk.h>
+#include <linux/pgtable.h>
 #include <linux/random.h>
 #include <linux/spinlock.h>
 #include <linux/swap.h>
@@ -28,6 +29,7 @@
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
 #include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
 
 #define VMFLAGS	(VM_READ|VM_WRITE|VM_EXEC)
 
@@ -55,6 +57,55 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
 }
 
+static void __init pte_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pte_t *ptep,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_set_wrprotect(mm, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(pte_write(pte));
+
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_get_and_clear(mm, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(!pte_none(pte));
+
+	pte = pfn_pte(pfn, prot);
+	pte = pte_wrprotect(pte);
+	pte = pte_mkclean(pte);
+	set_pte_at(mm, vaddr, ptep, pte);
+	pte = pte_mkwrite(pte);
+	pte = pte_mkdirty(pte);
+	ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+	pte = ptep_get(ptep);
+	WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+	pte = ptep_get(ptep);
+	WARN_ON(!pte_none(pte));
+
+	pte = pte_mkyoung(pte);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_test_and_clear_young(vma, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(pte_young(pte));
+}
+
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t pte = pfn_pte(pfn, prot);
+
+	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
+	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -77,6 +128,90 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
 }
 
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pmd_t *pmdp,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	if (!has_transparent_hugepage())
+		return;
+
+	/* Align the address wrt HPAGE_PMD_SIZE */
+	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+
+	pmd = pfn_pmd(pfn, prot);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_set_wrprotect(mm, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(pmd_write(pmd));
+
+	pmd = pfn_pmd(pfn, prot);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(pmd));
+
+	pmd = pfn_pmd(pfn, prot);
+	pmd = pmd_wrprotect(pmd);
+	pmd = pmd_mkclean(pmd);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmd = pmd_mkwrite(pmd);
+	pmd = pmd_mkdirty(pmd);
+	pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(pmd));
+
+	pmd = pmd_mkyoung(pmd);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_test_and_clear_young(vma, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(pmd_young(pmd));
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	/*
+	 * PMD based THP is a leaf entry.
+	 */
+	pmd = pmd_mkhuge(pmd);
+	WARN_ON(!pmd_leaf(pmd));
+}
+
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd;
+
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+		return;
+	/*
+	 * X86 defined pmd_set_huge() verifies that the given
+	 * PMD is not a populated non-leaf entry.
+	 */
+	WRITE_ONCE(*pmdp, __pmd(0));
+	WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+	WARN_ON(!pmd_clear_huge(pmdp));
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t pmd = pfn_pmd(pfn, prot);
+
+	WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
+	WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
+}
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
 {
@@ -100,12 +235,119 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
 	 */
 	WARN_ON(!pud_bad(pud_mkhuge(pud)));
 }
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pud_t *pudp,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+	pud_t pud = pfn_pud(pfn, prot);
+
+	if (!has_transparent_hugepage())
+		return;
+
+	/* Align the address wrt HPAGE_PUD_SIZE */
+	vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+
+	set_pud_at(mm, vaddr, pudp, pud);
+	pudp_set_wrprotect(mm, vaddr, pudp);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(pud_write(pud));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+	pud = pfn_pud(pfn, prot);
+	set_pud_at(mm, vaddr, pudp, pud);
+	pudp_huge_get_and_clear(mm, vaddr, pudp);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(pud));
+
+	pud = pfn_pud(pfn, prot);
+	set_pud_at(mm, vaddr, pudp, pud);
+	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+	pud = pfn_pud(pfn, prot);
+	pud = pud_wrprotect(pud);
+	pud = pud_mkclean(pud);
+	set_pud_at(mm, vaddr, pudp, pud);
+	pud = pud_mkwrite(pud);
+	pud = pud_mkdirty(pud);
+	pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+
+	pud = pud_mkyoung(pud);
+	set_pud_at(mm, vaddr, pudp, pud);
+	pudp_test_and_clear_young(vma, vaddr, pudp);
+	pud = READ_ONCE(*pudp);
+	WARN_ON(pud_young(pud));
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t pud = pfn_pud(pfn, prot);
+
+	/*
+	 * PUD based THP is a leaf entry.
+	 */
+	pud = pud_mkhuge(pud);
+	WARN_ON(!pud_leaf(pud));
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+	pud_t pud;
+
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+		return;
+	/*
+	 * X86 defined pud_set_huge() verifies that the given
+	 * PUD is not a populated non-leaf entry.
+	 */
+	WRITE_ONCE(*pudp, __pud(0));
+	WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+	WARN_ON(!pud_clear_huge(pudp));
+	pud = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(pud));
+}
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pud_t *pudp,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+}
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #else  /* !CONFIG_TRANSPARENT_HUGEPAGE */
 static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
 static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pmd_t *pmdp,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+}
+static void __init pud_advanced_tests(struct mm_struct *mm,
+				      struct vm_area_struct *vma, pud_t *pudp,
+				      unsigned long pfn, unsigned long vaddr,
+				      pgprot_t prot)
+{
+}
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -495,8 +737,56 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
 	WARN_ON(!pte_huge(pte_mkhuge(pte)));
 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
 }
+
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+					  struct vm_area_struct *vma,
+					  pte_t *ptep, unsigned long pfn,
+					  unsigned long vaddr, pgprot_t prot)
+{
+	struct page *page = pfn_to_page(pfn);
+	pte_t pte = ptep_get(ptep);
+	unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
+
+	pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
+	set_huge_pte_at(mm, vaddr, ptep, pte);
+	barrier();
+	WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
+	huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
+	pte = huge_ptep_get(ptep);
+	WARN_ON(!huge_pte_none(pte));
+
+	pte = mk_huge_pte(page, prot);
+	set_huge_pte_at(mm, vaddr, ptep, pte);
+	barrier();
+	huge_ptep_set_wrprotect(mm, vaddr, ptep);
+	pte = huge_ptep_get(ptep);
+	WARN_ON(huge_pte_write(pte));
+
+	pte = mk_huge_pte(page, prot);
+	set_huge_pte_at(mm, vaddr, ptep, pte);
+	barrier();
+	huge_ptep_get_and_clear(mm, vaddr, ptep);
+	pte = huge_ptep_get(ptep);
+	WARN_ON(!huge_pte_none(pte));
+
+	pte = mk_huge_pte(page, prot);
+	pte = huge_pte_wrprotect(pte);
+	set_huge_pte_at(mm, vaddr, ptep, pte);
+	barrier();
+	pte = huge_pte_mkwrite(pte);
+	pte = huge_pte_mkdirty(pte);
+	huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+	pte = huge_ptep_get(ptep);
+	WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
+}
 #else  /* !CONFIG_HUGETLB_PAGE */
 static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+					  struct vm_area_struct *vma,
+					  pte_t *ptep, unsigned long pfn,
+					  unsigned long vaddr, pgprot_t prot)
+{
+}
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -568,6 +858,7 @@ static unsigned long __init get_random_vaddr(void)
 
 static int __init debug_vm_pgtable(void)
 {
+	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 	pgd_t *pgdp;
 	p4d_t *p4dp, *saved_p4dp;
@@ -596,6 +887,12 @@ static int __init debug_vm_pgtable(void)
 	 */
 	protnone = __P000;
 
+	vma = vm_area_alloc(mm);
+	if (!vma) {
+		pr_err("vma allocation failed\n");
+		return 1;
+	}
+
 	/*
 	 * PFN for mapping at PTE level is determined from a standard kernel
 	 * text symbol. But pfns for higher page table levels are derived by
@@ -644,6 +941,20 @@ static int __init debug_vm_pgtable(void)
 	p4d_clear_tests(mm, p4dp);
 	pgd_clear_tests(mm, pgdp);
 
+	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+	pmd_leaf_tests(pmd_aligned, prot);
+	pud_leaf_tests(pud_aligned, prot);
+
+	pmd_huge_tests(pmdp, pmd_aligned, prot);
+	pud_huge_tests(pudp, pud_aligned, prot);
+
+	pte_savedwrite_tests(pte_aligned, prot);
+	pmd_savedwrite_tests(pmd_aligned, prot);
+
 	pte_unmap_unlock(ptep, ptl);
 
 	pmd_populate_tests(mm, pmdp, saved_ptep);
@@ -678,6 +989,7 @@ static int __init debug_vm_pgtable(void)
 	pmd_free(mm, saved_pmdp);
 	pte_free(mm, saved_ptep);
 
+	vm_area_free(vma);
 	mm_dec_nr_puds(mm);
 	mm_dec_nr_pmds(mm);
 	mm_dec_nr_ptes(mm);
-- 
2.20.1


^ permalink raw reply related

* [PATCH V5 3/4] mm/debug_vm_pgtable: Add debug prints for individual tests
From: Anshuman Khandual @ 2020-07-13  3:23 UTC (permalink / raw)
  To: linux-mm
  Cc: Catalin Marinas, Heiko Carstens, Paul Mackerras, H. Peter Anvin,
	agordeev, Will Deacon, linux-riscv, linux-arch, linux-s390, x86,
	Mike Rapoport, Christian Borntraeger, Ingo Molnar,
	linux-arm-kernel, ziy, linux-snps-arc, Vasily Gorbik,
	Anshuman Khandual, cai, Paul Walmsley, Kirill A . Shutemov,
	Thomas Gleixner, gerald.schaefer, christophe.leroy, Vineet Gupta,
	linux-kernel, Palmer Dabbelt, aneesh.kumar, Borislav Petkov,
	Andrew Morton, linuxppc-dev, rppt
In-Reply-To: <1594610587-4172-1-git-send-email-anshuman.khandual@arm.com>

This adds debug prints that list all the tests being executed on a given
platform. With dynamic debug enabled, the following information will be
printed during boot. For compactness, both the timestamp and the prefix
(i.e. debug_vm_pgtable) have been dropped from this sample output.

[debug_vm_pgtable      ]: Validating architecture page table helpers
[pte_basic_tests       ]: Validating PTE basic
[pmd_basic_tests       ]: Validating PMD basic
[p4d_basic_tests       ]: Validating P4D basic
[pgd_basic_tests       ]: Validating PGD basic
[pte_clear_tests       ]: Validating PTE clear
[pmd_clear_tests       ]: Validating PMD clear
[pte_advanced_tests    ]: Validating PTE advanced
[pmd_advanced_tests    ]: Validating PMD advanced
[hugetlb_advanced_tests]: Validating HugeTLB advanced
[pmd_leaf_tests        ]: Validating PMD leaf
[pmd_huge_tests        ]: Validating PMD huge
[pte_savedwrite_tests  ]: Validating PTE saved write
[pmd_savedwrite_tests  ]: Validating PMD saved write
[pmd_populate_tests    ]: Validating PMD populate
[pte_special_tests     ]: Validating PTE special
[pte_protnone_tests    ]: Validating PTE protnone
[pmd_protnone_tests    ]: Validating PMD protnone
[pte_devmap_tests      ]: Validating PTE devmap
[pmd_devmap_tests      ]: Validating PMD devmap
[pte_swap_tests        ]: Validating PTE swap
[swap_migration_tests  ]: Validating swap migration
[hugetlb_basic_tests   ]: Validating HugeTLB basic
[pmd_thp_tests         ]: Validating PMD based THP

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: linux-snps-arc@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: x86@kernel.org
Cc: linux-mm@kvack.org
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Tested-by: Vineet Gupta <vgupta@synopsys.com>	#arc
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 mm/debug_vm_pgtable.c | 46 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 9c7c11eecf17..0db4390435be 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -8,7 +8,7 @@
  *
  * Author: Anshuman Khandual <anshuman.khandual@arm.com>
  */
-#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
 
 #include <linux/gfp.h>
 #include <linux/highmem.h>
@@ -48,6 +48,7 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	pr_debug("Validating PTE basic\n");
 	WARN_ON(!pte_same(pte, pte));
 	WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
 	WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -64,6 +65,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	pr_debug("Validating PTE advanced\n");
 	pte = pfn_pte(pfn, prot);
 	set_pte_at(mm, vaddr, ptep, pte);
 	ptep_set_wrprotect(mm, vaddr, ptep);
@@ -103,6 +105,7 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	pr_debug("Validating PTE saved write\n");
 	WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
 	WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
 }
@@ -114,6 +117,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PMD basic\n");
 	WARN_ON(!pmd_same(pmd, pmd));
 	WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
 	WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -138,6 +142,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PMD advanced\n");
 	/* Align the address wrt HPAGE_PMD_SIZE */
 	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
 
@@ -180,6 +185,7 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd = pfn_pmd(pfn, prot);
 
+	pr_debug("Validating PMD leaf\n");
 	/*
 	 * PMD based THP is a leaf entry.
 	 */
@@ -193,6 +199,8 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
 
 	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
 		return;
+
+	pr_debug("Validating PMD huge\n");
 	/*
 	 * X86 defined pmd_set_huge() verifies that the given
 	 * PMD is not a populated non-leaf entry.
@@ -208,6 +216,7 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd = pfn_pmd(pfn, prot);
 
+	pr_debug("Validating PMD saved write\n");
 	WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
 	WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
 }
@@ -220,6 +229,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PUD basic\n");
 	WARN_ON(!pud_same(pud, pud));
 	WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
 	WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
@@ -246,6 +256,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PUD advanced\n");
 	/* Align the address wrt HPAGE_PUD_SIZE */
 	vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
 
@@ -288,6 +299,7 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
 {
 	pud_t pud = pfn_pud(pfn, prot);
 
+	pr_debug("Validating PUD leaf\n");
 	/*
 	 * PUD based THP is a leaf entry.
 	 */
@@ -301,6 +313,8 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
 
 	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
 		return;
+
+	pr_debug("Validating PUD huge\n");
 	/*
 	 * X86 defined pud_set_huge() verifies that the given
 	 * PUD is not a populated non-leaf entry.
@@ -354,6 +368,7 @@ static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
 {
 	p4d_t p4d;
 
+	pr_debug("Validating P4D basic\n");
 	memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
 	WARN_ON(!p4d_same(p4d, p4d));
 }
@@ -362,6 +377,7 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
 {
 	pgd_t pgd;
 
+	pr_debug("Validating PGD basic\n");
 	memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
 	WARN_ON(!pgd_same(pgd, pgd));
 }
@@ -374,6 +390,7 @@ static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
 	if (mm_pmd_folded(mm))
 		return;
 
+	pr_debug("Validating PUD clear\n");
 	pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
 	WRITE_ONCE(*pudp, pud);
 	pud_clear(pudp);
@@ -388,6 +405,8 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
 
 	if (mm_pmd_folded(mm))
 		return;
+
+	pr_debug("Validating PUD populate\n");
 	/*
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as pud_bad().
@@ -414,6 +433,7 @@ static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
 	if (mm_pud_folded(mm))
 		return;
 
+	pr_debug("Validating P4D clear\n");
 	p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
 	WRITE_ONCE(*p4dp, p4d);
 	p4d_clear(p4dp);
@@ -429,6 +449,7 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
 	if (mm_pud_folded(mm))
 		return;
 
+	pr_debug("Validating P4D populate\n");
 	/*
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as p4d_bad().
@@ -447,6 +468,7 @@ static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
 	if (mm_p4d_folded(mm))
 		return;
 
+	pr_debug("Validating PGD clear\n");
 	pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
 	WRITE_ONCE(*pgdp, pgd);
 	pgd_clear(pgdp);
@@ -462,6 +484,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
 	if (mm_p4d_folded(mm))
 		return;
 
+	pr_debug("Validating PGD populate\n");
 	/*
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as pgd_bad().
@@ -490,6 +513,7 @@ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
 {
 	pte_t pte = ptep_get(ptep);
 
+	pr_debug("Validating PTE clear\n");
 	pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
 	set_pte_at(mm, vaddr, ptep, pte);
 	barrier();
@@ -502,6 +526,7 @@ static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
 {
 	pmd_t pmd = READ_ONCE(*pmdp);
 
+	pr_debug("Validating PMD clear\n");
 	pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
 	WRITE_ONCE(*pmdp, pmd);
 	pmd_clear(pmdp);
@@ -514,6 +539,7 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
 {
 	pmd_t pmd;
 
+	pr_debug("Validating PMD populate\n");
 	/*
 	 * This entry points to next level page table page.
 	 * Hence this must not qualify as pmd_bad().
@@ -531,6 +557,7 @@ static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
 		return;
 
+	pr_debug("Validating PTE special\n");
 	WARN_ON(!pte_special(pte_mkspecial(pte)));
 }
 
@@ -541,6 +568,7 @@ static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
 		return;
 
+	pr_debug("Validating PTE protnone\n");
 	WARN_ON(!pte_protnone(pte));
 	WARN_ON(!pte_present(pte));
 }
@@ -553,6 +581,7 @@ static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
 		return;
 
+	pr_debug("Validating PMD protnone\n");
 	WARN_ON(!pmd_protnone(pmd));
 	WARN_ON(!pmd_present(pmd));
 }
@@ -565,6 +594,7 @@ static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
 {
 	pte_t pte = pfn_pte(pfn, prot);
 
+	pr_debug("Validating PTE devmap\n");
 	WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
 }
 
@@ -573,6 +603,7 @@ static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
 {
 	pmd_t pmd = pfn_pmd(pfn, prot);
 
+	pr_debug("Validating PMD devmap\n");
 	WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
 }
 
@@ -581,6 +612,7 @@ static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
 {
 	pud_t pud = pfn_pud(pfn, prot);
 
+	pr_debug("Validating PUD devmap\n");
 	WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
 }
 #else  /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -603,6 +635,7 @@ static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
 		return;
 
+	pr_debug("Validating PTE soft dirty\n");
 	WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
 	WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
 }
@@ -614,6 +647,7 @@ static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
 		return;
 
+	pr_debug("Validating PTE swap soft dirty\n");
 	WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
 	WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
 }
@@ -626,6 +660,7 @@ static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 	if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
 		return;
 
+	pr_debug("Validating PMD soft dirty\n");
 	WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
 	WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
 }
@@ -638,6 +673,7 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
 		!IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
 		return;
 
+	pr_debug("Validating PMD swap soft dirty\n");
 	WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
 	WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
 }
@@ -653,6 +689,7 @@ static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
 	swp_entry_t swp;
 	pte_t pte;
 
+	pr_debug("Validating PTE swap\n");
 	pte = pfn_pte(pfn, prot);
 	swp = __pte_to_swp_entry(pte);
 	pte = __swp_entry_to_pte(swp);
@@ -665,6 +702,7 @@ static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
 	swp_entry_t swp;
 	pmd_t pmd;
 
+	pr_debug("Validating PMD swap\n");
 	pmd = pfn_pmd(pfn, prot);
 	swp = __pmd_to_swp_entry(pmd);
 	pmd = __swp_entry_to_pmd(swp);
@@ -681,6 +719,8 @@ static void __init swap_migration_tests(void)
 
 	if (!IS_ENABLED(CONFIG_MIGRATION))
 		return;
+
+	pr_debug("Validating swap migration\n");
 	/*
 	 * swap_migration_tests() requires a dedicated page as it needs to
 	 * be locked before creating a migration entry from it. Locking the
@@ -720,6 +760,7 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
 	struct page *page;
 	pte_t pte;
 
+	pr_debug("Validating HugeTLB basic\n");
 	/*
 	 * Accessing the page associated with the pfn is safe here,
 	 * as it was previously derived from a real kernel symbol.
@@ -747,6 +788,7 @@ static void __init hugetlb_advanced_tests(struct mm_struct *mm,
 	pte_t pte = ptep_get(ptep);
 	unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
 
+	pr_debug("Validating HugeTLB advanced\n");
 	pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
 	set_huge_pte_at(mm, vaddr, ptep, pte);
 	barrier();
@@ -797,6 +839,7 @@ static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PMD based THP\n");
 	/*
 	 * pmd_trans_huge() and pmd_present() must return positive after
 	 * MMU invalidation with pmd_mkinvalid(). This behavior is an
@@ -825,6 +868,7 @@ static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
 	if (!has_transparent_hugepage())
 		return;
 
+	pr_debug("Validating PUD based THP\n");
 	pud = pfn_pud(pfn, prot);
 	WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH V5 4/4] Documentation/mm: Add descriptions for arch page table helpers
From: Anshuman Khandual @ 2020-07-13  3:23 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-doc, Catalin Marinas, Heiko Carstens, Paul Mackerras,
	H. Peter Anvin, agordeev, Will Deacon, linux-riscv, linux-arch,
	linux-s390, Jonathan Corbet, x86, Mike Rapoport,
	Christian Borntraeger, Ingo Molnar, linux-arm-kernel, ziy,
	linux-snps-arc, Vasily Gorbik, Anshuman Khandual, cai,
	Paul Walmsley, Kirill A . Shutemov, Thomas Gleixner,
	gerald.schaefer, christophe.leroy, Vineet Gupta, linux-kernel,
	Palmer Dabbelt, aneesh.kumar, Borislav Petkov, Andrew Morton,
	linuxppc-dev, rppt
In-Reply-To: <1594610587-4172-1-git-send-email-anshuman.khandual@arm.com>

This adds a dedicated description file for all arch page table helpers,
in sync with the semantics being tested via CONFIG_DEBUG_VM_PGTABLE. Any
future change to either these descriptions or the debug test should keep
the two in sync.

Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: linux-snps-arc@lists.infradead.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s390@vger.kernel.org
Cc: linux-riscv@lists.infradead.org
Cc: x86@kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Suggested-by: Mike Rapoport <rppt@kernel.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 Documentation/vm/arch_pgtable_helpers.rst | 258 ++++++++++++++++++++++
 mm/debug_vm_pgtable.c                     |   6 +
 2 files changed, 264 insertions(+)
 create mode 100644 Documentation/vm/arch_pgtable_helpers.rst

diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst
new file mode 100644
index 000000000000..f3591ee3aaa8
--- /dev/null
+++ b/Documentation/vm/arch_pgtable_helpers.rst
@@ -0,0 +1,258 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. _arch_page_table_helpers:
+
+===============================
+Architecture Page Table Helpers
+===============================
+
+Generic MM expects architectures (with MMU) to provide helpers to create, access
+and modify page table entries at various levels for different memory functions.
+These page table helpers need to conform to common semantics across platforms.
+The following tables describe the expected semantics, which can also be tested
+during boot via the CONFIG_DEBUG_VM_PGTABLE option. All future changes in here
+or the debug test need to be in sync.
+
+======================
+PTE Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pte_same                  | Tests whether both PTE entries are the same      |
++---------------------------+--------------------------------------------------+
+| pte_bad                   | Tests a non-table mapped PTE                     |
++---------------------------+--------------------------------------------------+
+| pte_present               | Tests a valid mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_young                 | Tests a young PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_dirty                 | Tests a dirty PTE                                |
++---------------------------+--------------------------------------------------+
+| pte_write                 | Tests a writable PTE                             |
++---------------------------+--------------------------------------------------+
+| pte_special               | Tests a special PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_protnone              | Tests a PROT_NONE PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_devmap                | Tests a ZONE_DEVICE mapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_soft_dirty            | Tests a soft dirty PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_swp_soft_dirty        | Tests a soft dirty swapped PTE                   |
++---------------------------+--------------------------------------------------+
+| pte_mkyoung               | Creates a young PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkold                 | Creates an old PTE                               |
++---------------------------+--------------------------------------------------+
+| pte_mkdirty               | Creates a dirty PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkclean               | Creates a clean PTE                              |
++---------------------------+--------------------------------------------------+
+| pte_mkwrite               | Creates a writable PTE                           |
++---------------------------+--------------------------------------------------+
+| pte_mkwrprotect           | Creates a write protected PTE                    |
++---------------------------+--------------------------------------------------+
+| pte_mkspecial             | Creates a special PTE                            |
++---------------------------+--------------------------------------------------+
+| pte_mkdevmap              | Creates a ZONE_DEVICE mapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_mksoft_dirty          | Creates a soft dirty PTE                         |
++---------------------------+--------------------------------------------------+
+| pte_clear_soft_dirty      | Clears a soft dirty PTE                          |
++---------------------------+--------------------------------------------------+
+| pte_swp_mksoft_dirty      | Creates a soft dirty swapped PTE                 |
++---------------------------+--------------------------------------------------+
+| pte_swp_clear_soft_dirty  | Clears a soft dirty swapped PTE                  |
++---------------------------+--------------------------------------------------+
+| pte_mknotpresent          | Invalidates a mapped PTE                         |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear        | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_get_and_clear_full   | Clears a PTE                                     |
++---------------------------+--------------------------------------------------+
+| ptep_test_and_clear_young | Clears young from a PTE                          |
++---------------------------+--------------------------------------------------+
+| ptep_set_wrprotect        | Converts into a write protected PTE              |
++---------------------------+--------------------------------------------------+
+| ptep_set_access_flags     | Converts into a more permissive PTE              |
++---------------------------+--------------------------------------------------+
+
+======================
+PMD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pmd_same                  | Tests whether both PMD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pmd_bad                   | Tests a non-table mapped PMD                     |
++---------------------------+--------------------------------------------------+
+| pmd_leaf                  | Tests a leaf mapped PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_huge                  | Tests a HugeTLB mapped PMD                       |
++---------------------------+--------------------------------------------------+
+| pmd_trans_huge            | Tests a Transparent Huge Page (THP) at PMD       |
++---------------------------+--------------------------------------------------+
+| pmd_present               | Tests a valid mapped PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_young                 | Tests a young PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_dirty                 | Tests a dirty PMD                                |
++---------------------------+--------------------------------------------------+
+| pmd_write                 | Tests a writable PMD                             |
++---------------------------+--------------------------------------------------+
+| pmd_special               | Tests a special PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_protnone              | Tests a PROT_NONE PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_devmap                | Tests a ZONE_DEVICE mapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_soft_dirty            | Tests a soft dirty PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_swp_soft_dirty        | Tests a soft dirty swapped PMD                   |
++---------------------------+--------------------------------------------------+
+| pmd_mkyoung               | Creates a young PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkold                 | Creates an old PMD                               |
++---------------------------+--------------------------------------------------+
+| pmd_mkdirty               | Creates a dirty PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkclean               | Creates a clean PMD                              |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrite               | Creates a writable PMD                           |
++---------------------------+--------------------------------------------------+
+| pmd_mkwrprotect           | Creates a write protected PMD                    |
++---------------------------+--------------------------------------------------+
+| pmd_mkspecial             | Creates a special PMD                            |
++---------------------------+--------------------------------------------------+
+| pmd_mkdevmap              | Creates a ZONE_DEVICE mapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_mksoft_dirty          | Creates a soft dirty PMD                         |
++---------------------------+--------------------------------------------------+
+| pmd_clear_soft_dirty      | Clears a soft dirty PMD                          |
++---------------------------+--------------------------------------------------+
+| pmd_swp_mksoft_dirty      | Creates a soft dirty swapped PMD                 |
++---------------------------+--------------------------------------------------+
+| pmd_swp_clear_soft_dirty  | Clears a soft dirty swapped PMD                  |
++---------------------------+--------------------------------------------------+
+| pmd_mkinvalid             | Invalidates a mapped PMD [1]                     |
++---------------------------+--------------------------------------------------+
+| pmd_set_huge              | Creates a PMD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pmd_clear_huge            | Clears a PMD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear        | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_get_and_clear_full   | Clears a PMD                                     |
++---------------------------+--------------------------------------------------+
+| pmdp_test_and_clear_young | Clears young from a PMD                          |
++---------------------------+--------------------------------------------------+
+| pmdp_set_wrprotect        | Converts into a write protected PMD              |
++---------------------------+--------------------------------------------------+
+| pmdp_set_access_flags     | Converts into a more permissive PMD              |
++---------------------------+--------------------------------------------------+
+
+======================
+PUD Page Table Helpers
+======================
+
++---------------------------+--------------------------------------------------+
+| pud_same                  | Tests whether both PUD entries are the same      |
++---------------------------+--------------------------------------------------+
+| pud_bad                   | Tests a non-table mapped PUD                     |
++---------------------------+--------------------------------------------------+
+| pud_leaf                  | Tests a leaf mapped PUD                          |
++---------------------------+--------------------------------------------------+
+| pud_huge                  | Tests a HugeTLB mapped PUD                       |
++---------------------------+--------------------------------------------------+
+| pud_trans_huge            | Tests a Transparent Huge Page (THP) at PUD       |
++---------------------------+--------------------------------------------------+
+| pud_present               | Tests a valid mapped PUD                         |
++---------------------------+--------------------------------------------------+
+| pud_young                 | Tests a young PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_dirty                 | Tests a dirty PUD                                |
++---------------------------+--------------------------------------------------+
+| pud_write                 | Tests a writable PUD                             |
++---------------------------+--------------------------------------------------+
+| pud_devmap                | Tests a ZONE_DEVICE mapped PUD                   |
++---------------------------+--------------------------------------------------+
+| pud_mkyoung               | Creates a young PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkold                 | Creates an old PUD                               |
++---------------------------+--------------------------------------------------+
+| pud_mkdirty               | Creates a dirty PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkclean               | Creates a clean PUD                              |
++---------------------------+--------------------------------------------------+
+| pud_mkwrite               | Creates a writable PUD                           |
++---------------------------+--------------------------------------------------+
+| pud_mkwrprotect           | Creates a write protected PUD                    |
++---------------------------+--------------------------------------------------+
+| pud_mkdevmap              | Creates a ZONE_DEVICE mapped PUD                 |
++---------------------------+--------------------------------------------------+
+| pud_mkinvalid             | Invalidates a mapped PUD [1]                     |
++---------------------------+--------------------------------------------------+
+| pud_set_huge              | Creates a PUD huge mapping                       |
++---------------------------+--------------------------------------------------+
+| pud_clear_huge            | Clears a PUD huge mapping                        |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear        | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_get_and_clear_full   | Clears a PUD                                     |
++---------------------------+--------------------------------------------------+
+| pudp_test_and_clear_young | Clears young from a PUD                          |
++---------------------------+--------------------------------------------------+
+| pudp_set_wrprotect        | Converts into a write protected PUD              |
++---------------------------+--------------------------------------------------+
+| pudp_set_access_flags     | Converts into a more permissive PUD              |
++---------------------------+--------------------------------------------------+
+
+==========================
+HugeTLB Page Table Helpers
+==========================
+
++----------------------------+--------------------------------------------------+
+| pte_huge                   | Tests a HugeTLB                                  |
++----------------------------+--------------------------------------------------+
+| pte_mkhuge                 | Creates a HugeTLB                                |
++----------------------------+--------------------------------------------------+
+| huge_pte_dirty             | Tests a dirty HugeTLB                            |
++----------------------------+--------------------------------------------------+
+| huge_pte_write             | Tests a writable HugeTLB                         |
++----------------------------+--------------------------------------------------+
+| huge_pte_mkdirty           | Creates a dirty HugeTLB                          |
++----------------------------+--------------------------------------------------+
+| huge_pte_mkwrite           | Creates a writable HugeTLB                       |
++----------------------------+--------------------------------------------------+
+| huge_pte_mkwrprotect       | Creates a write protected HugeTLB                |
++----------------------------+--------------------------------------------------+
+| huge_ptep_get_and_clear    | Clears a HugeTLB                                 |
++----------------------------+--------------------------------------------------+
+| huge_ptep_set_wrprotect    | Converts into a write protected HugeTLB          |
++----------------------------+--------------------------------------------------+
+| huge_ptep_set_access_flags | Converts into a more permissive HugeTLB          |
++----------------------------+--------------------------------------------------+
+
+========================
+SWAP Page Table Helpers
+========================
+
++---------------------------+--------------------------------------------------+
+| __pte_to_swp_entry        | Creates a swapped entry (arch) from a mapped PTE |
++---------------------------+--------------------------------------------------+
+| __swp_to_pte_entry        | Creates a mapped PTE from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| __pmd_to_swp_entry        | Creates a swapped entry (arch) from a mapped PMD |
++---------------------------+--------------------------------------------------+
+| __swp_to_pmd_entry        | Creates a mapped PMD from a swapped entry (arch) |
++---------------------------+--------------------------------------------------+
+| is_migration_entry        | Tests a migration (read or write) swapped entry  |
++---------------------------+--------------------------------------------------+
+| is_write_migration_entry  | Tests a write migration swapped entry            |
++---------------------------+--------------------------------------------------+
+| make_migration_entry_read | Converts into read migration swapped entry       |
++---------------------------+--------------------------------------------------+
+| make_migration_entry      | Creates a migration swapped entry (read or write)|
++---------------------------+--------------------------------------------------+
+
+[1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 0db4390435be..e86c3d824693 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -31,6 +31,12 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
+/*
+ * Please refer to Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * expectations that are being validated here. All future changes in here
+ * or the documentation need to be in sync.
+ */
+
 #define VMFLAGS	(VM_READ|VM_WRITE|VM_EXEC)
 
 /*
-- 
2.20.1


^ permalink raw reply related

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-13  4:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrVqHDLo09HcaoeOoAVK8w+cNWkSNTLkDDU=evUhaXkyhQ@mail.gmail.com>

Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> And get rid of the generic sync_core_before_usermode facility.
>>
>> This helper is the wrong way around I think. The idea that membarrier
>> state requires a core sync before returning to user is the easy one
>> that does not need hiding behind membarrier calls. The gap in core
>> synchronization due to x86's sysret/sysexit and lazy tlb mode, is the
>> tricky detail that is better put in x86 lazy tlb code.
>>
>> Consider if an arch did not synchronize core in switch_mm either, then
>> membarrier_mm_sync_core_before_usermode would be in the wrong place
>> but arch specific mmu context functions would still be the right place.
>> There is also a exit_lazy_tlb case that is not covered by this call, which
>> could be a bugs (kthread use mm the membarrier process's mm then context
>> switch back to the process without switching mm or lazy mm switch).
>>
>> This makes lazy tlb code a bit more modular.
> 
> The mm-switching and TLB-management has often had the regrettable
> property that it gets wired up in a way that seems to work at the time
> but doesn't have clear semantics, and I'm a bit concerned that this
> patch is in that category.

It's much more explicit in the core code about where hooks are called
after this patch. And then the x86 membarrier implementation details
are contained to the x86 code where they belong, and we don't have the
previous hook with unclear semantics missing from core code.

> If I'm understanding right, you're trying
> to enforce the property that exiting lazy TLB mode will promise to
> sync the core eventually.  But this has all kinds of odd properties:
> 
>  - Why is exit_lazy_tlb() getting called at all in the relevant cases?

It's a property of how MEMBARRIER_SYNC_CORE is implemented by arch/x86,
see the membarrier comment in finish_task_switch (for analogous reason).

>  When is it permissible to call it?

Comment for the asm-generic code says it's to be called when the lazy
active mm becomes non-lazy.

> I look at your new code and see:
> 
>> +/*
>> + * Ensure that a core serializing instruction is issued before returning
>> + * to user-mode, if a SYNC_CORE was requested. x86 implements return to
>> + * user-space through sysexit, sysrel, and sysretq, which are not core
>> + * serializing.
>> + *
>> + * See the membarrier comment in finish_task_switch as to why this is done
>> + * in exit_lazy_tlb.
>> + */
>> +#define exit_lazy_tlb exit_lazy_tlb
>> +static inline void exit_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
>> +{
>> +       /* Switching mm is serializing with write_cr3 */
>> +        if (tsk->mm != mm)
>> +                return;
> 
> And my brain says WTF?  Surely you meant something like if
> (WARN_ON_ONCE(tsk->mm != mm)) { /* egads, what even happened?  how do
> we try to recover well enough to get a crashed logged at least? */ }

No, the active mm can be unlazied by switching to a different mm.

> So this needs actual documentation, preferably in comments near the
> function, of what the preconditions are and what this mm parameter is.
> Once that's done, then we could consider whether it's appropriate to
> have this function promise to sync the core under some conditions.

It's documented in generic code. I prefer not to duplicate comments
too much but I can add a "refer to asm-generic version for usage" or
something if you'd like?

>  - This whole structure seems to rely on the idea that switching mm
> syncs something.

Which whole structure? The x86 implementation of sync core explicitly
does rely on that, yes. But I've pulled that out of core code with
this patch.

> I periodically ask chip vendor for non-serializing
> mm switches.  Specifically, in my dream world, we have totally
> separate user and kernel page tables.  Changing out the user tables
> doesn't serialize or even create a fence.  Instead it creates the
> minimum required pipeline hazard such that user memory access and
> switches to usermode will make sure they access memory through the
> correct page tables.  I haven't convinced a chip vendor yet, but there
> are quite a few hundreds of cycles to be saved here.

The fundamental difficulty is that the kernel can still access user
mappings any time after the switch. We can probably handwave ways
around it by serializing lazily when encountering the next user
access and hoping that most of your mm switches result in a kernel
exit that serializes or some other unavoidable serialize so you can
avoid the mm switch one. In practice it sounds like a lot of trouble.
But anyway the sync core could presumably be adjusted or reasoned to
still be correct, depending on how it works.

> With this in
> mind, I see the fencing aspects of the TLB handling code as somewhat
> of an accident.  I'm fine with documenting them and using them to
> optimize other paths, but I think it should be explicit.  For example:
> 
> /* Also does a full barrier?  (Or a sync_core()-style barrier.)
> However, if you rely on this, you must document it in a comment where
> you call this function. *?
> void switch_mm_irqs_off()
> {
> }
> 
> This is kind of like how we strongly encourage anyone using smp_?mb()
> to document what they are fencing against.

Hmm. I don't think anything outside core scheduler/arch code is allowed
to assume that, because they don't really know if schedule() will cause
a switch. Hopefully nobody does, I would agree it shouldn't be 
encouraged.

It is pretty fundamental to how we do task CPU migration so I don't see
it ever going away. A push model where the source CPU has to release 
tasks that it last ran before they can be run elsewhere is unworkable. 
(Or maybe it's not, but no getting around that would require careful
audits of said low level code).

> Also, as it stands, I can easily see in_irq() ceasing to promise to
> serialize.  There are older kernels for which it does not promise to
> serialize.  And I have plans to make it stop serializing in the
> nearish future.

You mean x86's return from interrupt? Sounds fun... you'll know where to
update the membarrier sync code, at least :)

Thanks,
Nick

^ permalink raw reply

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-13  4:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-arch, Arnd Bergmann, x86, linux-kernel, linux-mm,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710093556.GY4800@hirez.programming.kicks-ass.net>

Excerpts from Peter Zijlstra's message of July 10, 2020 7:35 pm:
> On Fri, Jul 10, 2020 at 11:56:46AM +1000, Nicholas Piggin wrote:
>> On big systems, the mm refcount can become highly contented when doing
>> a lot of context switching with threaded applications (particularly
>> switching between the idle thread and an application thread).
>> 
>> Abandoning lazy tlb slows switching down quite a bit in the important
>> user->idle->user cases, so so instead implement a non-refcounted scheme
>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> any remaining lazy ones.
>> 
>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>> with as many software threads as CPUs (so each switch will go in and
>> out of idle), upstream can achieve a rate of about 1 million context
>> switches per second. After this patch it goes up to 118 million.
> 
> That's mighty impressive, however:

Well, it's the usual case of "find a bouncing line and scale up the
machine size until you achieve your desired improvements" :) But we
are looking at some fundamental scalabilities and seeing if we can
improve a few things.

> 
>> +static void shoot_lazy_tlbs(struct mm_struct *mm)
>> +{
>> +	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
>> +		smp_call_function_many(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
>> +		do_shoot_lazy_tlb(mm);
>> +	}
>> +}
> 
> IIRC you (power) never clear a CPU from that mask, so for other
> workloads I can see this resulting in massive amounts of IPIs.
> 
> For instance, take as many processes as you have CPUs. For each,
> manually walk the task across all CPUs and exit. Again.
> 
> Clearly, that's an extreme, but still...

We do have some issues with that, it does tend to be very self-limiting
though, short lived tasks that can drive lots of exits won't get to run
on a lot of cores.

It's worth keeping an eye on, it may not be too hard to mitigate the IPIs
doing something dumb like collecting a queue of mms before killing a
batch of them.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 4/5] dma-mapping: add a dma_ops_bypass flag to struct device
From: Alexey Kardashevskiy @ 2020-07-13  4:59 UTC (permalink / raw)
  To: Christoph Hellwig, iommu
  Cc: Björn Töpel, Daniel Borkmann, Greg Kroah-Hartman,
	Joerg Roedel, Robin Murphy, linux-kernel, Jesper Dangaard Brouer,
	linuxppc-dev, Lu Baolu
In-Reply-To: <20200708152449.316476-5-hch@lst.de>



On 09/07/2020 01:24, Christoph Hellwig wrote:
> Several IOMMU drivers have a bypass mode where they can use a direct
> mapping if the devices DMA mask is large enough.  Add generic support
> to the core dma-mapping code to do that to switch those drivers to
> a common solution.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  include/linux/device.h |  8 +++++
>  kernel/dma/Kconfig     |  8 +++++
>  kernel/dma/mapping.c   | 74 +++++++++++++++++++++++++++++-------------
>  3 files changed, 68 insertions(+), 22 deletions(-)
> 
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 4c4af98321ebd6..1f71acf37f78d7 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -523,6 +523,11 @@ struct dev_links_info {
>   *		  sync_state() callback.
>   * @dma_coherent: this particular device is dma coherent, even if the
>   *		architecture supports non-coherent devices.
> + * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
> + *		streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
> + *		and optionall (if the coherent mask is large enough) also


s/optionall/optionally/g

Otherwise the series looks good and works well on powernv and pseries.
Thanks,



-- 
Alexey

^ permalink raw reply

* Re: [PATCH v2 0/3] Power10 basic energy management
From: Nicholas Piggin @ 2020-07-13  5:23 UTC (permalink / raw)
  To: benh, ego, linux-kernel, linuxppc-dev, mikey, mpe, paulus,
	pratik.r.sampat, Pratik Rajesh Sampat, ravi.bangoria, svaidy
In-Reply-To: <20200710052207.12003-1-psampat@linux.ibm.com>

Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
> Changelog v1 --> v2:
> 1. Save-restore DAWR and DAWRX unconditionally as they are lost in
> shallow idle states too
> 2. Rename pnv_first_spr_loss_level to pnv_first_fullstate_loss_level to
> correct naming terminology
> 
> Pratik Rajesh Sampat (3):
>   powerpc/powernv/idle: Exclude mfspr on HID1,4,5 on P9 and above
>   powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
>   powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable
> 
>  arch/powerpc/platforms/powernv/idle.c | 34 +++++++++++++++++----------
>  1 file changed, 22 insertions(+), 12 deletions(-)

These look okay to me, but the CPU_FTR_ARCH_300 test for 
pnv_power9_idle_init() is actually wrong, it should be a PVR test 
because idle is not completely architected (not even shallow stop 
states, unfortunately).

It doesn't look like we support POWER10 idle correctly yet, and on older
kernels it wouldn't work even if we fixed newer, so ideally the PVR 
check would be backported as a fix in the front of the series.

Sadly, we have no OPAL idle driver yet. Hopefully we will before the
next processor shows up :P

Thanks,
Nick

^ permalink raw reply

* Re: [v3 1/5] KVM: PPC: Book3S HV: Disable page merging in H_SVM_INIT_START
From: Bharata B Rao @ 2020-07-13  5:29 UTC (permalink / raw)
  To: Ram Pai
  Cc: ldufour, cclaudio, kvm-ppc, sathnaga, aneesh.kumar, sukadev,
	linuxppc-dev, bauerman, david
In-Reply-To: <1594458827-31866-2-git-send-email-linuxram@us.ibm.com>

On Sat, Jul 11, 2020 at 02:13:43AM -0700, Ram Pai wrote:
> Merging of pages associated with each memslot of an SVM is
> disabled when the page is migrated in the H_SVM_PAGE_IN handler.
> 
> This operation should have been done much earlier; the moment the VM
> is initiated for secure-transition. Delaying this operation increases
> the probability for those pages to acquire new references, making it
> impossible to migrate those pages in the H_SVM_PAGE_IN handler.
> 
> Disable page-merging in H_SVM_INIT_START handling.

While it is a good idea to disable KSM merging for all VMAs during
H_SVM_INIT_START, I am curious if you did observe an actual case of
ksm_madvise() failing which resulted in subsequent H_SVM_PAGE_IN
failing to migrate?

> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/kvm/book3s_hv_uvmem.c | 96 +++++++++++++++++++++++++++++---------
>  1 file changed, 74 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
> index 3d987b1..bfc3841 100644
> --- a/arch/powerpc/kvm/book3s_hv_uvmem.c
> +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
> @@ -211,6 +211,65 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm,
>  	return false;
>  }
>  
> +static int kvmppc_memslot_page_merge(struct kvm *kvm,
> +		struct kvm_memory_slot *memslot, bool merge)
> +{
> +	unsigned long gfn = memslot->base_gfn;
> +	unsigned long end, start = gfn_to_hva(kvm, gfn);
> +	int ret = 0;
> +	struct vm_area_struct *vma;
> +	int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
> +
> +	if (kvm_is_error_hva(start))
> +		return H_STATE;

This and other cases below seem to be a new return value from
H_SVM_INIT_START. Maybe update the documentation too along with
this patch?

> +
> +	end = start + (memslot->npages << PAGE_SHIFT);
> +
> +	down_write(&kvm->mm->mmap_sem);

When you rebase the patches against latest upstream you may want to
replace the above and other instances by mmap_write/read_lock().

> +	do {
> +		vma = find_vma_intersection(kvm->mm, start, end);
> +		if (!vma) {
> +			ret = H_STATE;
> +			break;
> +		}
> +		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
> +			  merge_flag, &vma->vm_flags);
> +		if (ret) {
> +			ret = H_STATE;
> +			break;
> +		}
> +		start = vma->vm_end + 1;
> +	} while (end > vma->vm_end);
> +
> +	up_write(&kvm->mm->mmap_sem);
> +	return ret;
> +}
> +
> +static int __kvmppc_page_merge(struct kvm *kvm, bool merge)
> +{
> +	struct kvm_memslots *slots;
> +	struct kvm_memory_slot *memslot;
> +	int ret = 0;
> +
> +	slots = kvm_memslots(kvm);
> +	kvm_for_each_memslot(memslot, slots) {
> +		ret = kvmppc_memslot_page_merge(kvm, memslot, merge);
> +		if (ret)
> +			break;
> +	}
> +	return ret;
> +}
> +
> +static inline int kvmppc_disable_page_merge(struct kvm *kvm)
> +{
> +	return __kvmppc_page_merge(kvm, false);
> +}
> +
> +static inline int kvmppc_enable_page_merge(struct kvm *kvm)
> +{
> +	return __kvmppc_page_merge(kvm, true);
> +}
> +
>  unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>  {
>  	struct kvm_memslots *slots;
> @@ -232,11 +291,18 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>  		return H_AUTHORITY;
>  
>  	srcu_idx = srcu_read_lock(&kvm->srcu);
> +
> +	/* disable page-merging for all memslot */
> +	ret = kvmppc_disable_page_merge(kvm);
> +	if (ret)
> +		goto out;
> +
> +	/* register the memslot */
>  	slots = kvm_memslots(kvm);
>  	kvm_for_each_memslot(memslot, slots) {
>  		if (kvmppc_uvmem_slot_init(kvm, memslot)) {
>  			ret = H_PARAMETER;
> -			goto out;
> +			break;
>  		}
>  		ret = uv_register_mem_slot(kvm->arch.lpid,
>  					   memslot->base_gfn << PAGE_SHIFT,
> @@ -245,9 +311,12 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm)
>  		if (ret < 0) {
>  			kvmppc_uvmem_slot_free(kvm, memslot);
>  			ret = H_PARAMETER;
> -			goto out;
> +			break;
>  		}
>  	}
> +
> +	if (ret)
> +		kvmppc_enable_page_merge(kvm);

Is there any use of enabling KSM merging in the failure path here?
Won't UV terminate the VM if H_SVM_INIT_START fails? If there is no need,
you can do away with some extra routines above.

>  out:
>  	srcu_read_unlock(&kvm->srcu, srcu_idx);
>  	return ret;
> @@ -384,7 +453,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
>   */
>  static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>  		   unsigned long end, unsigned long gpa, struct kvm *kvm,
> -		   unsigned long page_shift, bool *downgrade)
> +		   unsigned long page_shift)
>  {
>  	unsigned long src_pfn, dst_pfn = 0;
>  	struct migrate_vma mig;
> @@ -400,18 +469,6 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
>  	mig.src = &src_pfn;
>  	mig.dst = &dst_pfn;
>  
> -	/*
> -	 * We come here with mmap_sem write lock held just for
> -	 * ksm_madvise(), otherwise we only need read mmap_sem.
> -	 * Hence downgrade to read lock once ksm_madvise() is done.
> -	 */
> -	ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
> -			  MADV_UNMERGEABLE, &vma->vm_flags);

I haven't seen the subsequent patches yet, but I guess you are
taking care of disabling KSM merging for hot-plugged memory too.

Regards,
Bharata.

^ permalink raw reply

* [PATCH v6] powerpc/fadump: fix race between pstore write and fadump crash trigger
From: Sourabh Jain @ 2020-07-13  5:24 UTC (permalink / raw)
  To: mpe; +Cc: mahesh, linux-kernel, hbathini, linuxppc-dev

When we enter into fadump crash path via system reset we fail to update
the pstore.

On the system reset path we first update the pstore and then go for the
fadump crash. The problem is that when all the CPUs try to get the pstore
lock to initiate the pstore write, only one CPU will acquire the lock
and proceed with the pstore write. Since this is NMI context, CPUs that
fail to get the lock do not wait for their turn to write to the pstore and
simply proceed with the next operation, which is the fadump crash. One of
the CPUs that proceeded on the fadump crash path triggers the crash and does
not wait for the CPU that holds the pstore lock to complete the pstore update.

Timeline diagram depicting the sequence of events that leads to an
unsuccessful pstore update when we hit the fadump crash path via system reset.

                 1    2     3    ...      n   CPU Threads
                 |    |     |             |
                 |    |     |             |
 Reached to   -->|--->|---->| ----------->|
 system reset    |    |     |             |
 path            |    |     |             |
                 |    |     |             |
 Try to       -->|--->|---->|------------>|
 acquire the     |    |     |             |
 pstore lock     |    |     |             |
                 |    |     |             |
                 |    |     |             |
 Got the      -->| +->|     |             |<-+
 pstore lock     | |  |     |             |  |-->  Didn't get the
                 | --------------------------+     lock and moving
                 |    |     |             |        ahead on fadump
                 |    |     |             |        crash path
                 |    |     |             |
  Begins the  -->|    |     |             |
  process to     |    |     |             |<-- Got the chance to
  update the     |    |     |             |    trigger the crash
  pstore         | -> |     |    ... <-   |
                 | |  |     |         |   |
                 | |  |     |         |   |<-- Triggers the
                 | |  |     |         |   |    crash
                 | |  |     |         |   |      ^
                 | |  |     |         |   |      |
  Writing to  -->| |  |     |         |   |      |
  pstore         | |  |     |         |   |      |
                   |                  |          |
       ^           |__________________|          |
       |               CPU Relax                 |
       |                                         |
       +-----------------------------------------+
                          |
                          v
            Race: crash triggered before pstore
                  update completes

To avoid this race condition a barrier is added on the crash_fadump path; it
prevents the CPU from triggering the crash until all the online CPUs complete
their task.

A barrier is added to make sure all the secondary CPUs hit the
crash_fadump function before we initiate the crash. A timeout is kept to
ensure the primary CPU (the one that initiates the crash) does not wait for
secondary CPUs indefinitely.

Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
---
 arch/powerpc/kernel/fadump.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

---
Changelog:

v1 -> v3:
   - https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/208267.html

v3 -> v4:

   - Now the primary CPU (one who triggers dump) waits for all secondary
     CPUs to enter and then initiates the crash.

v4 -> v5:
    - Fixed a build failure reported by kernel test robot <lkp at intel.com>
      Now the cpus_in_crash variable is defined outside CONFIG_CMA
      config option.

v5 -> v6
    - Changed a variable name cpus_in_crash -> cpus_in_fadump.
---

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 78ab9a6ee6ac..1858896d6809 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -32,11 +32,20 @@
 #include <asm/fadump-internal.h>
 #include <asm/setup.h>
 
+/*
+ * The CPU who acquired the lock to trigger the fadump crash should
+ * wait for other CPUs to enter.
+ *
+ * The timeout is in milliseconds.
+ */
+#define CRASH_TIMEOUT		500
+
 static struct fw_dump fw_dump;
 
 static void __init fadump_reserve_crash_area(u64 base);
 
 struct kobject *fadump_kobj;
+static atomic_t cpus_in_fadump;
 
 #ifndef CONFIG_PRESERVE_FA_DUMP
 static DEFINE_MUTEX(fadump_mutex);
@@ -668,8 +677,11 @@ early_param("fadump_reserve_mem", early_fadump_reserve_mem);
 
 void crash_fadump(struct pt_regs *regs, const char *str)
 {
+	unsigned int msecs;
 	struct fadump_crash_info_header *fdh = NULL;
 	int old_cpu, this_cpu;
+	/* Do not include first CPU */
+	unsigned int ncpus = num_online_cpus() - 1;
 
 	if (!should_fadump_crash())
 		return;
@@ -685,6 +697,8 @@ void crash_fadump(struct pt_regs *regs, const char *str)
 	old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu);
 
 	if (old_cpu != -1) {
+		atomic_inc(&cpus_in_fadump);
+
 		/*
 		 * We can't loop here indefinitely. Wait as long as fadump
 		 * is in force. If we race with fadump un-registration this
@@ -708,6 +722,16 @@ void crash_fadump(struct pt_regs *regs, const char *str)
 
 	fdh->online_mask = *cpu_online_mask;
 
+	/*
+	 * If we came in via system reset, wait a while for the secondary
+	 * CPUs to enter.
+	 */
+	if (TRAP(&(fdh->regs)) == 0x100) {
+		msecs = CRASH_TIMEOUT;
+		while ((atomic_read(&cpus_in_fadump) < ncpus) && (--msecs > 0))
+			mdelay(1);
+	}
+
 	fw_dump.ops->fadump_trigger(fdh, str);
 }
 
-- 
2.25.4


^ permalink raw reply related

* Re: [PATCH v2 2/3] powerpc/powernv/idle: save-restore DAWR0, DAWRX0 for P10
From: Nicholas Piggin @ 2020-07-13  5:52 UTC (permalink / raw)
  To: benh, ego, linux-kernel, linuxppc-dev, mikey, mpe, paulus,
	pratik.r.sampat, Pratik Rajesh Sampat, ravi.bangoria, svaidy
In-Reply-To: <20200710052207.12003-3-psampat@linux.ibm.com>

Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
> Additional registers DAWR0, DAWRX0 may be lost on Power 10 for
> stop levels < 4.
> Therefore save the values of these SPRs before entering a  "stop"
> state and restore their values on wakeup.

Hmm, where do you get this from? Documentation I see says DAWR is lost
on POWER9 but not P10.

Does idle thread even need to save DAWR, or does it get switched when
going to a thread that has a watchpoint set?

Thanks,
Nick

> 
> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/idle.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
> index 19d94d021357..f2e2a6a4c274 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -600,6 +600,8 @@ struct p9_sprs {
>  	u64 iamr;
>  	u64 amor;
>  	u64 uamor;
> +	u64 dawr0;
> +	u64 dawrx0;
>  };
>  
>  static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
> @@ -687,6 +689,10 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>  	sprs.iamr	= mfspr(SPRN_IAMR);
>  	sprs.amor	= mfspr(SPRN_AMOR);
>  	sprs.uamor	= mfspr(SPRN_UAMOR);
> +	if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> +		sprs.dawr0 = mfspr(SPRN_DAWR0);
> +		sprs.dawrx0 = mfspr(SPRN_DAWRX0);
> +	}
>  
>  	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
>  
> @@ -710,6 +716,10 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
>  		mtspr(SPRN_IAMR,	sprs.iamr);
>  		mtspr(SPRN_AMOR,	sprs.amor);
>  		mtspr(SPRN_UAMOR,	sprs.uamor);
> +		if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> +			mtspr(SPRN_DAWR0, sprs.dawr0);
> +			mtspr(SPRN_DAWRX0, sprs.dawrx0);
> +		}
>  
>  		/*
>  		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
> -- 
> 2.25.4
> 
> 

^ permalink raw reply

* Re: [PATCH v2 1/3] powerpc/powernv/idle: Exclude mfspr on HID1, 4, 5 on P9 and above
From: Nicholas Piggin @ 2020-07-13  5:53 UTC (permalink / raw)
  To: benh, ego, linux-kernel, linuxppc-dev, mikey, mpe, paulus,
	pratik.r.sampat, Pratik Rajesh Sampat, ravi.bangoria, svaidy
In-Reply-To: <20200710052207.12003-2-psampat@linux.ibm.com>

Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
> POWER9 onwards the support for the registers HID1, HID4, HID5 has been
> receded.
> Although mfspr on the above registers worked on Power9, on the Power10
> simulator it is unrecognized. Move their assignment under the
> check for machines older than Power9.

Seems like a good fix.

Thanks,
Nick

> 
> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/powernv/idle.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
> index 2dd467383a88..19d94d021357 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -73,9 +73,6 @@ static int pnv_save_sprs_for_deep_states(void)
>  	 */
>  	uint64_t lpcr_val	= mfspr(SPRN_LPCR);
>  	uint64_t hid0_val	= mfspr(SPRN_HID0);
> -	uint64_t hid1_val	= mfspr(SPRN_HID1);
> -	uint64_t hid4_val	= mfspr(SPRN_HID4);
> -	uint64_t hid5_val	= mfspr(SPRN_HID5);
>  	uint64_t hmeer_val	= mfspr(SPRN_HMEER);
>  	uint64_t msr_val = MSR_IDLE;
>  	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
> @@ -117,6 +114,9 @@ static int pnv_save_sprs_for_deep_states(void)
>  
>  			/* Only p8 needs to set extra HID regiters */
>  			if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
> +				uint64_t hid1_val = mfspr(SPRN_HID1);
> +				uint64_t hid4_val = mfspr(SPRN_HID4);
> +				uint64_t hid5_val = mfspr(SPRN_HID5);
>  
>  				rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
>  				if (rc != 0)
> -- 
> 2.25.4
> 
> 

^ permalink raw reply

* [PATCH kernel] powerpc/dma: Fallback to dma_ops when persistent memory present
From: Alexey Kardashevskiy @ 2020-07-13  6:23 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Leonardo Bras, Brian J King, Alexey Kardashevskiy,
	Aneesh Kumar K . V, Christoph Hellwig, Wen Xiong

So far we have been using huge DMA windows to map all the RAM available.
The RAM is normally mapped to the VM address space contiguously, and
there is always a reasonable upper limit for possible future hot plugged
RAM which makes it easy to map all RAM via IOMMU.

Now there is persistent memory ("ibm,pmemory" in the FDT) which (unlike
normal RAM) can map anywhere in the VM space beyond the maximum RAM size
and since it can be used for DMA, it requires extending the huge window
up to MAX_PHYSMEM_BITS which requires hypervisor support for:
1. huge TCE tables;
2. multilevel TCE tables;
3. huge IOMMU pages.

Certain hypervisors cannot do either so the only option left is
restricting the huge DMA window to include only RAM and fallback to
the default DMA window for persistent memory.

This checks if the system has persistent memory. If it does not,
the DMA bypass mode is selected, i.e.
* dev->bus_dma_limit = 0
* dev->dma_ops_bypass = true <- this avoids calling dma_ops for mapping.

If there is such memory, this creates identity mapping only for RAM and
disables the DMA bypass mode which makes generic DMA code use indirect
dma_ops which may have performance impact:
* dev->bus_dma_limit = bus_offset + max_ram_size
  for example 0x0800.0000.8000.0000 for a 2GB VM
* dev->dma_ops_bypass = false <- this forces indirect calls to dma_ops for
  every mapping which then directs these to small or huge window.

This should not change the existing behaviour when there is no persistent
memory.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---

This is based on
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385

Leonardo, this makes
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=187348
not apply but the conflict should be quite trivial to resolve
(do not rush, let me finish reviewing this first). Cheers,



---
 arch/powerpc/kernel/dma-iommu.c        | 68 +++++++++++++++++++++++++-
 arch/powerpc/platforms/pseries/iommu.c | 41 +++++++++++++---
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 569fecd7b5b2..9fe5f0aefa9d 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -10,6 +10,16 @@
 #include <linux/pci.h>
 #include <asm/iommu.h>
 
+static inline bool can_map_direct(struct device *dev, phys_addr_t addr)
+{
+	return dev->bus_dma_limit >= phys_to_dma(dev, addr);
+}
+
+static inline bool dma_handle_direct(struct device *dev, dma_addr_t dma_handle)
+{
+	return dma_handle >= dev->archdata.dma_offset;
+}
+
 /*
  * Generic iommu implementation
  */
@@ -44,6 +54,12 @@ static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
 				     enum dma_data_direction direction,
 				     unsigned long attrs)
 {
+	if (dev->bus_dma_limit &&
+	    can_map_direct(dev, (phys_addr_t) page_to_phys(page) +
+			   offset + size))
+		return dma_direct_map_page(dev, page, offset, size, direction,
+					   attrs);
+
 	return iommu_map_page(dev, get_iommu_table_base(dev), page, offset,
 			      size, dma_get_mask(dev), direction, attrs);
 }
@@ -53,6 +69,12 @@ static void dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
 				 size_t size, enum dma_data_direction direction,
 				 unsigned long attrs)
 {
+	if (dev->bus_dma_limit &&
+	    dma_handle_direct(dev, dma_handle + size)) {
+		dma_direct_unmap_page(dev, dma_handle, size, direction, attrs);
+		return;
+	}
+
 	iommu_unmap_page(get_iommu_table_base(dev), dma_handle, size, direction,
 			 attrs);
 }
@@ -62,6 +84,22 @@ static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 			    int nelems, enum dma_data_direction direction,
 			    unsigned long attrs)
 {
+	if (dev->bus_dma_limit) {
+		struct scatterlist *s;
+		bool direct = true;
+		int i;
+
+		for_each_sg(sglist, s, nelems, i) {
+			direct = can_map_direct(dev,
+					sg_phys(s) + s->offset + s->length);
+			if (!direct)
+				break;
+		}
+		if (direct)
+			return dma_direct_map_sg(dev, sglist, nelems, direction,
+						 attrs);
+	}
+
 	return ppc_iommu_map_sg(dev, get_iommu_table_base(dev), sglist, nelems,
 				dma_get_mask(dev), direction, attrs);
 }
@@ -70,6 +108,24 @@ static void dma_iommu_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		int nelems, enum dma_data_direction direction,
 		unsigned long attrs)
 {
+	if (dev->bus_dma_limit) {
+		struct scatterlist *s;
+		bool direct = true;
+		int i;
+
+		for_each_sg(sglist, s, nelems, i) {
+			direct = dma_handle_direct(dev,
+						   s->dma_address + s->length);
+			if (!direct)
+				break;
+		}
+		if (direct) {
+			dma_direct_unmap_sg(dev, sglist, nelems, direction,
+					    attrs);
+			return;
+		}
+	}
+
 	ppc_iommu_unmap_sg(get_iommu_table_base(dev), sglist, nelems,
 			   direction, attrs);
 }
@@ -90,8 +146,16 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
-		dev->dma_ops_bypass = true;
-		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+		/*
+		 * dma_iommu_bypass_supported() sets dma_max when there is
+		 * 1:1 mapping but it is somehow limited.
+		 * ibm,pmemory is one example.
+		 */
+		dev->dma_ops_bypass = dev->bus_dma_limit == 0;
+		if (!dev->dma_ops_bypass)
+			dev_warn(dev, "iommu: 64-bit OK but using default ops\n");
+		else
+			dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 6d47b4a3ce39..1996f83021fe 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -816,7 +816,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 			np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn)
+static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
 {
 	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
@@ -828,6 +828,7 @@ static u64 find_existing_ddw(struct device_node *pdn)
 		if (window->device == pdn) {
 			direct64 = window->prop;
 			dma_addr = be64_to_cpu(direct64->dma_base);
+			*window_shift = be32_to_cpu(direct64->window_shift);
 			break;
 		}
 	}
@@ -990,11 +991,13 @@ static phys_addr_t ddw_memory_hotplug_max(void)
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-	int len, ret;
+	int len = 0, ret;
+	bool pmem_present = of_find_node_by_type(NULL, "ibm,pmemory") != NULL;
+	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
-	u64 dma_addr, max_addr;
+	u64 dma_addr;
 	struct device_node *dn;
 	u32 ddw_avail[3];
 	struct direct_window *window;
@@ -1004,7 +1007,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	mutex_lock(&direct_window_init_mutex);
 
-	dma_addr = find_existing_ddw(pdn);
+	dma_addr = find_existing_ddw(pdn, &len);
 	if (dma_addr != 0)
 		goto out_unlock;
 
@@ -1066,14 +1069,27 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
-	max_addr = ddw_memory_hotplug_max();
-	if (query.largest_available_block < (max_addr >> page_shift)) {
+	/*
+	 * The "ibm,pmemory" can appear anywhere in the address space.
+	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
+	 * for the upper limit and fallback to max RAM otherwise but this
+	 * disables device::dma_ops_bypass.
+	 */
+	len = max_ram_len;
+	if (pmem_present) {
+		if (query.largest_available_block >=
+		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
+			len = MAX_PHYSMEM_BITS - page_shift;
+		else
+			dev_info(&dev->dev, "Skipping ibm,pmemory");
+	}
+
+	if (query.largest_available_block < (1ULL << (len - page_shift))) {
 		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
-			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
+			  "%llu-sized pages\n", 1ULL << len, query.largest_available_block,
 			  1ULL << page_shift);
 		goto out_failed;
 	}
-	len = order_base_2(max_addr);
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
@@ -1151,6 +1167,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
+
+	/*
+	 * If we have persistent memory and the window size is only as big
+	 * as RAM, then we failed to create a window to cover persistent
+	 * memory and need to set the DMA limit.
+	 */
+	if (pmem_present && dma_addr && (len == max_ram_len))
+		dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+
 	return dma_addr;
 }
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Alexey Kardashevskiy @ 2020-07-13  7:33 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200703061844.111865-5-leobras.c@gmail.com>



On 03/07/2020 16:18, Leonardo Bras wrote:
> On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
> default DMA window for the device, before attempting to configure a DDW,
> in order to make the maximum resources available for the next DDW to be
> created.
> 
> This is a requirement for using DDW on devices in which hypervisor
> allows only one DMA window.
> 
> If setting up a new DDW fails anywhere after the removal of this
> default DMA window, it's needed to restore the default DMA window.
> For this, an implementation of ibm,reset-pe-dma-windows rtas call is
> needed:
> 
> Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> ibm,ddw-extensions. The first extension available (index 2) carries the
> token for ibm,reset-pe-dma-windows rtas call, which is used to restore
> the default DMA window for a device, if it has been deleted.
> 
> It does so by resetting the TCE table allocation for the PE to its
> boot time value, available in "ibm,dma-window" device tree node.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 83 +++++++++++++++++++++-----
>  1 file changed, 69 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 4e33147825cc..5b520ac354c6 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1066,6 +1066,38 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>  	return max_addr;
>  }
>  
> +/*
> + * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> + * ibm,ddw-extensions, which carries the rtas token for
> + * ibm,reset-pe-dma-windows.
> + * That rtas-call can be used to restore the default DMA window for the device.
> + */
> +static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> +{
> +	int ret;
> +	u32 cfg_addr, reset_dma_win;
> +	u64 buid;
> +	struct device_node *dn;
> +	struct pci_dn *pdn;
> +
> +	ret = ddw_read_ext(par_dn, DDW_EXT_RESET_DMA_WIN, &reset_dma_win);
> +	if (ret)
> +		return;
> +
> +	dn = pci_device_to_OF_node(dev);
> +	pdn = PCI_DN(dn);
> +	buid = pdn->phb->buid;
> +	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
> +
> +	ret = rtas_call(reset_dma_win, 3, 1, NULL, cfg_addr, BUID_HI(buid),
> +			BUID_LO(buid));
> +	if (ret)
> +		dev_info(&dev->dev,
> +			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
> +			 reset_dma_win, cfg_addr, BUID_HI(buid), BUID_LO(buid),
> +			 ret);
> +}
> +
>  /*
>   * If the PE supports dynamic dma windows, and there is space for a table
>   * that can map all pages in a linear offset, then setup such a table,
> @@ -1079,7 +1111,7 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>   */
>  static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  {
> -	int len, ret;
> +	int len, ret, reset_win_ext;

Make it "reset_token".

>  	struct ddw_query_response query;
>  	struct ddw_create_response create;
>  	int page_shift;
> @@ -1087,7 +1119,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	struct device_node *dn;
>  	u32 ddw_avail[DDW_APPLICABLE_SIZE];
>  	struct direct_window *window;
> -	struct property *win64;
> +	struct property *win64, *default_win = NULL;
>  	struct dynamic_dma_window_prop *ddwprop;
>  	struct failed_ddw_pdn *fpdn;
>  
> @@ -1122,7 +1154,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	if (ret)
>  		goto out_failed;
>  
> -       /*
> +	/*
>  	 * Query if there is a second window of size to map the
>  	 * whole partition.  Query returns number of windows, largest
>  	 * block assigned to PE (partition endpoint), and two bitmasks
> @@ -1133,14 +1165,34 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	if (ret != 0)
>  		goto out_failed;
>  
> +	/*
> +	 * If there is no window available, remove the default DMA window,
> +	 * if it's present. This will make all the resources available to the
> +	 * new DDW window.
> +	 * If anything fails after this, we need to restore it, so also check
> +	 * for extensions presence.
> +	 */
>  	if (query.windows_available == 0) {
> -		/*
> -		 * no additional windows are available for this device.
> -		 * We might be able to reallocate the existing window,
> -		 * trading in for a larger page size.
> -		 */
> -		dev_dbg(&dev->dev, "no free dynamic windows");
> -		goto out_failed;
> +		default_win = of_find_property(pdn, "ibm,dma-window", NULL);
> +		if (!default_win)
> +			goto out_failed;
> +
> +		reset_win_ext = ddw_read_ext(pdn, DDW_EXT_RESET_DMA_WIN, NULL);
> +		if (reset_win_ext)
> +			goto out_failed;
> +
> +		remove_dma_window(pdn, ddw_avail, default_win);
> +
> +		/* Query again, to check if the window is available */
> +		ret = query_ddw(dev, ddw_avail, &query, pdn);
> +		if (ret != 0)
> +			goto out_restore_defwin;
> +
> +		if (query.windows_available == 0) {
> +			/* no windows are available for this device. */
> +			dev_dbg(&dev->dev, "no free dynamic windows");
> +			goto out_restore_defwin;
> +		}
>  	}
>  	if (query.page_size & 4) {
>  		page_shift = 24; /* 16MB */
> @@ -1151,7 +1203,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	} else {
>  		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
>  			  query.page_size);
> -		goto out_failed;
> +		goto out_restore_defwin;
>  	}
>  	/* verify the window * number of ptes will map the partition */
>  	/* check largest block * page size > max memory hotplug addr */
> @@ -1160,14 +1212,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
>  			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
>  			  1ULL << page_shift);
> -		goto out_failed;
> +		goto out_restore_defwin;
>  	}
>  	len = order_base_2(max_addr);
>  	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
>  	if (!win64) {
>  		dev_info(&dev->dev,
>  			"couldn't allocate property for 64bit dma window\n");
> -		goto out_failed;
> +		goto out_restore_defwin;
>  	}
>  	win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
>  	win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
> @@ -1230,8 +1282,11 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	kfree(win64->value);
>  	kfree(win64);
>  
> -out_failed:
> +out_restore_defwin:
> +	if (default_win && reset_win_ext == 0)


reset_win_ext potentially may be uninitialized here. Yeah I know it is
tied to default_win but still.

After looking at this function for a few minutes, it could use some
refactoring (way too many gotos)  such as:

1. move (query.page_size & xx) checks before "if
(query.windows_available == 0)"

2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
"if (query.windows_available == 0)"

3. call "reset_dma_window(dev, pdn)" inside the "if
(query.windows_available == 0)" branch.

Then you can drop all "goto out_restore_defwin" and move default_win and
reset_win_ext inside "if (query.windows_available == 0)".

The rest of the series is good as it is, however it may conflict with
https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200713062348.100552-1-aik@ozlabs.ru/
and the patchset it is made on top of -
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385 .
thanks,


> +		reset_dma_window(dev, pdn);
>  
> +out_failed:
>  	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
>  	if (!fpdn)
>  		goto out_unlock;
> 

-- 
Alexey

^ permalink raw reply

* [PATCH] powerpc/boot: add DTB to 'targets'
From: Masahiro Yamada @ 2020-07-13  7:56 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Christophe Leroy, Arnd Bergmann, Masahiro Yamada, Michal Simek,
	linux-kernel, Paul Mackerras, linuxppc-dev

PowerPC always re-builds DTB even if nothing has been changed.

As for other architectures, arch/*/boot/dts/Makefile builds DTB by
using the dtb-y syntax.

In contrast, arch/powerpc/boot/dts/(fsl/)Makefile does nothing unless
CONFIG_OF_ALL_DTBS is defined. Instead, arch/powerpc/boot/Makefile
builds DTB on demand. You need to add DTB to 'targets' explicitly
so .*.cmd files are included.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---

I want to apply this to kbuild tree because this is needed
to fix the build error caused by another kbuild patch:

https://lkml.org/lkml/2020/7/7/134


 arch/powerpc/boot/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 63d7456b9518..8792323707fd 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -366,6 +366,8 @@ initrd-y := $(patsubst zImage%, zImage.initrd%, \
 		$(patsubst treeImage%, treeImage.initrd%, $(image-y)))))
 initrd-y := $(filter-out $(image-y), $(initrd-y))
 targets	+= $(image-y) $(initrd-y)
+targets += $(foreach x, dtbImage uImage cuImage simpleImage treeImage, \
+		$(patsubst $(x).%, dts/%.dtb, $(filter $(x).%, $(image-y))))
 
 $(addprefix $(obj)/, $(initrd-y)): $(obj)/ramdisk.image.gz
 
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH 01/15] powernv/pci: Add pci_bus_to_pnvhb() helper
From: Alexey Kardashevskiy @ 2020-07-13  8:28 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-2-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> Add a helper to go from a pci_bus structure to the pnv_phb that hosts that
> bus. There's a lot of instances of the following pattern:
> 
> 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> 	struct pnv_phb *phb = hose->private_data;
> 
> Without any other uses of the pci_controller inside the function. This is
> hard to read since it requires you to memorise the contents of the
> private data fields and kind of error prone since it involves blindly
> assigning a void pointer. Add a helper to make it more concise and
> explicit.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 88 +++++++----------------
>  arch/powerpc/platforms/powernv/pci.c      | 14 ++--
>  arch/powerpc/platforms/powernv/pci.h      | 10 +++
>  3 files changed, 38 insertions(+), 74 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 31c3e6d58c41..687919db0347 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -252,8 +252,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
>  static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
>  					 unsigned long *pe_bitmap)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct resource *r;
>  	resource_size_t base, sgsz, start, end;
>  	int segno, i;
> @@ -351,8 +350,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
>  
>  static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pnv_ioda_pe *master_pe, *pe;
>  	unsigned long size, *pe_alloc;
>  	int i;
> @@ -673,8 +671,7 @@ struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
>  
>  struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(dev);
>  
>  	if (!pdn)
> @@ -1069,8 +1066,7 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
>  
>  static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(dev);
>  	struct pnv_ioda_pe *pe;
>  
> @@ -1129,8 +1125,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>   */
>  static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pnv_ioda_pe *pe = NULL;
>  	unsigned int pe_num;
>  
> @@ -1196,8 +1191,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
>  	struct pnv_ioda_pe *pe;
>  	struct pci_dev *gpu_pdev;
>  	struct pci_dn *npu_pdn;
> -	struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus);
>  
>  	/*
>  	 * Intentionally leak a reference on the npu device (for
> @@ -1300,16 +1294,12 @@ static void pnv_pci_ioda_setup_nvlink(void)
>  #ifdef CONFIG_PCI_IOV
>  static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pci_dn         *pdn;
>  	int                    i, j;
>  	int                    m64_bars;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  
>  	if (pdn->m64_single_mode)
> @@ -1333,8 +1323,6 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>  
>  static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pci_dn         *pdn;
>  	unsigned int           win;
> @@ -1346,9 +1334,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  	int                    pe_num;
>  	int                    m64_bars;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  	total_vfs = pci_sriov_get_totalvfs(pdev);
>  
> @@ -1459,15 +1445,11 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
>  
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pnv_ioda_pe    *pe, *pe_n;
>  	struct pci_dn         *pdn;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  
>  	if (!pdev->is_physfn)
> @@ -1492,16 +1474,12 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  
>  static void pnv_pci_sriov_disable(struct pci_dev *pdev)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pnv_ioda_pe    *pe;
>  	struct pci_dn         *pdn;
>  	u16                    num_vfs, i;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  	num_vfs = pdn->num_vfs;
>  
> @@ -1535,17 +1513,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  				       struct pnv_ioda_pe *pe);
>  static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pnv_ioda_pe    *pe;
>  	int                    pe_num;
>  	u16                    vf_index;
>  	struct pci_dn         *pdn;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  
>  	if (!pdev->is_physfn)
> @@ -1572,7 +1546,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  		pe->rid = (vf_bus << 8) | vf_devfn;
>  
>  		pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
> -			hose->global_number, pdev->bus->number,
> +			pci_domain_nr(pdev->bus), pdev->bus->number,
>  			PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
>  
>  		if (pnv_ioda_configure_pe(phb, pe)) {
> @@ -1602,17 +1576,13 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
>  
>  static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  {
> -	struct pci_bus        *bus;
> -	struct pci_controller *hose;
>  	struct pnv_phb        *phb;
>  	struct pnv_ioda_pe    *pe;
>  	struct pci_dn         *pdn;
>  	int                    ret;
>  	u16                    i;
>  
> -	bus = pdev->bus;
> -	hose = pci_bus_to_host(bus);
> -	phb = hose->private_data;
> +	phb = pci_bus_to_pnvhb(pdev->bus);
>  	pdn = pci_get_pdn(pdev);
>  
>  	if (phb->type == PNV_PHB_IODA2) {
> @@ -1735,8 +1705,7 @@ static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  
>  static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	struct pnv_ioda_pe *pe;
>  
> @@ -1847,8 +1816,7 @@ static int pnv_pci_ioda_dma_64bit_bypass(struct pnv_ioda_pe *pe)
>  static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
>  		u64 dma_mask)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	struct pnv_ioda_pe *pe;
>  
> @@ -2766,8 +2734,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
>  #ifdef CONFIG_PCI_IOV
>  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
>  	struct resource *res;
>  	int i;
> @@ -3101,10 +3068,9 @@ static void pnv_pci_ioda_fixup(void)
>  static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
>  						unsigned long type)
>  {
> -	struct pci_dev *bridge;
> -	struct pci_controller *hose = pci_bus_to_host(bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	int num_pci_bridges = 0;
> +	struct pci_dev *bridge;
>  
>  	bridge = bus->self;
>  	while (bridge) {
> @@ -3190,8 +3156,7 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
>  
>  static void pnv_pci_configure_bus(struct pci_bus *bus)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pci_dev *bridge = bus->self;
>  	struct pnv_ioda_pe *pe;
>  	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
> @@ -3237,8 +3202,7 @@ static resource_size_t pnv_pci_default_alignment(void)
>  static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>  						      int resno)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	resource_size_t align;
>  
> @@ -3274,8 +3238,7 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
>   */
>  static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
>  	struct pci_dn *pdn;
>  
>  	/* The function is probably called while the PEs have
> @@ -3488,8 +3451,7 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
>  
>  static void pnv_pci_release_device(struct pci_dev *pdev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct pci_dn *pdn = pci_get_pdn(pdev);
>  	struct pnv_ioda_pe *pe;
>  
> @@ -3534,8 +3496,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
>  
>  static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
>  {
> -	struct pci_controller *hose = bus->sysdata;
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pnv_ioda_pe *pe;
>  
>  	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
> @@ -3873,8 +3834,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
>  
>  static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(dev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
>  
>  	if (!machine_is(powernv))
>  		return;
> diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
> index 091fe1cf386b..9b9bca169275 100644
> --- a/arch/powerpc/platforms/powernv/pci.c
> +++ b/arch/powerpc/platforms/powernv/pci.c
> @@ -162,8 +162,7 @@ EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
>  
>  int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct msi_desc *entry;
>  	struct msi_msg msg;
>  	int hwirq;
> @@ -211,8 +210,7 @@ int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
>  
>  void pnv_teardown_msi_irqs(struct pci_dev *pdev)
>  {
> -	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> -	struct pnv_phb *phb = hose->private_data;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
>  	struct msi_desc *entry;
>  	irq_hw_number_t hwirq;
>  
> @@ -824,10 +822,9 @@ EXPORT_SYMBOL(pnv_pci_get_phb_node);
>  
>  int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
>  {
> -	__be64 val;
> -	struct pci_controller *hose;
> -	struct pnv_phb *phb;
> +	struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
>  	u64 tunnel_bar;
> +	__be64 val;
>  	int rc;
>  
>  	if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
> @@ -835,9 +832,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
>  	if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
>  		return -ENXIO;
>  
> -	hose = pci_bus_to_host(dev->bus);
> -	phb = hose->private_data;
> -
>  	mutex_lock(&tunnel_mutex);
>  	rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
>  	if (rc != OPAL_SUCCESS) {
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 51c254f2f3cb..0727dec9a0d1 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -260,4 +260,14 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
>  
>  extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
>  
> +static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
> +{
> +	struct pci_controller *hose = bus->sysdata;
> +
> +	if (hose)
> +		return hose->private_data;


Since this is powernv, private_data should never be NULL, so maybe we
want a BUG_ON here?


> +
> +	return NULL;
> +}
> +
>  #endif /* __POWERNV_PCI_H */
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 02/15] powerpc/powernv/pci: Always tear down DMA windows on PE release
From: Alexey Kardashevskiy @ 2020-07-13  8:30 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-3-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> Currently we have these two functions:
> 
> 	pnv_pci_ioda2_release_dma_pe(), and
> 	pnv_pci_ioda2_release_pe_dma()
> 
> The first is used when tearing down VF PEs and the other is used for normal
> devices. There's very little difference between the two though. The latter
> (non-VF) will skip a call to pnv_pci_ioda2_unset_window() unless
> CONFIG_IOMMU_API=y is set. There's no real point in doing this so fold the
> two together.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>



Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 30 +++--------------------
>  1 file changed, 3 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 687919db0347..bfb40607aa0e 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1422,26 +1422,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  	return -EBUSY;
>  }
>  
> -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
> -		int num);
> -
> -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
> -{
> -	struct iommu_table    *tbl;
> -	int64_t               rc;
> -
> -	tbl = pe->table_group.tables[0];
> -	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> -	if (rc)
> -		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
> -
> -	pnv_pci_ioda2_set_bypass(pe, false);
> -	if (pe->table_group.group) {
> -		iommu_group_put(pe->table_group.group);
> -		BUG_ON(pe->table_group.group);
> -	}
> -	iommu_tce_table_put(tbl);
> -}
> +static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
>  
>  static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  {
> @@ -1455,11 +1436,12 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
>  	if (!pdev->is_physfn)
>  		return;
>  
> +	/* FIXME: Use pnv_ioda_release_pe()? */
>  	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
>  		if (pe->parent_dev != pdev)
>  			continue;
>  
> -		pnv_pci_ioda2_release_dma_pe(pdev, pe);
> +		pnv_pci_ioda2_release_pe_dma(pe);
>  
>  		/* Remove from list */
>  		mutex_lock(&phb->ioda.pe_list_mutex);
> @@ -2429,7 +2411,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
>  	return 0;
>  }
>  
> -#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
>  static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>  		int num)
>  {
> @@ -2453,7 +2434,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
>  
>  	return ret;
>  }
> -#endif
>  
>  #ifdef CONFIG_IOMMU_API
>  unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
> @@ -3334,18 +3314,14 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
>  	struct iommu_table *tbl = pe->table_group.tables[0];
>  	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
> -#ifdef CONFIG_IOMMU_API
>  	int64_t rc;
> -#endif
>  
>  	if (!weight)
>  		return;
>  
> -#ifdef CONFIG_IOMMU_API
>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
>  	if (rc)
>  		pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
> -#endif
>  
>  	pnv_pci_ioda2_set_bypass(pe, false);
>  	if (pe->table_group.group) {
> 

-- 
Alexey

^ permalink raw reply

* [PATCH] powerpc: Add cputime_to_nsecs()
From: Anton Blanchard @ 2020-07-13  8:36 UTC (permalink / raw)
  To: benh, paulus, mpe, npiggin; +Cc: linuxppc-dev

Generic code has a wrapper to implement cputime_to_nsecs() on top of
cputime_to_usecs() but we can easily return the full nanosecond
resolution directly.

Signed-off-by: Anton Blanchard <anton@ozlabs.org>
---
 arch/powerpc/include/asm/cputime.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 0fccd5ea1e9a..9335b93924b4 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -36,6 +36,8 @@ static inline unsigned long cputime_to_usecs(const cputime_t ct)
 	return mulhdu((__force u64) ct, __cputime_usec_factor);
 }
 
+#define cputime_to_nsecs(cputime) tb_to_ns((__force u64)cputime)
+
 /*
  * PPC64 uses PACA which is task independent for storing accounting data while
  * PPC32 uses struct thread_info, therefore at task switch the accounting data
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH 2/2] powerpc/powernv: Move pnv_ioda_setup_bus_dma under CONFIG_IOMMU_API
From: Alexey Kardashevskiy @ 2020-07-13  8:39 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev; +Cc: kernel test robot
In-Reply-To: <20200705133557.443607-2-oohall@gmail.com>



On 05/07/2020 23:35, Oliver O'Halloran wrote:
> pnv_ioda_setup_bus_dma() is only used when a passed through PE is
> returned to the host. If the kernel is built without IOMMU support
> this is dead code.

True.

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>




> Move it under the #ifdef with the rest of the
> IOMMU API support.
> 
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 26 +++++++++++------------
>  1 file changed, 13 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c2d46d28114b..31c3e6d58c41 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1885,19 +1885,6 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
>  	return false;
>  }
>  
> -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
> -{
> -	struct pci_dev *dev;
> -
> -	list_for_each_entry(dev, &bus->devices, bus_list) {
> -		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
> -		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
> -
> -		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> -			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
> -	}
> -}
> -
>  static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
>  						     bool real_mode)
>  {
> @@ -2547,6 +2534,19 @@ static long pnv_pci_ioda2_create_table_userspace(
>  	return ret;
>  }
>  
> +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
> +{
> +	struct pci_dev *dev;
> +
> +	list_for_each_entry(dev, &bus->devices, bus_list) {
> +		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
> +		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
> +
> +		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> +			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
> +	}
> +}
> +
>  static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
>  {
>  	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 03/14] powerpc/eeh: Move vf_index out of pci_dn and into eeh_dev
From: Alexey Kardashevskiy @ 2020-07-13  8:55 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev; +Cc: mahesh
In-Reply-To: <20200706013619.459420-4-oohall@gmail.com>



On 06/07/2020 11:36, Oliver O'Halloran wrote:
> Drivers that do not support the PCI error handling callbacks are handled by
> tearing down the device and re-probing them. If the device to be removed is
> a virtual function we need to know the index of the index of the VF so that

Too many indexes in "the index of the index of "?


> we can remove it with the pci_iov_{add|remove}_virtfn() API.
> 
> Currently this is handled by looking up the pci_dn, and using the vf_index
> that was stashed there when the pci_dn for the VF was created in
> pcibios_sriov_enable(). We would like to eliminate the use of pci_dn
> outside of pseries though so we need to provide the generic EEH code with
> some other way to find the vf_index.
> 
> The easiest thing to do here is move the vf_index field out of pci_dn and
> into eeh_dev.  Currently pci_dn and eeh_dev are allocated and initialized
> together so this is a fairly minimal change in preparation for splitting
> pci_dn and eeh_dev in the future.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>



Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> ---
>  arch/powerpc/include/asm/eeh.h        | 3 +++
>  arch/powerpc/include/asm/pci-bridge.h | 1 -
>  arch/powerpc/kernel/eeh_driver.c      | 6 ++----
>  arch/powerpc/kernel/pci_dn.c          | 7 ++++---
>  4 files changed, 9 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index e22881a0c415..3d648e042835 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -148,7 +148,10 @@ struct eeh_dev {
>  	struct pci_dn *pdn;		/* Associated PCI device node	*/
>  	struct pci_dev *pdev;		/* Associated PCI device	*/
>  	bool in_error;			/* Error flag for edev		*/
> +
> +	/* VF specific properties */
>  	struct pci_dev *physfn;		/* Associated SRIOV PF		*/
> +	int vf_index;			/* Index of this VF 		*/
>  };
>  
>  /* "fmt" must be a simple literal string */
> diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
> index b92e81b256e5..d2a2a14e56f9 100644
> --- a/arch/powerpc/include/asm/pci-bridge.h
> +++ b/arch/powerpc/include/asm/pci-bridge.h
> @@ -202,7 +202,6 @@ struct pci_dn {
>  #define IODA_INVALID_PE		0xFFFFFFFF
>  	unsigned int pe_number;
>  #ifdef CONFIG_PCI_IOV
> -	int     vf_index;		/* VF index in the PF */
>  	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
>  	u16     num_vfs;		/* number of VFs enabled*/
>  	unsigned int *pe_num_map;	/* PE# for the first VF PE or array */
> diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
> index 7b048cee767c..b70b9273f45a 100644
> --- a/arch/powerpc/kernel/eeh_driver.c
> +++ b/arch/powerpc/kernel/eeh_driver.c
> @@ -477,7 +477,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
>  	}
>  
>  #ifdef CONFIG_PCI_IOV
> -	pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
> +	pci_iov_add_virtfn(edev->physfn, edev->vf_index);
>  #endif
>  	return NULL;
>  }
> @@ -521,9 +521,7 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
>  
>  	if (edev->physfn) {
>  #ifdef CONFIG_PCI_IOV
> -		struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> -
> -		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
> +		pci_iov_remove_virtfn(edev->physfn, edev->vf_index);
>  		edev->pdev = NULL;
>  #endif
>  		if (rmv_data)
> diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
> index f790a8d06f50..bf11ac8427ac 100644
> --- a/arch/powerpc/kernel/pci_dn.c
> +++ b/arch/powerpc/kernel/pci_dn.c
> @@ -146,7 +146,6 @@ static struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
>  
>  #ifdef CONFIG_PCI_IOV
>  static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent,
> -					   int vf_index,
>  					   int busno, int devfn)
>  {
>  	struct pci_dn *pdn;
> @@ -163,7 +162,6 @@ static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent,
>  	pdn->parent = parent;
>  	pdn->busno = busno;
>  	pdn->devfn = devfn;
> -	pdn->vf_index = vf_index;
>  	pdn->pe_number = IODA_INVALID_PE;
>  	INIT_LIST_HEAD(&pdn->child_list);
>  	INIT_LIST_HEAD(&pdn->list);
> @@ -194,7 +192,7 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev)
>  	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
>  		struct eeh_dev *edev __maybe_unused;
>  
> -		pdn = add_one_sriov_vf_pdn(parent, i,
> +		pdn = add_one_sriov_vf_pdn(parent,
>  					   pci_iov_virtfn_bus(pdev, i),
>  					   pci_iov_virtfn_devfn(pdev, i));
>  		if (!pdn) {
> @@ -207,7 +205,10 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev)
>  		/* Create the EEH device for the VF */
>  		edev = eeh_dev_init(pdn);
>  		BUG_ON(!edev);
> +
> +		/* FIXME: these should probably be populated by the EEH probe */
>  		edev->physfn = pdev;
> +		edev->vf_index = i;
>  #endif /* CONFIG_EEH */
>  	}
>  	return pci_get_pdn(pdev);
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 04/14] powerpc/pseries: Stop using pdn->pe_number
From: Alexey Kardashevskiy @ 2020-07-13  8:59 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev; +Cc: mahesh
In-Reply-To: <20200706013619.459420-5-oohall@gmail.com>



On 06/07/2020 11:36, Oliver O'Halloran wrote:
> The pci_dn->pe_number field is mainly used to track the IODA PE number of a
> device on PowerNV. At some point it grew a user in the pseries SR-IOV
> support which muddies the waters a bit, so remove it.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>



> ---
>  arch/powerpc/platforms/pseries/eeh_pseries.c | 10 ++++------
>  1 file changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
> index ace117f99d94..18a2522b9b5e 100644
> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
> @@ -52,8 +52,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
>  	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
>  #ifdef CONFIG_PCI_IOV
>  	if (pdev->is_virtfn) {
> -		struct pci_dn *physfn_pdn;
> -
>  		pdn->device_id  =  pdev->device;
>  		pdn->vendor_id  =  pdev->vendor;
>  		pdn->class_code =  pdev->class;
> @@ -63,8 +61,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
>  		 * completion from platform.
>  		 */
>  		pdn->last_allow_rc =  0;
> -		physfn_pdn      =  pci_get_pdn(pdev->physfn);
> -		pdn->pe_number  =  physfn_pdn->pe_num_map[pdn->vf_index];
>  	}
>  #endif
>  	pseries_eeh_init_edev(pdn);
> @@ -772,8 +768,8 @@ int pseries_send_allow_unfreeze(struct pci_dn *pdn,
>  
>  static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
>  {
> +	int cur_vfs = 0, rc = 0, vf_index, bus, devfn, vf_pe_num;
>  	struct pci_dn *pdn, *tmp, *parent, *physfn_pdn;
> -	int cur_vfs = 0, rc = 0, vf_index, bus, devfn;
>  	u16 *vf_pe_array;
>  
>  	vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
> @@ -806,8 +802,10 @@ static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
>  			}
>  		} else {
>  			pdn = pci_get_pdn(edev->pdev);
> -			vf_pe_array[0] = cpu_to_be16(pdn->pe_number);
>  			physfn_pdn = pci_get_pdn(edev->physfn);
> +
> +			vf_pe_num = physfn_pdn->pe_num_map[edev->vf_index];
> +			vf_pe_array[0] = cpu_to_be16(vf_pe_num);
>  			rc = pseries_send_allow_unfreeze(physfn_pdn,
>  							 vf_pe_array, 1);
>  			pdn->last_allow_rc = rc;
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 03/14] powerpc/eeh: Move vf_index out of pci_dn and into eeh_dev
From: Oliver O'Halloran @ 2020-07-13  9:02 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Mahesh J Salgaonkar
In-Reply-To: <7853bbc2-715b-110a-2d96-8d32e6141261@ozlabs.ru>

On Mon, Jul 13, 2020 at 6:56 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
>
>
> On 06/07/2020 11:36, Oliver O'Halloran wrote:
> > Drivers that do not support the PCI error handling callbacks are handled by
> > tearing down the device and re-probing them. If the device to be removed is
> > a virtual function we need to know the index of the index of the VF so that
>
> Too many indexes in "the index of the index of "?

I'll index you!

(yes)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox