* [PATCH] powerpc/pci: Remove unimplemented prototypes
From: Oliver O'Halloran @ 2020-09-02 3:51 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Oliver O'Halloran
The corresponding definitions were deleted in commit 3d5134ee8341
("[POWERPC] Rewrite IO allocation & mapping on powerpc64") which
was merged a mere 13 years ago.
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
arch/powerpc/include/asm/ppc-pci.h | 4 ----
1 file changed, 4 deletions(-)
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 7f4be5a05eb3..0745422a8e57 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -13,10 +13,6 @@
extern unsigned long isa_io_base;
-extern void pci_setup_phb_io(struct pci_controller *hose, int primary);
-extern void pci_setup_phb_io_dynamic(struct pci_controller *hose, int primary);
-
-
extern struct list_head hose_list;
extern struct pci_dev *isa_bridge_pcidev; /* may be NULL if no ISA bus */
--
2.26.2
^ permalink raw reply related
* [PATCH] powerpc/powernv/pci: Drop VF MPS fixup
From: Oliver O'Halloran @ 2020-09-02 3:51 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Oliver O'Halloran
The MPS field in the VF config space is marked as reserved in current
versions of the SR-IOV spec. In other words, this fixup doesn't do
anything.
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
arch/powerpc/platforms/powernv/eeh-powernv.c | 18 ------------------
1 file changed, 18 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 9af8c3b98853..0cabe4e632e3 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1689,24 +1689,6 @@ static struct eeh_ops pnv_eeh_ops = {
.notify_resume = NULL
};
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
-{
- struct pci_dn *pdn = pci_get_pdn(pdev);
- int parent_mps;
-
- if (!pdev->is_virtfn)
- return;
-
- /* Synchronize MPS for VF and PF */
- parent_mps = pcie_get_mps(pdev->physfn);
- if ((128 << pdev->pcie_mpss) >= parent_mps)
- pcie_set_mps(pdev, parent_mps);
- pdn->mps = pcie_get_mps(pdev);
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
-#endif /* CONFIG_PCI_IOV */
-
/**
* eeh_powernv_init - Register platform dependent EEH operations
*
--
2.26.2
^ permalink raw reply related
* Re: [PATCH v3 13/13] mm/debug_vm_pgtable: populate a pte entry before fetching it
From: Aneesh Kumar K.V @ 2020-09-02 3:58 UTC (permalink / raw)
To: Anshuman Khandual, linux-mm, akpm
Cc: linux-arch, linux-s390, Christophe Leroy, x86, Mike Rapoport,
Qian Cai, Gerald Schaefer, Vineet Gupta, linux-snps-arc,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <4ba15b8f-ac90-17ec-9b95-0451e2a38e98@arm.com>
On 9/2/20 9:19 AM, Anshuman Khandual wrote:
>
>
> On 09/01/2020 03:28 PM, Aneesh Kumar K.V wrote:
>> On 9/1/20 1:08 PM, Anshuman Khandual wrote:
>>>
>>>
>>> On 09/01/2020 12:07 PM, Aneesh Kumar K.V wrote:
>>>> On 9/1/20 8:55 AM, Anshuman Khandual wrote:
>>>>>
>>>>>
>>>>> On 08/27/2020 01:34 PM, Aneesh Kumar K.V wrote:
>>>>>> pte_clear_tests operate on an existing pte entry. Make sure that is not a none
>>>>>> pte entry.
>>>>>>
>>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>>> ---
>>>>>> mm/debug_vm_pgtable.c | 6 ++++--
>>>>>> 1 file changed, 4 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>>>>> index 21329c7d672f..8527ebb75f2c 100644
>>>>>> --- a/mm/debug_vm_pgtable.c
>>>>>> +++ b/mm/debug_vm_pgtable.c
>>>>>> @@ -546,7 +546,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
>>>>>> static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
>>>>>> unsigned long vaddr)
>>>>>> {
>>>>>> - pte_t pte = ptep_get(ptep);
>>>>>> + pte_t pte = ptep_get_and_clear(mm, vaddr, ptep);
>>>>>
>>>>> Seems like ptep_get_and_clear() here just clears the entry in preparation
>>>>> for a following set_pte_at() which otherwise would have been a problem on
>>>>> ppc64 as you had pointed out earlier i.e set_pte_at() should not update an
>>>>> existing valid entry. So the commit message here is bit misleading.
>>>>>
>>>>
>>>> and also fetch the pte value which is used further.
>>>>
>>>>
>>>>>> pr_debug("Validating PTE clear\n");
>>>>>> pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
>>>>>> @@ -944,7 +944,7 @@ static int __init debug_vm_pgtable(void)
>>>>>> p4d_t *p4dp, *saved_p4dp;
>>>>>> pud_t *pudp, *saved_pudp;
>>>>>> pmd_t *pmdp, *saved_pmdp, pmd;
>>>>>> - pte_t *ptep;
>>>>>> + pte_t *ptep, pte;
>>>>>> pgtable_t saved_ptep;
>>>>>> pgprot_t prot, protnone;
>>>>>> phys_addr_t paddr;
>>>>>> @@ -1049,6 +1049,8 @@ static int __init debug_vm_pgtable(void)
>>>>>> */
>>>>>> ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
>>>>>> + pte = pfn_pte(pte_aligned, prot);
>>>>>> + set_pte_at(mm, vaddr, ptep, pte);
>>>>>
>>>>> Not here, creating and populating an entry must be done in respective
>>>>> test functions itself. Besides, this seems bit redundant as well. The
>>>>> test pte_clear_tests() with the above change added, already
>>>>>
>>>>> - Clears the PTEP entry with ptep_get_and_clear()
>>>>
>>>> and fetch the old value set previously.
>>>
>>> In that case, please move above two lines i.e
>>>
>>> pte = pfn_pte(pte_aligned, prot);
>>> set_pte_at(mm, vaddr, ptep, pte);
>>>
>>> from debug_vm_pgtable() to pte_clear_tests() and update it's arguments
>>> as required.
>>>
>>
>> Frankly, I don't understand what these tests are testing. It all looks like some random clear and set.
>
> The idea here is to have some value with some randomness preferably, in
> a given PTEP before attempting to clear the entry, in order to make sure
> that pte_clear() is indeed clearing something of non-zero value.
>
>>
>> static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
>> unsigned long vaddr, unsigned long pfn,
>> pgprot_t prot)
>> {
>>
>> pte_t pte = pfn_pte(pfn, prot);
>> set_pte_at(mm, vaddr, ptep, pte);
>>
>> pte = ptep_get_and_clear(mm, vaddr, ptep);
>
> Looking at this again, this preceding pfn_pte() followed by set_pte_at()
> is not really required. Its reasonable to start with what ever was there
> in the PTEP as a seed value which anyway gets added with RANDOM_ORVALUE.
> s/ptep_get/ptep_get_and_clear is sufficient to take care of the powerpc
> set_pte_at() constraint.
>
But the way test is written we had none pte before. That is why I added
that set_pte_at to put something there. With none pte the below sequence
fails.
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
because nobody is marking a _PAGE_PTE there.
pte_t pte = pfn_pte(pfn, prot);
pr_debug("Validating PTE clear\n");
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
will that work for you?
-aneesh
^ permalink raw reply
* [PATCH v3] powerpc/mm: Remove DEBUG_VM_PGTABLE support on powerpc
From: Aneesh Kumar K.V @ 2020-09-02 4:01 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, Anshuman Khandual
The test is broken w.r.t page table update rules and results in kernel
crash as below. Disable the support until we get the tests updated.
[ 21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304!
cpu 0x0: Vector: 700 (Program Check) at [c000000c6d1e76c0]
pc: c00000000009a5ec: assert_pte_locked+0x14c/0x380
lr: c0000000005eeeec: pte_update+0x11c/0x190
sp: c000000c6d1e7950
msr: 8000000002029033
current = 0xc000000c6d172c80
paca = 0xc000000003ba0000 irqmask: 0x03 irq_happened: 0x01
pid = 1, comm = swapper/0
kernel BUG at arch/powerpc/mm/pgtable.c:304!
[link register ] c0000000005eeeec pte_update+0x11c/0x190
[c000000c6d1e7950] 0000000000000001 (unreliable)
[c000000c6d1e79b0] c0000000005eee14 pte_update+0x44/0x190
[c000000c6d1e7a10] c000000001a2ca9c pte_advanced_tests+0x160/0x3d8
[c000000c6d1e7ab0] c000000001a2d4fc debug_vm_pgtable+0x7e8/0x1338
[c000000c6d1e7ba0] c0000000000116ec do_one_initcall+0xac/0x5f0
[c000000c6d1e7c80] c0000000019e4fac kernel_init_freeable+0x4dc/0x5a4
[c000000c6d1e7db0] c000000000012474 kernel_init+0x24/0x160
[c000000c6d1e7e20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c
With DEBUG_VM disabled
[ 20.530152] BUG: Kernel NULL pointer dereference on read at 0x00000000
[ 20.530183] Faulting instruction address: 0xc0000000000df330
cpu 0x33: Vector: 380 (Data SLB Access) at [c000000c6d19f700]
pc: c0000000000df330: memset+0x68/0x104
lr: c00000000009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0
sp: c000000c6d19f990
msr: 8000000002009033
dar: 0
current = 0xc000000c6d177480
paca = 0xc00000001ec4f400 irqmask: 0x03 irq_happened: 0x01
pid = 1, comm = swapper/0
[link register ] c00000000009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0
[c000000c6d19f990] c00000000009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable)
[c000000c6d19fa10] c0000000019ebf30 pmd_advanced_tests+0x1f0/0x378
[c000000c6d19fab0] c0000000019ed088 debug_vm_pgtable+0x79c/0x1244
[c000000c6d19fba0] c0000000000116ec do_one_initcall+0xac/0x5f0
[c000000c6d19fc80] c0000000019a4fac kernel_init_freeable+0x4dc/0x5a4
[c000000c6d19fdb0] c000000000012474 kernel_init+0x24/0x160
[c000000c6d19fe20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c
33:mon>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Documentation/features/debug/debug-vm-pgtable/arch-support.txt | 2 +-
arch/powerpc/Kconfig | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index 53da483c8326..1c49723e7534 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -22,7 +22,7 @@
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | ok |
+ | powerpc: | TODO |
| riscv: | ok |
| s390: | ok |
| sh: | TODO |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65bed1fdeaad..787e829b6f25 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -116,7 +116,6 @@ config PPC
#
select ARCH_32BIT_OFF_T if PPC32
select ARCH_HAS_DEBUG_VIRTUAL
- select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
--
2.26.2
^ permalink raw reply related
* [PATCH v6 0/8] powerpc/watchpoint: Bug fixes plus new feature flag
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
Patch #1 fixes issue for quardword instruction on p10 predecessors.
Patch #2 fixes issue for vector instructions.
Patch #3 fixes a bug about watchpoint not firing when created with
ptrace PPC_PTRACE_SETHWDEBUG and CONFIG_HAVE_HW_BREAKPOINT=N.
The fix uses HW_BRK_TYPE_PRIV_ALL for ptrace user which, I
guess, should be fine because we don't leak any kernel
addresses and PRIV_ALL will also help to cover scenarios when
kernel accesses user memory.
Patch #4,#5 fixes infinite exception bug, again the bug happens only
with CONFIG_HAVE_HW_BREAKPOINT=N.
Patch #6 fixes two places where we are missing to set hw_len.
Patch #7 introduce new feature bit PPC_DEBUG_FEATURE_DATA_BP_ARCH_31
which will be set when running on ISA 3.1 compliant machine.
Patch #8 finally adds selftest to test scenarios fixed by patch#2,#3
and also moves MODE_EXACT tests outside of BP_RANGE condition.
Christophe, let me know if this series breaks something for 8xx.
v5: https://lore.kernel.org/r/20200825043617.1073634-1-ravi.bangoria@linux.ibm.com
v5->v6:
- Fix build faulure reported by kernel test robot
- patch #5. Use more compact if condition, suggested by Christophe
Ravi Bangoria (8):
powerpc/watchpoint: Fix quarword instruction handling on p10
predecessors
powerpc/watchpoint: Fix handling of vector instructions
powerpc/watchpoint/ptrace: Fix SETHWDEBUG when
CONFIG_HAVE_HW_BREAKPOINT=N
powerpc/watchpoint: Move DAWR detection logic outside of
hw_breakpoint.c
powerpc/watchpoint: Fix exception handling for
CONFIG_HAVE_HW_BREAKPOINT=N
powerpc/watchpoint: Add hw_len wherever missing
powerpc/watchpoint/ptrace: Introduce PPC_DEBUG_FEATURE_DATA_BP_ARCH_31
powerpc/watchpoint/selftests: Tests for kernel accessing user memory
Documentation/powerpc/ptrace.rst | 1 +
arch/powerpc/include/asm/hw_breakpoint.h | 12 ++
arch/powerpc/include/uapi/asm/ptrace.h | 1 +
arch/powerpc/kernel/Makefile | 3 +-
arch/powerpc/kernel/hw_breakpoint.c | 149 +---------------
.../kernel/hw_breakpoint_constraints.c | 162 ++++++++++++++++++
arch/powerpc/kernel/process.c | 48 ++++++
arch/powerpc/kernel/ptrace/ptrace-noadv.c | 9 +-
arch/powerpc/xmon/xmon.c | 1 +
.../selftests/powerpc/ptrace/ptrace-hwbreak.c | 48 +++++-
10 files changed, 282 insertions(+), 152 deletions(-)
create mode 100644 arch/powerpc/kernel/hw_breakpoint_constraints.c
--
2.26.2
^ permalink raw reply
* [PATCH v6 1/8] powerpc/watchpoint: Fix quarword instruction handling on p10 predecessors
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
On p10 predecessors, watchpoint with quarword access is compared at
quardword length. If the watch range is doubleword or less than that
in a first half of quarword aligned 16 bytes, and if there is any
unaligned quadword access which will access only the 2nd half, the
handler should consider it as extraneous and emulate/single-step it
before continuing.
Reported-by: Pedro Miraglia Franco de Carvalho <pedromfc@linux.ibm.com>
Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than one watchpoint")
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/include/asm/hw_breakpoint.h | 1 +
arch/powerpc/kernel/hw_breakpoint.c | 12 ++++++++++--
2 files changed, 11 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index db206a7f38e2..9b68eafebf43 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -42,6 +42,7 @@ struct arch_hw_breakpoint {
#else
#define HW_BREAKPOINT_SIZE 0x8
#endif
+#define HW_BREAKPOINT_SIZE_QUADWORD 0x10
#define DABR_MAX_LEN 8
#define DAWR_MAX_LEN 512
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 1f4a1efa0074..9f7df1c37233 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -520,9 +520,17 @@ static bool ea_hw_range_overlaps(unsigned long ea, int size,
struct arch_hw_breakpoint *info)
{
unsigned long hw_start_addr, hw_end_addr;
+ unsigned long align_size = HW_BREAKPOINT_SIZE;
- hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE);
- hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE);
+ /*
+ * On p10 predecessors, quadword is handle differently then
+ * other instructions.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && size == 16)
+ align_size = HW_BREAKPOINT_SIZE_QUADWORD;
+
+ hw_start_addr = ALIGN_DOWN(info->address, align_size);
+ hw_end_addr = ALIGN(info->address + info->len, align_size);
return ((ea < hw_end_addr) && (ea + size > hw_start_addr));
}
--
2.26.2
^ permalink raw reply related
* [PATCH v6 2/8] powerpc/watchpoint: Fix handling of vector instructions
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
Vector load/store instructions are special because they are always
aligned. Thus unaligned EA needs to be aligned down before comparing
it with watch ranges. Otherwise we might consider valid event as
invalid.
Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than one watchpoint")
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/kernel/hw_breakpoint.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 9f7df1c37233..f6b24838ca3c 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -644,6 +644,8 @@ static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
if (*type == CACHEOP) {
*size = cache_op_size();
*ea &= ~(*size - 1);
+ } else if (*type == LOAD_VMX || *type == STORE_VMX) {
+ *ea &= ~(*size - 1);
}
}
--
2.26.2
^ permalink raw reply related
* [PATCH v6 3/8] powerpc/watchpoint/ptrace: Fix SETHWDEBUG when CONFIG_HAVE_HW_BREAKPOINT=N
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
When kernel is compiled with CONFIG_HAVE_HW_BREAKPOINT=N, user can
still create watchpoint using PPC_PTRACE_SETHWDEBUG, with limited
functionalities. But, such watchpoints are never firing because of
the missing privilege settings. Fix that.
It's safe to set HW_BRK_TYPE_PRIV_ALL because we don't really leak
any kernel address in signal info. Setting HW_BRK_TYPE_PRIV_ALL will
also help to find scenarios when kernel accesses user memory.
Reported-by: Pedro Miraglia Franco de Carvalho <pedromfc@linux.ibm.com>
Suggested-by: Pedro Miraglia Franco de Carvalho <pedromfc@linux.ibm.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/kernel/ptrace/ptrace-noadv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index 697c7e4b5877..57a0ab822334 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -217,7 +217,7 @@ long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_inf
return -EIO;
brk.address = ALIGN_DOWN(bp_info->addr, HW_BREAKPOINT_SIZE);
- brk.type = HW_BRK_TYPE_TRANSLATE;
+ brk.type = HW_BRK_TYPE_TRANSLATE | HW_BRK_TYPE_PRIV_ALL;
brk.len = DABR_MAX_LEN;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
brk.type |= HW_BRK_TYPE_READ;
--
2.26.2
^ permalink raw reply related
* [PATCH v6 4/8] powerpc/watchpoint: Move DAWR detection logic outside of hw_breakpoint.c
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
Power10 hw has multiple DAWRs but hw doesn't tell which DAWR caused
the exception. So we have a sw logic to detect that in hw_breakpoint.c.
But hw_breakpoint.c gets compiled only with CONFIG_HAVE_HW_BREAKPOINT=Y.
Move DAWR detection logic outside of hw_breakpoint.c so that it can be
reused when CONFIG_HAVE_HW_BREAKPOINT is not set.
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/include/asm/hw_breakpoint.h | 8 +
arch/powerpc/kernel/Makefile | 3 +-
arch/powerpc/kernel/hw_breakpoint.c | 159 +----------------
.../kernel/hw_breakpoint_constraints.c | 162 ++++++++++++++++++
4 files changed, 174 insertions(+), 158 deletions(-)
create mode 100644 arch/powerpc/kernel/hw_breakpoint_constraints.c
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index 9b68eafebf43..81872c420476 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -10,6 +10,7 @@
#define _PPC_BOOK3S_64_HW_BREAKPOINT_H
#include <asm/cpu_has_feature.h>
+#include <asm/inst.h>
#ifdef __KERNEL__
struct arch_hw_breakpoint {
@@ -52,6 +53,13 @@ static inline int nr_wp_slots(void)
return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1;
}
+bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr,
+ unsigned long ea, int type, int size,
+ struct arch_hw_breakpoint *info);
+
+void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
+ int *type, int *size, unsigned long *ea);
+
#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <linux/kdebug.h>
#include <asm/reg.h>
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index cbf41fb4ee89..a5550c2b24c4 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -45,7 +45,8 @@ obj-y := cputable.o syscalls.o \
signal.o sysfs.o cacheinfo.o time.o \
prom.o traps.o setup-common.o \
udbg.o misc.o io.o misc_$(BITS).o \
- of_platform.o prom_parse.o firmware.o
+ of_platform.o prom_parse.o firmware.o \
+ hw_breakpoint_constraints.o
obj-y += ptrace/
obj-$(CONFIG_PPC64) += setup_64.o \
paca.o nvram_64.o note.o syscall_64.o
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index f6b24838ca3c..f4e8f21046f5 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -494,161 +494,6 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
}
}
-static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info)
-{
- return ((info->address <= dar) && (dar - info->address < info->len));
-}
-
-static bool ea_user_range_overlaps(unsigned long ea, int size,
- struct arch_hw_breakpoint *info)
-{
- return ((ea < info->address + info->len) &&
- (ea + size > info->address));
-}
-
-static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info)
-{
- unsigned long hw_start_addr, hw_end_addr;
-
- hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE);
- hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE);
-
- return ((hw_start_addr <= dar) && (hw_end_addr > dar));
-}
-
-static bool ea_hw_range_overlaps(unsigned long ea, int size,
- struct arch_hw_breakpoint *info)
-{
- unsigned long hw_start_addr, hw_end_addr;
- unsigned long align_size = HW_BREAKPOINT_SIZE;
-
- /*
- * On p10 predecessors, quadword is handle differently then
- * other instructions.
- */
- if (!cpu_has_feature(CPU_FTR_ARCH_31) && size == 16)
- align_size = HW_BREAKPOINT_SIZE_QUADWORD;
-
- hw_start_addr = ALIGN_DOWN(info->address, align_size);
- hw_end_addr = ALIGN(info->address + info->len, align_size);
-
- return ((ea < hw_end_addr) && (ea + size > hw_start_addr));
-}
-
-/*
- * If hw has multiple DAWR registers, we also need to check all
- * dawrx constraint bits to confirm this is _really_ a valid event.
- * If type is UNKNOWN, but privilege level matches, consider it as
- * a positive match.
- */
-static bool check_dawrx_constraints(struct pt_regs *regs, int type,
- struct arch_hw_breakpoint *info)
-{
- if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ))
- return false;
-
- /*
- * The Cache Management instructions other than dcbz never
- * cause a match. i.e. if type is CACHEOP, the instruction
- * is dcbz, and dcbz is treated as Store.
- */
- if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE))
- return false;
-
- if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL))
- return false;
-
- if (user_mode(regs) && !(info->type & HW_BRK_TYPE_USER))
- return false;
-
- return true;
-}
-
-/*
- * Return true if the event is valid wrt dawr configuration,
- * including extraneous exception. Otherwise return false.
- */
-static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
- unsigned long ea, int type, int size,
- struct arch_hw_breakpoint *info)
-{
- bool in_user_range = dar_in_user_range(regs->dar, info);
- bool dawrx_constraints;
-
- /*
- * 8xx supports only one breakpoint and thus we can
- * unconditionally return true.
- */
- if (IS_ENABLED(CONFIG_PPC_8xx)) {
- if (!in_user_range)
- info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
- return true;
- }
-
- if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) {
- if (cpu_has_feature(CPU_FTR_ARCH_31) &&
- !dar_in_hw_range(regs->dar, info))
- return false;
-
- return true;
- }
-
- dawrx_constraints = check_dawrx_constraints(regs, type, info);
-
- if (type == UNKNOWN) {
- if (cpu_has_feature(CPU_FTR_ARCH_31) &&
- !dar_in_hw_range(regs->dar, info))
- return false;
-
- return dawrx_constraints;
- }
-
- if (ea_user_range_overlaps(ea, size, info))
- return dawrx_constraints;
-
- if (ea_hw_range_overlaps(ea, size, info)) {
- if (dawrx_constraints) {
- info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
- return true;
- }
- }
- return false;
-}
-
-static int cache_op_size(void)
-{
-#ifdef __powerpc64__
- return ppc64_caches.l1d.block_size;
-#else
- return L1_CACHE_BYTES;
-#endif
-}
-
-static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
- int *type, int *size, unsigned long *ea)
-{
- struct instruction_op op;
-
- if (__get_user_instr_inatomic(*instr, (void __user *)regs->nip))
- return;
-
- analyse_instr(&op, regs, *instr);
- *type = GETTYPE(op.type);
- *ea = op.ea;
-#ifdef __powerpc64__
- if (!(regs->msr & MSR_64BIT))
- *ea &= 0xffffffffUL;
-#endif
-
- *size = GETSIZE(op.type);
- if (*type == CACHEOP) {
- *size = cache_op_size();
- *ea &= ~(*size - 1);
- } else if (*type == LOAD_VMX || *type == STORE_VMX) {
- *ea &= ~(*size - 1);
- }
-}
-
static bool is_larx_stcx_instr(int type)
{
return type == LARX || type == STCX;
@@ -732,7 +577,7 @@ int hw_breakpoint_handler(struct die_args *args)
rcu_read_lock();
if (!IS_ENABLED(CONFIG_PPC_8xx))
- get_instr_detail(regs, &instr, &type, &size, &ea);
+ wp_get_instr_detail(regs, &instr, &type, &size, &ea);
for (i = 0; i < nr_wp_slots(); i++) {
bp[i] = __this_cpu_read(bp_per_reg[i]);
@@ -742,7 +587,7 @@ int hw_breakpoint_handler(struct die_args *args)
info[i] = counter_arch_bp(bp[i]);
info[i]->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
- if (check_constraints(regs, instr, ea, type, size, info[i])) {
+ if (wp_check_constraints(regs, instr, ea, type, size, info[i])) {
if (!IS_ENABLED(CONFIG_PPC_8xx) &&
ppc_inst_equal(instr, ppc_inst(0))) {
handler_error(bp[i], info[i]);
diff --git a/arch/powerpc/kernel/hw_breakpoint_constraints.c b/arch/powerpc/kernel/hw_breakpoint_constraints.c
new file mode 100644
index 000000000000..867ee4aa026a
--- /dev/null
+++ b/arch/powerpc/kernel/hw_breakpoint_constraints.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <asm/hw_breakpoint.h>
+#include <asm/sstep.h>
+#include <asm/cache.h>
+
+static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info)
+{
+ return ((info->address <= dar) && (dar - info->address < info->len));
+}
+
+static bool ea_user_range_overlaps(unsigned long ea, int size,
+ struct arch_hw_breakpoint *info)
+{
+ return ((ea < info->address + info->len) &&
+ (ea + size > info->address));
+}
+
+static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info)
+{
+ unsigned long hw_start_addr, hw_end_addr;
+
+ hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE);
+ hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE);
+
+ return ((hw_start_addr <= dar) && (hw_end_addr > dar));
+}
+
+static bool ea_hw_range_overlaps(unsigned long ea, int size,
+ struct arch_hw_breakpoint *info)
+{
+ unsigned long hw_start_addr, hw_end_addr;
+ unsigned long align_size = HW_BREAKPOINT_SIZE;
+
+ /*
+ * On p10 predecessors, quadword is handle differently then
+ * other instructions.
+ */
+ if (!cpu_has_feature(CPU_FTR_ARCH_31) && size == 16)
+ align_size = HW_BREAKPOINT_SIZE_QUADWORD;
+
+ hw_start_addr = ALIGN_DOWN(info->address, align_size);
+ hw_end_addr = ALIGN(info->address + info->len, align_size);
+
+ return ((ea < hw_end_addr) && (ea + size > hw_start_addr));
+}
+
+/*
+ * If hw has multiple DAWR registers, we also need to check all
+ * dawrx constraint bits to confirm this is _really_ a valid event.
+ * If type is UNKNOWN, but privilege level matches, consider it as
+ * a positive match.
+ */
+static bool check_dawrx_constraints(struct pt_regs *regs, int type,
+ struct arch_hw_breakpoint *info)
+{
+ if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ))
+ return false;
+
+ /*
+ * The Cache Management instructions other than dcbz never
+ * cause a match. i.e. if type is CACHEOP, the instruction
+ * is dcbz, and dcbz is treated as Store.
+ */
+ if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE))
+ return false;
+
+ if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL))
+ return false;
+
+ if (user_mode(regs) && !(info->type & HW_BRK_TYPE_USER))
+ return false;
+
+ return true;
+}
+
+/*
+ * Return true if the event is valid wrt dawr configuration,
+ * including extraneous exception. Otherwise return false.
+ */
+bool wp_check_constraints(struct pt_regs *regs, struct ppc_inst instr,
+ unsigned long ea, int type, int size,
+ struct arch_hw_breakpoint *info)
+{
+ bool in_user_range = dar_in_user_range(regs->dar, info);
+ bool dawrx_constraints;
+
+ /*
+ * 8xx supports only one breakpoint and thus we can
+ * unconditionally return true.
+ */
+ if (IS_ENABLED(CONFIG_PPC_8xx)) {
+ if (!in_user_range)
+ info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
+ return true;
+ }
+
+ if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) {
+ if (cpu_has_feature(CPU_FTR_ARCH_31) &&
+ !dar_in_hw_range(regs->dar, info))
+ return false;
+
+ return true;
+ }
+
+ dawrx_constraints = check_dawrx_constraints(regs, type, info);
+
+ if (type == UNKNOWN) {
+ if (cpu_has_feature(CPU_FTR_ARCH_31) &&
+ !dar_in_hw_range(regs->dar, info))
+ return false;
+
+ return dawrx_constraints;
+ }
+
+ if (ea_user_range_overlaps(ea, size, info))
+ return dawrx_constraints;
+
+ if (ea_hw_range_overlaps(ea, size, info)) {
+ if (dawrx_constraints) {
+ info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
+ return true;
+ }
+ }
+ return false;
+}
+
+static int cache_op_size(void)
+{
+#ifdef __powerpc64__
+ return ppc64_caches.l1d.block_size;
+#else
+ return L1_CACHE_BYTES;
+#endif
+}
+
+void wp_get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
+ int *type, int *size, unsigned long *ea)
+{
+ struct instruction_op op;
+
+ if (__get_user_instr_inatomic(*instr, (void __user *)regs->nip))
+ return;
+
+ analyse_instr(&op, regs, *instr);
+ *type = GETTYPE(op.type);
+ *ea = op.ea;
+#ifdef __powerpc64__
+ if (!(regs->msr & MSR_64BIT))
+ *ea &= 0xffffffffUL;
+#endif
+
+ *size = GETSIZE(op.type);
+ if (*type == CACHEOP) {
+ *size = cache_op_size();
+ *ea &= ~(*size - 1);
+ } else if (*type == LOAD_VMX || *type == STORE_VMX) {
+ *ea &= ~(*size - 1);
+ }
+}
--
2.26.2
^ permalink raw reply related
* [PATCH v6 5/8] powerpc/watchpoint: Fix exception handling for CONFIG_HAVE_HW_BREAKPOINT=N
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
On powerpc, ptrace watchpoint works in one-shot mode. i.e. kernel
disables event every time it fires and user has to re-enable it.
Also, in case of ptrace watchpoint, kernel notifies ptrace user
before executing instruction.
With CONFIG_HAVE_HW_BREAKPOINT=N, kernel is missing to disable
ptrace event and thus it's causing infinite loop of exceptions.
This is especially harmful when user watches on a data which is
also read/written by kernel, eg syscall parameters. In such case,
infinite exceptions happens in kernel mode which causes soft-lockup.
Fixes: 9422de3e953d ("powerpc: Hardware breakpoints rewrite to handle non DABR breakpoint registers")
Reported-by: Pedro Miraglia Franco de Carvalho <pedromfc@linux.ibm.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/include/asm/hw_breakpoint.h | 3 ++
arch/powerpc/kernel/process.c | 48 +++++++++++++++++++++++
arch/powerpc/kernel/ptrace/ptrace-noadv.c | 4 +-
3 files changed, 54 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index 81872c420476..abebfbee5b1c 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -18,6 +18,7 @@ struct arch_hw_breakpoint {
u16 type;
u16 len; /* length of the target data symbol */
u16 hw_len; /* length programmed in hw */
+ u8 flags;
};
/* Note: Don't change the first 6 bits below as they are in the same order
@@ -37,6 +38,8 @@ struct arch_hw_breakpoint {
#define HW_BRK_TYPE_PRIV_ALL (HW_BRK_TYPE_USER | HW_BRK_TYPE_KERNEL | \
HW_BRK_TYPE_HYP)
+#define HW_BRK_FLAG_DISABLED 0x1
+
/* Minimum granularity */
#ifdef CONFIG_PPC_8xx
#define HW_BREAKPOINT_SIZE 0x4
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 016bd831908e..160fbbf41d40 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -636,6 +636,44 @@ void do_send_trap(struct pt_regs *regs, unsigned long address,
(void __user *)address);
}
#else /* !CONFIG_PPC_ADV_DEBUG_REGS */
+
+static void do_break_handler(struct pt_regs *regs)
+{
+ struct arch_hw_breakpoint null_brk = {0};
+ struct arch_hw_breakpoint *info;
+ struct ppc_inst instr = ppc_inst(0);
+ int type = 0;
+ int size = 0;
+ unsigned long ea;
+ int i;
+
+ /*
+ * If underneath hw supports only one watchpoint, we know it
+ * caused exception. 8xx also falls into this category.
+ */
+ if (nr_wp_slots() == 1) {
+ __set_breakpoint(0, &null_brk);
+ current->thread.hw_brk[0] = null_brk;
+ current->thread.hw_brk[0].flags |= HW_BRK_FLAG_DISABLED;
+ return;
+ }
+
+ /* Otherwise findout which DAWR caused exception and disable it. */
+ wp_get_instr_detail(regs, &instr, &type, &size, &ea);
+
+ for (i = 0; i < nr_wp_slots(); i++) {
+ info = ¤t->thread.hw_brk[i];
+ if (!info->address)
+ continue;
+
+ if (wp_check_constraints(regs, instr, ea, type, size, info)) {
+ __set_breakpoint(i, &null_brk);
+ current->thread.hw_brk[i] = null_brk;
+ current->thread.hw_brk[i].flags |= HW_BRK_FLAG_DISABLED;
+ }
+ }
+}
+
void do_break (struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
@@ -647,6 +685,16 @@ void do_break (struct pt_regs *regs, unsigned long address,
if (debugger_break_match(regs))
return;
+ /*
+ * We reach here only when watchpoint exception is generated by ptrace
+ * event (or hw is buggy!). Now if CONFIG_HAVE_HW_BREAKPOINT is set,
+ * watchpoint is already handled by hw_breakpoint_handler() so we don't
+ * have to do anything. But when CONFIG_HAVE_HW_BREAKPOINT is not set,
+ * we need to manually handle the watchpoint here.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_HW_BREAKPOINT))
+ do_break_handler(regs);
+
/* Deliver the signal to userspace */
force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address);
}
diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index 57a0ab822334..c9122ed91340 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -286,11 +286,13 @@ long ppc_del_hwdebug(struct task_struct *child, long data)
}
return ret;
#else /* CONFIG_HAVE_HW_BREAKPOINT */
- if (child->thread.hw_brk[data - 1].address == 0)
+ if (!(child->thread.hw_brk[data - 1].flags & HW_BRK_FLAG_DISABLED) &&
+ child->thread.hw_brk[data - 1].address == 0)
return -ENOENT;
child->thread.hw_brk[data - 1].address = 0;
child->thread.hw_brk[data - 1].type = 0;
+ child->thread.hw_brk[data - 1].flags = 0;
#endif /* CONFIG_HAVE_HW_BREAKPOINT */
return 0;
--
2.26.2
^ permalink raw reply related
* [PATCH v6 6/8] powerpc/watchpoint: Add hw_len wherever missing
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
There are couple of places where we set len but not hw_len. For
ptrace/perf watchpoints, when CONFIG_HAVE_HW_BREAKPOINT=Y, hw_len
will be calculated and set internally while parsing watchpoint.
But when CONFIG_HAVE_HW_BREAKPOINT=N, we need to manually set
'hw_len'. Similarly for xmon as well, hw_len needs to be set
directly.
Fixes: b57aeab811db ("powerpc/watchpoint: Fix length calculation for unaligned target")
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/kernel/ptrace/ptrace-noadv.c | 1 +
arch/powerpc/xmon/xmon.c | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index c9122ed91340..48c52426af80 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -219,6 +219,7 @@ long ppc_set_hwdebug(struct task_struct *child, struct ppc_hw_breakpoint *bp_inf
brk.address = ALIGN_DOWN(bp_info->addr, HW_BREAKPOINT_SIZE);
brk.type = HW_BRK_TYPE_TRANSLATE | HW_BRK_TYPE_PRIV_ALL;
brk.len = DABR_MAX_LEN;
+ brk.hw_len = DABR_MAX_LEN;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_READ)
brk.type |= HW_BRK_TYPE_READ;
if (bp_info->trigger_type & PPC_BREAKPOINT_TRIGGER_WRITE)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index df7bca00f5ec..55c43a6c9111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -969,6 +969,7 @@ static void insert_cpu_bpts(void)
brk.address = dabr[i].address;
brk.type = (dabr[i].enabled & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL;
brk.len = 8;
+ brk.hw_len = 8;
__set_breakpoint(i, &brk);
}
}
--
2.26.2
^ permalink raw reply related
* [PATCH v6 7/8] powerpc/watchpoint/ptrace: Introduce PPC_DEBUG_FEATURE_DATA_BP_ARCH_31
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 can be used to determine whether
we are running on an ISA 3.1 compliant machine. Which is needed to
determine DAR behaviour, 512 byte boundary limit etc. This was
requested by Pedro Miraglia Franco de Carvalho for extending
watchpoint features in gdb. Note that availability of 2nd DAWR is
independent of this flag and should be checked using
ppc_debug_info->num_data_bps.
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
Documentation/powerpc/ptrace.rst | 1 +
arch/powerpc/include/uapi/asm/ptrace.h | 1 +
arch/powerpc/kernel/ptrace/ptrace-noadv.c | 2 ++
3 files changed, 4 insertions(+)
diff --git a/Documentation/powerpc/ptrace.rst b/Documentation/powerpc/ptrace.rst
index 864d4b6dddd1..77725d69eb4a 100644
--- a/Documentation/powerpc/ptrace.rst
+++ b/Documentation/powerpc/ptrace.rst
@@ -46,6 +46,7 @@ features will have bits indicating whether there is support for::
#define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x4
#define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x8
#define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x10
+ #define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x20
2. PTRACE_SETHWDEBUG
diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h
index f5f1ccc740fc..7004cfea3f5f 100644
--- a/arch/powerpc/include/uapi/asm/ptrace.h
+++ b/arch/powerpc/include/uapi/asm/ptrace.h
@@ -222,6 +222,7 @@ struct ppc_debug_info {
#define PPC_DEBUG_FEATURE_DATA_BP_RANGE 0x0000000000000004
#define PPC_DEBUG_FEATURE_DATA_BP_MASK 0x0000000000000008
#define PPC_DEBUG_FEATURE_DATA_BP_DAWR 0x0000000000000010
+#define PPC_DEBUG_FEATURE_DATA_BP_ARCH_31 0x0000000000000020
#ifndef __ASSEMBLY__
diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index 48c52426af80..aa36fcad36cd 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -57,6 +57,8 @@ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo)
} else {
dbginfo->features = 0;
}
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ dbginfo->features |= PPC_DEBUG_FEATURE_DATA_BP_ARCH_31;
}
int ptrace_get_debugreg(struct task_struct *child, unsigned long addr,
--
2.26.2
^ permalink raw reply related
* [PATCH v6 8/8] powerpc/watchpoint/selftests: Tests for kernel accessing user memory
From: Ravi Bangoria @ 2020-09-02 4:29 UTC (permalink / raw)
To: mpe, christophe.leroy
Cc: ravi.bangoria, mikey, jniethe5, pedromfc, linux-kernel, paulus,
rogealve, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200902042945.129369-1-ravi.bangoria@linux.ibm.com>
Introduce tests to cover simple scenarios where user is watching
memory which can be accessed by kernel as well. We also support
_MODE_EXACT with _SETHWDEBUG interface. Move those testcases out-
side of _BP_RANGE condition. This will help to test _MODE_EXACT
scenarios when CONFIG_HAVE_HW_BREAKPOINT is not set, eg:
$ ./ptrace-hwbreak
...
PTRACE_SET_DEBUGREG, Kernel Access Userspace, len: 8: Ok
PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO, len: 1: Ok
PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RO, len: 1: Ok
PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RW, len: 1: Ok
PPC_PTRACE_SETHWDEBUG, MODE_EXACT, Kernel Access Userspace, len: 1: Ok
success: ptrace-hwbreak
Suggested-by: Pedro Miraglia Franco de Carvalho <pedromfc@linux.ibm.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
.../selftests/powerpc/ptrace/ptrace-hwbreak.c | 48 ++++++++++++++++++-
1 file changed, 46 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
index fc477dfe86a2..2e0d86e0687e 100644
--- a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -20,6 +20,8 @@
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <linux/limits.h>
#include "ptrace.h"
#define SPRN_PVR 0x11F
@@ -44,6 +46,7 @@ struct gstruct {
};
static volatile struct gstruct gstruct __attribute__((aligned(512)));
+static volatile char cwd[PATH_MAX] __attribute__((aligned(8)));
static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo)
{
@@ -138,6 +141,9 @@ static void test_workload(void)
write_var(len);
}
+ /* PTRACE_SET_DEBUGREG, Kernel Access Userspace test */
+ syscall(__NR_getcwd, &cwd, PATH_MAX);
+
/* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO test */
write_var(1);
@@ -150,6 +156,9 @@ static void test_workload(void)
else
read_var(1);
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, Kernel Access Userspace test */
+ syscall(__NR_getcwd, &cwd, PATH_MAX);
+
/* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, WO test */
gstruct.a[rand() % A_LEN] = 'a';
@@ -293,6 +302,24 @@ static int test_set_debugreg(pid_t child_pid)
return 0;
}
+static int test_set_debugreg_kernel_userspace(pid_t child_pid)
+{
+ unsigned long wp_addr = (unsigned long)cwd;
+ char *name = "PTRACE_SET_DEBUGREG";
+
+ /* PTRACE_SET_DEBUGREG, Kernel Access Userspace test */
+ wp_addr &= ~0x7UL;
+ wp_addr |= (1Ul << DABR_READ_SHIFT);
+ wp_addr |= (1UL << DABR_WRITE_SHIFT);
+ wp_addr |= (1UL << DABR_TRANSLATION_SHIFT);
+ ptrace_set_debugreg(child_pid, wp_addr);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "Kernel Access Userspace", wp_addr, 8);
+
+ ptrace_set_debugreg(child_pid, 0);
+ return 0;
+}
+
static void get_ppc_hw_breakpoint(struct ppc_hw_breakpoint *info, int type,
unsigned long addr, int len)
{
@@ -338,6 +365,22 @@ static void test_sethwdebug_exact(pid_t child_pid)
ptrace_delhwdebug(child_pid, wh);
}
+static void test_sethwdebug_exact_kernel_userspace(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr = (unsigned long)&cwd;
+ char *name = "PPC_PTRACE_SETHWDEBUG, MODE_EXACT";
+ int len = 1; /* hardcoded in kernel */
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, Kernel Access Userspace test */
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr, 0);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "Kernel Access Userspace", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+}
+
static void test_sethwdebug_range_aligned(pid_t child_pid)
{
struct ppc_hw_breakpoint info;
@@ -452,9 +495,10 @@ static void
run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr)
{
test_set_debugreg(child_pid);
+ test_set_debugreg_kernel_userspace(child_pid);
+ test_sethwdebug_exact(child_pid);
+ test_sethwdebug_exact_kernel_userspace(child_pid);
if (dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_RANGE) {
- test_sethwdebug_exact(child_pid);
-
test_sethwdebug_range_aligned(child_pid);
if (dawr || is_8xx) {
test_sethwdebug_range_unaligned(child_pid);
--
2.26.2
^ permalink raw reply related
* Re: [PATCH -next] powerpc: Convert to DEFINE_SHOW_ATTRIBUTE
From: Paul Mackerras @ 2020-09-02 4:51 UTC (permalink / raw)
To: Qinglang Miao; +Cc: Greg Kroah-Hartman, linuxppc-dev, linux-kernel, kvm-ppc
In-Reply-To: <20200716090712.14375-1-miaoqinglang@huawei.com>
On Thu, Jul 16, 2020 at 05:07:12PM +0800, Qinglang Miao wrote:
> From: Chen Huang <chenhuang5@huawei.com>
>
> Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.
>
> Signed-off-by: Chen Huang <chenhuang5@huawei.com>
For the arch/powerpc/kvm part:
Acked-by: Paul Mackerras <paulus@ozlabs.org>
I expect Michael Ellerman will take the patch through his tree.
Paul.
^ permalink raw reply
* Re: [PATCH 2/7] powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR
From: Ravi Bangoria @ 2020-09-02 5:09 UTC (permalink / raw)
To: Paul Mackerras
Cc: christophe.leroy, Ravi Bangoria, mikey, rogealve, kvm,
linux-kernel, npiggin, kvm-ppc, linux-kselftest, jniethe5,
pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200902020124.GB272502@thinks.paulus.ozlabs.org>
Hi Paul,
On 9/2/20 7:31 AM, Paul Mackerras wrote:
> On Thu, Jul 23, 2020 at 03:50:53PM +0530, Ravi Bangoria wrote:
>> kvm code assumes single DAWR everywhere. Add code to support 2nd DAWR.
>> DAWR is a hypervisor resource and thus H_SET_MODE hcall is used to set/
>> unset it. Introduce new case H_SET_MODE_RESOURCE_SET_DAWR1 for 2nd DAWR.
>
> Is this the same interface as will be defined in PAPR and available
> under PowerVM, or is it a new/different interface for KVM?
Yes, kvm hcall interface for 2nd DAWR is same as PowerVM, as defined in PAPR.
>
>> Also, kvm will support 2nd DAWR only if CPU_FTR_DAWR1 is set.
>
> In general QEMU wants to be able to control all aspects of the virtual
> machine presented to the guest, meaning that just because a host has a
> particular hardware capability does not mean we should automatically
> present that capability to the guest.
>
> In this case, QEMU will want a way to control whether the guest sees
> the availability of the second DAWR/X registers or not, i.e. whether a
> H_SET_MODE to set DAWR[X]1 will succeed or fail.
Patch #3 adds new kvm capability KVM_CAP_PPC_DAWR1 that can be checked
by Qemu. Also, as suggested by David in Qemu patch[1], I'm planning to
add new machine capability in Qemu:
-machine cap-dawr1=ON/OFF
cap-dawr1 will be default ON when PPC_FEATURE2_ARCH_3_10 is set and OFF
otherwise.
Is this correct approach?
[1]: https://lore.kernel.org/kvm/20200724045613.GA8983@umbus.fritz.box
Thanks,
Ravi
^ permalink raw reply
* Re: [PATCH 1/7] powerpc/watchpoint/kvm: Rename current DAWR macros and variables
From: Ravi Bangoria @ 2020-09-02 5:09 UTC (permalink / raw)
To: Paul Mackerras
Cc: christophe.leroy, Ravi Bangoria, mikey, rogealve, kvm,
linux-kernel, npiggin, kvm-ppc, linux-kselftest, jniethe5,
pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200902014908.GA272502@thinks.paulus.ozlabs.org>
Hi Paul,
On 9/2/20 7:19 AM, Paul Mackerras wrote:
> On Thu, Jul 23, 2020 at 03:50:52PM +0530, Ravi Bangoria wrote:
>> Power10 is introducing second DAWR. Use real register names (with
>> suffix 0) from ISA for current macros and variables used by kvm.
>
> Most of this looks fine, but I think we should not change the existing
> names in arch/powerpc/include/uapi/asm/kvm.h (and therefore also
> Documentation/virt/kvm/api.rst).
Missed that I'm changing uapi. I'll rename only those macros/variables
which are not uapi.
Thanks,
Ravi
^ permalink raw reply
* Re: [PATCH 2/7] powerpc/watchpoint/kvm: Add infrastructure to support 2nd DAWR
From: Ravi Bangoria @ 2020-09-02 5:12 UTC (permalink / raw)
To: paulus
Cc: christophe.leroy, Ravi Bangoria, mikey, rogealve, kvm,
linux-kernel, npiggin, kvm-ppc, linux-kselftest, jniethe5,
pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200723102058.312282-3-ravi.bangoria@linux.ibm.com>
Hi Paul,
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 33793444144c..03f401d7be41 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -538,6 +538,8 @@ struct hv_guest_state {
> s64 tb_offset;
> u64 dawr0;
> u64 dawrx0;
> + u64 dawr1;
> + u64 dawrx1;
> u64 ciabr;
> u64 hdec_expiry;
> u64 purr;
After this struct, there is a macro HV_GUEST_STATE_VERSION, I guess that
also needs to be incremented because I'm adding new members in the struct?
Thanks,
Ravi
^ permalink raw reply
* Re: [PATCH 0/7] powerpc/watchpoint: 2nd DAWR kvm enablement + selftests
From: Ravi Bangoria @ 2020-09-02 5:13 UTC (permalink / raw)
To: Paul Mackerras
Cc: christophe.leroy, Ravi Bangoria, mikey, rogealve, kvm,
linux-kernel, npiggin, kvm-ppc, linux-kselftest, jniethe5,
pedromfc, pbonzini, linuxppc-dev
In-Reply-To: <20200902023259.GC272502@thinks.paulus.ozlabs.org>
Hi Paul,
On 9/2/20 8:02 AM, Paul Mackerras wrote:
> On Thu, Jul 23, 2020 at 03:50:51PM +0530, Ravi Bangoria wrote:
>> Patch #1, #2 and #3 enables p10 2nd DAWR feature for Book3S kvm guest. DAWR
>> is a hypervisor resource and thus H_SET_MODE hcall is used to set/unset it.
>> A new case H_SET_MODE_RESOURCE_SET_DAWR1 is introduced in H_SET_MODE hcall
>> for setting/unsetting 2nd DAWR. Also, new capability KVM_CAP_PPC_DAWR1 has
>> been added to query 2nd DAWR support via kvm ioctl.
>>
>> This feature also needs to be enabled in Qemu to really use it. I'll reply
>> link to qemu patches once I post them in qemu-devel mailing list.
>>
>> Patch #4, #5, #6 and #7 adds selftests to test 2nd DAWR.
>
> If/when you resubmit these patches, please split the KVM patches into
> a separate series, since the KVM patches would go via my tree whereas
> I expect the selftests/powerpc patches would go through Michael
> Ellerman's tree.
Sure. Will split it.
Thanks,
Ravi
^ permalink raw reply
* Re: [PATCH 0/2] link vdso with linker
From: Nathan Chancellor @ 2020-09-02 5:21 UTC (permalink / raw)
To: Nick Desaulniers
Cc: Christophe Leroy, Joe Lawrence, Kees Cook, Fangrui Song,
linux-kernel, Nicholas Piggin, clang-built-linux, Paul Mackerras,
linuxppc-dev
In-Reply-To: <20200901222523.1941988-1-ndesaulniers@google.com>
On Tue, Sep 01, 2020 at 03:25:21PM -0700, Nick Desaulniers wrote:
> Kees Cook is working on series that adds --orphan-section=warn to arm,
> arm64, and x86. I noticed that ppc vdso were still using cc-ldoption
> for these which I removed. It seems this results in that flag being
> silently dropped.
>
> I'm very confident with the first patch, but the second needs closer
> review around the error mentioned below the fold related to the .got
> section.
>
> Nick Desaulniers (2):
> powerpc/vdso64: link vdso64 with linker
> powerpc/vdso32: link vdso64 with linker
>
> arch/powerpc/include/asm/vdso.h | 17 ++---------------
> arch/powerpc/kernel/vdso32/Makefile | 7 +++++--
> arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
> arch/powerpc/kernel/vdso64/Makefile | 8 ++++++--
> arch/powerpc/kernel/vdso64/vdso64.lds.S | 1 -
> 5 files changed, 15 insertions(+), 21 deletions(-)
>
> --
> 2.28.0.402.g5ffc5be6b7-goog
>
ppc44x_defconfig and powernv_defconfig start failing with this series
when LD=ld.lld is used.
$ make -skj"$(nproc)" ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- LLVM=1 O=out/ppc32 distclean ppc44x_defconfig uImage
ld.lld: error: relocation R_PPC_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso32/datapage.o
>>> referenced by arch/powerpc/kernel/vdso32/gettimeofday.o:(__kernel_gettimeofday)
ld.lld: error: relocation R_PPC_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso32/datapage.o
>>> referenced by arch/powerpc/kernel/vdso32/gettimeofday.o:(__kernel_clock_gettime)
ld.lld: error: relocation R_PPC_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso32/datapage.o
>>> referenced by arch/powerpc/kernel/vdso32/gettimeofday.o:(__kernel_clock_getres)
ld.lld: error: relocation R_PPC_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso32/datapage.o
>>> referenced by arch/powerpc/kernel/vdso32/gettimeofday.o:(__kernel_time)
...
$ make -skj"$(nproc)" ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- LLVM=1 O=out/ppc64le distclean powernv_defconfig zImage.epapr
ld.lld: error: relocation R_PPC64_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso64/datapage.o
>>> referenced by arch/powerpc/kernel/vdso64/gettimeofday.o:(__kernel_gettimeofday)
ld.lld: error: relocation R_PPC64_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso64/datapage.o
>>> referenced by arch/powerpc/kernel/vdso64/gettimeofday.o:(__kernel_clock_gettime)
ld.lld: error: relocation R_PPC64_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso64/datapage.o
>>> referenced by arch/powerpc/kernel/vdso64/gettimeofday.o:(__kernel_clock_getres)
ld.lld: error: relocation R_PPC64_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso64/datapage.o
>>> referenced by arch/powerpc/kernel/vdso64/gettimeofday.o:(__kernel_time)
ld.lld: error: relocation R_PPC64_REL16_LO cannot be used against symbol __kernel_datapage_offset; recompile with -fPIC
>>> defined in arch/powerpc/kernel/vdso64/datapage.o
>>> referenced by arch/powerpc/kernel/vdso64/cacheflush.o:(__kernel_sync_dicache)
...
We need Fangrui's patch to fix ppc44x_defconfig:
https://lore.kernel.org/lkml/20200205005054.k72fuikf6rwrgfe4@google.com/
That exact same fix is needed in arch/powerpc/kernel/vdso64/datapage.S
to fix powernv_defconfig.
Cheers,
Nathan
^ permalink raw reply
* Re: [PATCH v1 08/10] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()
From: Leonardo Bras @ 2020-09-02 5:27 UTC (permalink / raw)
To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
Paul Mackerras, Christophe Leroy, Joel Stanley,
Thiago Jung Bauermann, Ram Pai, Brian King,
Murilo Fossa Vicentini, David Dai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <1a469384-91ad-81f0-2a42-4c985cbc92da@ozlabs.ru>
On Mon, 2020-08-31 at 14:34 +1000, Alexey Kardashevskiy wrote:
>
> On 29/08/2020 01:25, Leonardo Bras wrote:
> > On Mon, 2020-08-24 at 15:07 +1000, Alexey Kardashevskiy wrote:
> > > On 18/08/2020 09:40, Leonardo Bras wrote:
> > > > Code used to create a ddw property that was previously scattered in
> > > > enable_ddw() is now gathered in ddw_property_create(), which deals with
> > > > allocation and filling the property, letting it ready for
> > > > of_property_add(), which now occurs in sequence.
> > > >
> > > > This created an opportunity to reorganize the second part of enable_ddw():
> > > >
> > > > Without this patch enable_ddw() does, in order:
> > > > kzalloc() property & members, create_ddw(), fill ddwprop inside property,
> > > > ddw_list_add(), do tce_setrange_multi_pSeriesLP_walk in all memory,
> > > > of_add_property().
> > > >
> > > > With this patch enable_ddw() does, in order:
> > > > create_ddw(), ddw_property_create(), of_add_property(), ddw_list_add(),
> > > > do tce_setrange_multi_pSeriesLP_walk in all memory.
> > > >
> > > > This change requires of_remove_property() in case anything fails after
> > > > of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
> > > > in all memory, which looks the most expensive operation, only if
> > > > everything else succeeds.
> > > >
> > > > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > > > ---
> > > > arch/powerpc/platforms/pseries/iommu.c | 97 +++++++++++++++-----------
> > > > 1 file changed, 57 insertions(+), 40 deletions(-)
> > > >
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 4031127c9537..3a1ef02ad9d5 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -1123,6 +1123,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> > > > ret);
> > > > }
> > > >
> > > > +static int ddw_property_create(struct property **ddw_win, const char *propname,
> > >
> > > @propname is always the same, do you really want to pass it every time?
> >
> > I think it reads better, like "create a ddw property with this name".
>
> This reads as "there are at least two ddw properties".
>
> > Also, it makes possible to create ddw properties with other names, in
> > case we decide to create properties with different names depending on
> > the window created.
>
> It is one window at any given moment, why call it different names... I
> get the part that it is not always "direct" anymore but still...
>
It seems the case as one of the options you suggested on patch [09/10]
>> I suspect it breaks kexec (from older kernel to this one) so you
>> either need to check for both DT names or just keep the old one.
>
> > Also, it's probably optimized / inlined at this point.
> > Is it ok doing it like this?
> >
> > > > + u32 liobn, u64 dma_addr, u32 page_shift, u32 window_shift)
> > > > +{
> > > > + struct dynamic_dma_window_prop *ddwprop;
> > > > + struct property *win64;
> > > > +
> > > > + *ddw_win = win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
> > > > + if (!win64)
> > > > + return -ENOMEM;
> > > > +
> > > > + win64->name = kstrdup(propname, GFP_KERNEL);
> > >
> > > Not clear why "win64->name = DIRECT64_PROPNAME" would not work here, the
> > > generic OF code does not try kfree() it but it is probably out of scope
> > > here.
> >
> > Yeah, I had that question too.
> > Previous code was like that, and I as trying not to mess too much on
> > how it's done.
> >
> > > > + ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
> > > > + win64->value = ddwprop;
> > > > + win64->length = sizeof(*ddwprop);
> > > > + if (!win64->name || !win64->value)
> > > > + return -ENOMEM;
> > >
> > > Up to 2 memory leaks here. I see the cleanup at "out_free_prop:" but
> > > still looks fragile. Instead you could simply return win64 as the only
> > > error possible here is -ENOMEM and returning NULL is equally good.
> >
> > I agree. It's better if this function have it's own cleaning routine.
> > It will be fixed for next version.
> >
> > >
> > > > +
> > > > + ddwprop->liobn = cpu_to_be32(liobn);
> > > > + ddwprop->dma_base = cpu_to_be64(dma_addr);
> > > > + ddwprop->tce_shift = cpu_to_be32(page_shift);
> > > > + ddwprop->window_shift = cpu_to_be32(window_shift);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > /*
> > > > * If the PE supports dynamic dma windows, and there is space for a table
> > > > * that can map all pages in a linear offset, then setup such a table,
> > > > @@ -1140,12 +1165,11 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > struct ddw_query_response query;
> > > > struct ddw_create_response create;
> > > > int page_shift;
> > > > - u64 max_addr;
> > > > + u64 max_addr, win_addr;
> > > > struct device_node *dn;
> > > > u32 ddw_avail[DDW_APPLICABLE_SIZE];
> > > > struct direct_window *window;
> > > > - struct property *win64;
> > > > - struct dynamic_dma_window_prop *ddwprop;
> > > > + struct property *win64 = NULL;
> > > > struct failed_ddw_pdn *fpdn;
> > > > bool default_win_removed = false;
> > > >
> > > > @@ -1244,38 +1268,34 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > goto out_failed;
> > > > }
> > > > len = order_base_2(max_addr);
> > > > - win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
> > > > - if (!win64) {
> > > > - dev_info(&dev->dev,
> > > > - "couldn't allocate property for 64bit dma window\n");
> > > > +
> > > > + ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
> > > > + if (ret != 0)
> > >
> > > It is usually just "if (ret)"
> >
> > It was previously like that, and all query_ddw() checks return value
> > this way.
>
> ah I see.
>
> > Should I update them all or just this one?
>
> Pick one variant and make sure all new lines use just that. In this
> patch you add both variants. Thanks,
Ok, I will do that from now on.
Thanks!
^ permalink raw reply
* Re: [PATCH v1 09/10] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-09-02 6:11 UTC (permalink / raw)
To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
Paul Mackerras, Christophe Leroy, Joel Stanley,
Thiago Jung Bauermann, Ram Pai, Brian King,
Murilo Fossa Vicentini, David Dai
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <d309c872-c574-4eb5-0e6e-0a36f73e168e@ozlabs.ru>
On Mon, 2020-08-31 at 14:35 +1000, Alexey Kardashevskiy wrote:
>
> On 29/08/2020 04:36, Leonardo Bras wrote:
> > On Mon, 2020-08-24 at 15:17 +1000, Alexey Kardashevskiy wrote:
> > > On 18/08/2020 09:40, Leonardo Bras wrote:
> > > > As of today, if the biggest DDW that can be created can't map the whole
> > > > partition, it's creation is skipped and the default DMA window
> > > > "ibm,dma-window" is used instead.
> > > >
> > > > DDW is 16x bigger than the default DMA window,
> > >
> > > 16x only under very specific circumstances which are
> > > 1. phyp
> > > 2. sriov
> > > 3. device class in hmc (or what that priority number is in the lpar config).
> >
> > Yeah, missing details.
> >
> > > > having the same amount of
> > > > pages, but increasing the page size to 64k.
> > > > Besides larger DMA window,
> > >
> > > "Besides being larger"?
> >
> > You are right there.
> >
> > > > it performs better for allocations over 4k,
> > >
> > > Better how?
> >
> > I was thinking for allocations larger than (512 * 4k), since >2
> > hypercalls are needed here, and for 64k pages would still be just 1
> > hypercall up to (512 * 64k).
> > But yeah, not the usual case anyway.
>
> Yup.
>
>
> > > > so it would be nice to use it instead.
> > >
> > > I'd rather say something like:
> > > ===
> > > So far we assumed we can map the guest RAM 1:1 to the bus which worked
> > > with a small number of devices. SRIOV changes it as the user can
> > > configure hundreds VFs and since phyp preallocates TCEs and does not
> > > allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> > > per a PE to limit waste of physical pages.
> > > ===
> >
> > I mixed this in my commit message, it looks like this:
> >
> > ===
> > powerpc/pseries/iommu: Make use of DDW for indirect mapping
> >
> > So far it's assumed possible to map the guest RAM 1:1 to the bus, which
> > works with a small number of devices. SRIOV changes it as the user can
> > configure hundreds VFs and since phyp preallocates TCEs and does not
> > allow IOMMU pages bigger than 64K, it has to limit the number of TCEs
> > per a PE to limit waste of physical pages.
> >
> > As of today, if the assumed direct mapping is not possible, DDW
> > creation is skipped and the default DMA window "ibm,dma-window" is used
> > instead.
> >
> > The default DMA window uses 4k pages instead of 64k pages, and since
> > the amount of pages is the same,
>
> Is the amount really the same? I thought you can prioritize some VFs
> over others (== allocate different number of TCEs). Does it really
> matter if it is the same?
On a conversation with Travis Pizel, he explained how it's supposed to
work, and I understood this:
When a VF is created, it will be assigned a capacity, like 4%, 20%, and
so on. The number of 'TCE entries' that are available to that partition
are proportional to that capacity.
If we use the default DMA window, the IOMMU pagesize/entry will be 4k,
and if we use DDW, we will get 64k pagesize. As the number of entries
will be the same (for the same capacity), the total space that can be
addressed by the IOMMU will be 16 times bigger. This sometimes enable
direct mapping, but sometimes it's still not enough.
On Travis words :
"A low capacity VF, with less resources available, will certainly have
less DMA window capability than a high capacity VF. But, an 8GB DMA
window (with 64k pages) is still 16x larger than an 512MB window (with
4K pages).
A high capacity VF - for example, one that Leonardo has in his scenario
- will go from 8GB (using 4K pages) to 128GB (using 64K pages) - again,
16x larger - but it's obviously still possible to create a partition
that exceeds 128GB of memory in size."
>
>
> > making use of DDW instead of the
> > default DMA window for indirect mapping will expand in 16x the amount
> > of memory that can be mapped on DMA.
>
> Stop saying "16x", it is not guaranteed by anything :)
>
>
> > The DDW created will be used for direct mapping by default. [...]
> > ===
> >
> > What do you think?
> >
> > > > The DDW created will be used for direct mapping by default.
> > > > If it's not available, indirect mapping will be used instead.
> > > >
> > > > For indirect mapping, it's necessary to update the iommu_table so
> > > > iommu_alloc() can use the DDW created. For this,
> > > > iommu_table_update_window() is called when everything else succeeds
> > > > at enable_ddw().
> > > >
> > > > Removing the default DMA window for using DDW with indirect mapping
> > > > is only allowed if there is no current IOMMU memory allocated in
> > > > the iommu_table. enable_ddw() is aborted otherwise.
> > > >
> > > > As there will never have both direct and indirect mappings at the same
> > > > time, the same property name can be used for the created DDW.
> > > >
> > > > So renaming
> > > > define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > > > to
> > > > define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > > > looks the right thing to do.
> > >
> > > I know I suggested this but this does not look so good anymore as I
> > > suspect it breaks kexec (from older kernel to this one) so you either
> > > need to check for both DT names or just keep the old one. Changing the
> > > macro name is fine.
> > >
> >
> > Yeah, having 'direct' in the name don't really makes sense if it's used
> > for indirect mapping. I will just add this new define instead of
> > replacing the old one, and check for both.
> > Is that ok?
>
> No, having two of these does not seem right or useful. It is pseries
> which does not use petitboot (relies on grub instead so until the target
> kernel is started, there will be no ddw) so realistically we need this
> property for kexec/kdump which uses the same kernel but different
> initramdisk so for that purpose we need the same property name.
>
> But I can see myself annoyed when I try petitboot in the hacked pseries
> qemu and things may crash :) On this basis I'd suggest keeping the name
> and adding a comment next to it that it is not always "direct" anymore.
>
Keeping the same name should bring more problems than solve.
If we have indirect mapping and kexec() to an older kernel, it will
think direct mapping is enabled, and trying to use a DMA address
without doing H_PUT_* first may cause a crash.
I tested with a new property name, and it doesn't crash.
As the property is not found, it does try to create a new DDW, which
fails and it falls back to using the default DMA window.
The device that need the IOMMU don't work well, but when iommu_map()
fails, it doesn't try to use the DMA address as valid.
>
> > > > To make sure the property differentiates both cases, a new u32 for flags
> > > > was added at the end of the property, where BIT(0) set means direct
> > > > mapping.
> > > >
> > > > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > > > ---
> > > > arch/powerpc/platforms/pseries/iommu.c | 108 +++++++++++++++++++------
> > > > 1 file changed, 84 insertions(+), 24 deletions(-)
> > > >
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 3a1ef02ad9d5..9544e3c91ced 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -350,8 +350,11 @@ struct dynamic_dma_window_prop {
> > > > __be64 dma_base; /* address hi,lo */
> > > > __be32 tce_shift; /* ilog2(tce_page_size) */
> > > > __be32 window_shift; /* ilog2(tce_window_size) */
> > > > + __be32 flags; /* DDW properties, see bellow */
> > > > };
> > > >
> > > > +#define DDW_FLAGS_DIRECT 0x01
> > >
> > > This is set if ((1<<window_shift) >= ddw_memory_hotplug_max()), you
> > > could simply check window_shift and drop the flags.
> > >
> >
> > Yeah, it's better this way, I will revert this.
> >
> > > > +
> > > > struct direct_window {
> > > > struct device_node *device;
> > > > const struct dynamic_dma_window_prop *prop;
> > > > @@ -377,7 +380,7 @@ static LIST_HEAD(direct_window_list);
> > > > static DEFINE_SPINLOCK(direct_window_list_lock);
> > > > /* protects initializing window twice for same device */
> > > > static DEFINE_MUTEX(direct_window_init_mutex);
> > > > -#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info"
> > > > +#define DMA64_PROPNAME "linux,dma64-ddr-window-info"
> > > >
> > > > static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn,
> > > > unsigned long num_pfn, const void *arg)
> > > > @@ -836,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
> > > > if (ret)
> > > > return;
> > > >
> > > > - win = of_find_property(np, DIRECT64_PROPNAME, NULL);
> > > > + win = of_find_property(np, DMA64_PROPNAME, NULL);
> > > > if (!win)
> > > > return;
> > > >
> > > > @@ -852,7 +855,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
> > > > np, ret);
> > > > }
> > > >
> > > > -static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> > > > +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, bool *direct_mapping)
> > > > {
> > > > struct direct_window *window;
> > > > const struct dynamic_dma_window_prop *direct64;
> > > > @@ -864,6 +867,7 @@ static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr)
> > > > if (window->device == pdn) {
> > > > direct64 = window->prop;
> > > > *dma_addr = be64_to_cpu(direct64->dma_base);
> > > > + *direct_mapping = be32_to_cpu(direct64->flags) & DDW_FLAGS_DIRECT;
> > > > found = true;
> > > > break;
> > > > }
> > > > @@ -901,8 +905,8 @@ static int find_existing_ddw_windows(void)
> > > > if (!firmware_has_feature(FW_FEATURE_LPAR))
> > > > return 0;
> > > >
> > > > - for_each_node_with_property(pdn, DIRECT64_PROPNAME) {
> > > > - direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len);
> > > > + for_each_node_with_property(pdn, DMA64_PROPNAME) {
> > > > + direct64 = of_get_property(pdn, DMA64_PROPNAME, &len);
> > > > if (!direct64)
> > > > continue;
> > > >
> > > > @@ -1124,7 +1128,8 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> > > > }
> > > >
> > > > static int ddw_property_create(struct property **ddw_win, const char *propname,
> > > > - u32 liobn, u64 dma_addr, u32 page_shift, u32 window_shift)
> > > > + u32 liobn, u64 dma_addr, u32 page_shift,
> > > > + u32 window_shift, bool direct_mapping)
> > > > {
> > > > struct dynamic_dma_window_prop *ddwprop;
> > > > struct property *win64;
> > > > @@ -1144,6 +1149,36 @@ static int ddw_property_create(struct property **ddw_win, const char *propname,
> > > > ddwprop->dma_base = cpu_to_be64(dma_addr);
> > > > ddwprop->tce_shift = cpu_to_be32(page_shift);
> > > > ddwprop->window_shift = cpu_to_be32(window_shift);
> > > > + if (direct_mapping)
> > > > + ddwprop->flags = cpu_to_be32(DDW_FLAGS_DIRECT);
> > > > +
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int iommu_table_update_window(struct iommu_table **tbl, int nid, unsigned long liobn,
> > > > + unsigned long win_addr, unsigned long page_shift,
> > > > + unsigned long window_size)
> > >
> > > Rather strange helper imho. I'd extract the most of
> > > iommu_table_setparms_lpar() into iommu_table_setparms() (except
> > > of_parse_dma_window) and call new helper from where you call
> > > iommu_table_update_window; and do
> > > iommu_pseries_alloc_table/iommu_tce_table_put there.
> > >
> >
> > I don't see how to extract iommu_table_setparms_lpar() into
> > iommu_table_setparms(), they look to be used for different machine
> > types.
> >
> > Do mean you extracting most of iommu_table_setparms_lpar() (and maybe
> > iommu_table_setparms() ) into a new helper, which is called in both
> > functions and use it instead of iommu_table_update_window() ?
>
> Yes, this.
I will do that then, seems better. :)
>
>
> > > > +{
> > > > + struct iommu_table *new_tbl, *old_tbl;
> > > > +
> > > > + new_tbl = iommu_pseries_alloc_table(nid);
> > > > + if (!new_tbl)
> > > > + return -ENOMEM;
> > > > +
> > > > + old_tbl = *tbl;
> > > > + new_tbl->it_index = liobn;
> > > > + new_tbl->it_offset = win_addr >> page_shift;
> > > > + new_tbl->it_page_shift = page_shift;
> > > > + new_tbl->it_size = window_size >> page_shift;
> > > > + new_tbl->it_base = old_tbl->it_base;
> > >
> > > Should not be used in pseries.
> > >
> >
> > The point here is to migrate the values from the older tbl to the
>
> The actual window/table is new (on the hypervisor side), you are not
> migrating a single TCE, you deleted one whole window and created another
> whole window, calling it "migration" is confusing, especially when PAPR
> actually defines TCE migration.
Ok, I understand it's confusing now. I will avoid using this term from
now on.
>
>
> > newer. I Would like to understand why this is bad, if it will still be
> > 'unused' as the older tbl.
>
> Having explicit values is more readable imho.
Ok, I understand why it should be improved.!
Alexey, thank you for reviewing, and for helping me with my questions!
Best regards,
>
>
> > > > + new_tbl->it_busno = old_tbl->it_busno;
> > > > + new_tbl->it_blocksize = old_tbl->it_blocksize;
> > >
> > > 16 for pseries and does not change (may be even make it a macro).
> > >
> > > > + new_tbl->it_type = old_tbl->it_type;
> > >
> > > TCE_PCI.
> > >
> >
> > Same as above.
> >
> > > > + new_tbl->it_ops = old_tbl->it_ops;
> > > > +
> > > > + iommu_init_table(new_tbl, nid, old_tbl->it_reserved_start, old_tbl->it_reserved_end);
> > > > + iommu_tce_table_put(old_tbl);
> > > > + *tbl = new_tbl;
> > > >
> > > > return 0;
> > > > }
> > > > @@ -1171,12 +1206,16 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > struct direct_window *window;
> > > > struct property *win64 = NULL;
> > > > struct failed_ddw_pdn *fpdn;
> > > > - bool default_win_removed = false;
> > > > + bool default_win_removed = false, maps_whole_partition = false;
> > >
> > > s/maps_whole_partition/direct_mapping/
> > >
> >
> > Sure, I will get it replaced.
> >
> > > > + struct pci_dn *pci = PCI_DN(pdn);
> > > > + struct iommu_table *tbl = pci->table_group->tables[0];
> > > >
> > > > mutex_lock(&direct_window_init_mutex);
> > > >
> > > > - if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset))
> > > > - goto out_unlock;
> > > > + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &maps_whole_partition)) {
> > > > + mutex_unlock(&direct_window_init_mutex);
> > > > + return maps_whole_partition;
> > > > + }
> > > >
> > > > /*
> > > > * If we already went through this for a previous function of
> > > > @@ -1258,16 +1297,24 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > query.page_size);
> > > > goto out_failed;
> > > > }
> > > > +
> > > > /* verify the window * number of ptes will map the partition */
> > > > - /* check largest block * page size > max memory hotplug addr */
> > > > max_addr = ddw_memory_hotplug_max();
> > > > if (query.largest_available_block < (max_addr >> page_shift)) {
> > > > - dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
> > > > - "%llu-sized pages\n", max_addr, query.largest_available_block,
> > > > - 1ULL << page_shift);
> > > > - goto out_failed;
> > > > + dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n",
> > > > + max_addr, query.largest_available_block,
> > > > + 1ULL << page_shift);
> > > > +
> > > > + len = order_base_2(query.largest_available_block << page_shift);
> > > > + } else {
> > > > + maps_whole_partition = true;
> > > > + len = order_base_2(max_addr);
> > > > }
> > > > - len = order_base_2(max_addr);
> > > > +
> > > > + /* DDW + IOMMU on single window may fail if there is any allocation */
> > > > + if (default_win_removed && !maps_whole_partition &&
> > > > + iommu_table_in_use(tbl))
> > > > + goto out_failed;
> > > >
> > > > ret = create_ddw(dev, ddw_avail, &create, page_shift, len);
> > > > if (ret != 0)
> > > > @@ -1277,8 +1324,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > create.liobn, dn);
> > > >
> > > > win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
> > > > - ret = ddw_property_create(&win64, DIRECT64_PROPNAME, create.liobn, win_addr,
> > > > - page_shift, len);
> > > > + ret = ddw_property_create(&win64, DMA64_PROPNAME, create.liobn, win_addr,
> > > > + page_shift, len, maps_whole_partition);
> > > > if (ret) {
> > > > dev_info(&dev->dev,
> > > > "couldn't allocate property, property name, or value\n");
> > > > @@ -1297,12 +1344,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > > if (!window)
> > > > goto out_prop_del;
> > > >
> > > > - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> > > > - win64->value, tce_setrange_multi_pSeriesLP_walk);
> > > > - if (ret) {
> > > > - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> > > > - dn, ret);
> > > > - goto out_free_window;
> > > > + if (maps_whole_partition) {
> > > > + /* DDW maps the whole partition, so enable direct DMA mapping */
> > > > + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> > > > + win64->value, tce_setrange_multi_pSeriesLP_walk);
> > > > + if (ret) {
> > > > + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n",
> > > > + dn, ret);
> > > > + goto out_free_window;
> > > > + }
> > > > + } else {
> > > > + /* New table for using DDW instead of the default DMA window */
> > > > + if (iommu_table_update_window(&tbl, pci->phb->node, create.liobn,
> > > > + win_addr, page_shift, 1UL << len))
> > > > + goto out_free_window;
> > > > +
> > > > + set_iommu_table_base(&dev->dev, tbl);
> > > > + WARN_ON(dev->dev.archdata.dma_offset >= SZ_4G);
> > >
> > > What is this check for exactly? Why 4G, not >= 0, for example?
> >
> > I am not really sure, you suggested adding it here:
> > http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200716071658.467820-6-leobras.c@gmail.com/#2488874
>
> Ah right I did suggest this :) My bad. I think I suggested it before
> suggesting to keep the reserved area boundaries checked/adjusted to the
> window boundaries, may as well drop this. Thanks,
>
>
> > I can remove it if it's ok.
> >
> > > > + goto out_unlock;
> > > > +
> > > > }
> > > >
> > > > dev->dev.archdata.dma_offset = win_addr;
> > > > @@ -1340,7 +1400,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> > > >
> > > > out_unlock:
> > > > mutex_unlock(&direct_window_init_mutex);
> > > > - return win64;
> > > > + return win64 && maps_whole_partition;
> > > > }
> > > >
> > > > static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
> > > >
^ permalink raw reply
* Re: [PATCH 10/10] powerpc: remove address space overrides using set_fs()
From: Christophe Leroy @ 2020-09-02 6:15 UTC (permalink / raw)
To: Christoph Hellwig, Linus Torvalds, Al Viro, Michael Ellerman, x86
Cc: linux-fsdevel, linux-arch, linuxppc-dev, Kees Cook, linux-kernel
In-Reply-To: <20200827150030.282762-11-hch@lst.de>
Le 27/08/2020 à 17:00, Christoph Hellwig a écrit :
> Stop providing the possibility to override the address space using
> set_fs() now that there is no need for that any more.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> arch/powerpc/Kconfig | 1 -
> arch/powerpc/include/asm/processor.h | 7 ---
> arch/powerpc/include/asm/thread_info.h | 5 +--
> arch/powerpc/include/asm/uaccess.h | 62 ++++++++------------------
> arch/powerpc/kernel/signal.c | 3 --
> arch/powerpc/lib/sstep.c | 6 +--
> 6 files changed, 22 insertions(+), 62 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
> index 7fe3531ad36a77..39727537d39701 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -8,62 +8,36 @@
> #include <asm/extable.h>
> #include <asm/kup.h>
>
> -/*
> - * The fs value determines whether argument validity checking should be
> - * performed or not. If get_fs() == USER_DS, checking is performed, with
> - * get_fs() == KERNEL_DS, checking is bypassed.
> - *
> - * For historical reasons, these macros are grossly misnamed.
> - *
> - * The fs/ds values are now the highest legal address in the "segment".
> - * This simplifies the checking in the routines below.
> - */
> -
> -#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
> -
> -#define KERNEL_DS MAKE_MM_SEG(~0UL)
> #ifdef __powerpc64__
> /* We use TASK_SIZE_USER64 as TASK_SIZE is not constant */
> -#define USER_DS MAKE_MM_SEG(TASK_SIZE_USER64 - 1)
> -#else
> -#define USER_DS MAKE_MM_SEG(TASK_SIZE - 1)
> -#endif
> -
> -#define get_fs() (current->thread.addr_limit)
> +#define TASK_SIZE_MAX TASK_SIZE_USER64
>
> -static inline void set_fs(mm_segment_t fs)
> +static inline bool __access_ok(unsigned long addr, unsigned long size)
> {
> - current->thread.addr_limit = fs;
> - /* On user-mode return check addr_limit (fs) is correct */
> - set_thread_flag(TIF_FSCHECK);
> + if (addr >= TASK_SIZE_MAX)
> + return false;
> + /*
> + * This check is sufficient because there is a large enough gap between
> + * user addresses and the kernel addresses.
> + */
> + return size <= TASK_SIZE_MAX;
> }
> -
> -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg)
> -#define user_addr_max() (get_fs().seg)
> -
> -#ifdef __powerpc64__
> -/*
> - * This check is sufficient because there is a large enough
> - * gap between user addresses and the kernel addresses
> - */
> -#define __access_ok(addr, size, segment) \
> - (((addr) <= (segment).seg) && ((size) <= (segment).seg))
> -
> #else
> +#define TASK_SIZE_MAX TASK_SIZE
>
> -static inline int __access_ok(unsigned long addr, unsigned long size,
> - mm_segment_t seg)
> +static inline bool __access_ok(unsigned long addr, unsigned long size)
> {
> - if (addr > seg.seg)
> - return 0;
> - return (size == 0 || size - 1 <= seg.seg - addr);
> + if (addr >= TASK_SIZE_MAX)
> + return false;
> + if (size == 0)
> + return false;
__access_ok() was returning true when size == 0 up to now. Any reason to
return false now ?
> + return size <= TASK_SIZE_MAX - addr;
> }
> -
> -#endif
> +#endif /* __powerpc64__ */
>
> #define access_ok(addr, size) \
> (__chk_user_ptr(addr), \
> - __access_ok((__force unsigned long)(addr), (size), get_fs()))
> + __access_ok((unsigned long)(addr), (size)))
>
> /*
> * These are the main single-value transfer routines. They automatically
Christophe
^ permalink raw reply
* Re: [RFC PATCH 1/2] KVM: PPC: Use the ppc_inst type
From: Paul Mackerras @ 2020-09-02 6:13 UTC (permalink / raw)
To: Jordan Niethe; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20200820033922.32311-1-jniethe5@gmail.com>
On Thu, Aug 20, 2020 at 01:39:21PM +1000, Jordan Niethe wrote:
> The ppc_inst type was added to help cope with the addition of prefixed
> instructions to the ISA. Convert KVM to use this new type for dealing
> wiht instructions. For now do not try to add further support for
> prefixed instructions.
This change does seem to splatter itself across a lot of code that
mostly or exclusively runs on machines which are not POWER10 and will
never need to handle prefixed instructions, unfortunately. I wonder
if there is a less invasive way to approach this.
In particular we are inflicting this 64-bit struct on 32-bit platforms
unnecessarily (I assume, correct me if I am wrong here).
How would it be to do something like:
typedef unsigned long ppc_inst_t;
so it is 32 bits on 32-bit platforms and 64 bits on 64-bit platforms,
and then use that instead of 'struct ppc_inst'? You would still need
to change the function declarations but I think most of the function
bodies would not need to be changed. In particular you would avoid a
lot of the churn related to having to add ppc_inst_val() and suchlike.
> -static inline unsigned make_dsisr(unsigned instr)
> +static inline unsigned make_dsisr(struct ppc_inst instr)
> {
> unsigned dsisr;
> + u32 word = ppc_inst_val(instr);
>
>
> /* bits 6:15 --> 22:31 */
> - dsisr = (instr & 0x03ff0000) >> 16;
> + dsisr = (word & 0x03ff0000) >> 16;
>
> if (IS_XFORM(instr)) {
> /* bits 29:30 --> 15:16 */
> - dsisr |= (instr & 0x00000006) << 14;
> + dsisr |= (word & 0x00000006) << 14;
> /* bit 25 --> 17 */
> - dsisr |= (instr & 0x00000040) << 8;
> + dsisr |= (word & 0x00000040) << 8;
> /* bits 21:24 --> 18:21 */
> - dsisr |= (instr & 0x00000780) << 3;
> + dsisr |= (word & 0x00000780) << 3;
> } else {
> /* bit 5 --> 17 */
> - dsisr |= (instr & 0x04000000) >> 12;
> + dsisr |= (word & 0x04000000) >> 12;
> /* bits 1: 4 --> 18:21 */
> - dsisr |= (instr & 0x78000000) >> 17;
> + dsisr |= (word & 0x78000000) >> 17;
> /* bits 30:31 --> 12:13 */
> if (IS_DSFORM(instr))
> - dsisr |= (instr & 0x00000003) << 18;
> + dsisr |= (word & 0x00000003) << 18;
Here I would have done something like:
> -static inline unsigned make_dsisr(unsigned instr)
> +static inline unsigned make_dsisr(struct ppc_inst pi)
> {
> unsigned dsisr;
> + u32 instr = ppc_inst_val(pi);
and left the rest of the function unchanged.
At first I wondered why we still had that function, since IBM Power
CPUs have not set DSISR on an alignment interrupt since POWER3 days.
It turns out it this function is used by PR KVM when it is emulating
one of the old 32-bit PowerPC CPUs (601, 603, 604, 750, 7450 etc.).
> diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
Despite the file name, this code is not used on IBM Power servers.
It is for platforms which run under an ePAPR (not server PAPR)
hypervisor (which would be a KVM variant, but generally book E KVM not
book 3S).
Paul.
^ permalink raw reply
* Re: [RFC PATCH 2/2] KVM: PPC: Book3S HV: Support prefixed instructions
From: Paul Mackerras @ 2020-09-02 6:18 UTC (permalink / raw)
To: Jordan Niethe; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20200820033922.32311-2-jniethe5@gmail.com>
On Thu, Aug 20, 2020 at 01:39:22PM +1000, Jordan Niethe wrote:
> There are two main places where instructions are loaded from the guest:
> * Emulate loadstore - such as when performing MMIO emulation
> triggered by an HDSI
> * After an HV emulation assistance interrupt (e40)
>
> If it is a prefixed instruction that triggers these cases, its suffix
> must be loaded. Use the SRR1_PREFIX bit to decide if a suffix needs to
> be loaded. Make sure if this bit is set inject_interrupt() also sets it
> when giving an interrupt to the guest.
>
> ISA v3.10 extends the Hypervisor Emulation Instruction Register (HEIR)
> to 64 bits long to accommodate prefixed instructions. For interrupts
> caused by a word instruction the instruction is loaded into bits 32:63
> and bits 0:31 are zeroed. When caused by a prefixed instruction the
> prefix and suffix are loaded into bits 0:63.
>
> Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
> ---
> arch/powerpc/kvm/book3s.c | 15 +++++++++++++--
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 10 +++++++---
> arch/powerpc/kvm/book3s_hv_builtin.c | 3 +++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 ++++++++++++++
> 4 files changed, 37 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 70d8967acc9b..18b1928a571b 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -456,13 +456,24 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu,
> {
> ulong pc = kvmppc_get_pc(vcpu);
> u32 word;
> + u64 doubleword;
> int r;
>
> if (type == INST_SC)
> pc -= 4;
>
> - r = kvmppc_ld(vcpu, &pc, sizeof(u32), &word, false);
> - *inst = ppc_inst(word);
> + if ((kvmppc_get_msr(vcpu) & SRR1_PREFIXED)) {
> + r = kvmppc_ld(vcpu, &pc, sizeof(u64), &doubleword, false);
Should we also have a check here that the doubleword is not crossing a
page boundary? I can't think of a way to get this code to cross a
page boundary, assuming the hardware is working correctly, but it
makes me just a little nervous.
> +#ifdef CONFIG_CPU_LITTLE_ENDIAN
> + *inst = ppc_inst_prefix(doubleword & 0xffffffff, doubleword >> 32);
> +#else
> + *inst = ppc_inst_prefix(doubleword >> 32, doubleword & 0xffffffff);
> +#endif
Ick. Is there a cleaner way to do this?
> + } else {
> + r = kvmppc_ld(vcpu, &pc, sizeof(u32), &word, false);
> + *inst = ppc_inst(word);
> + }
> +
> if (r == EMULATE_DONE)
> return r;
> else
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 775ce41738ce..0802471f4856 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -411,9 +411,13 @@ static int instruction_is_store(struct ppc_inst instr)
> unsigned int mask;
>
> mask = 0x10000000;
> - if ((ppc_inst_val(instr) & 0xfc000000) == 0x7c000000)
> - mask = 0x100; /* major opcode 31 */
> - return (ppc_inst_val(instr) & mask) != 0;
> + if (ppc_inst_prefixed(instr)) {
> + return (ppc_inst_suffix(instr) & mask) != 0;
> + } else {
> + if ((ppc_inst_val(instr) & 0xfc000000) == 0x7c000000)
> + mask = 0x100; /* major opcode 31 */
> + return (ppc_inst_val(instr) & mask) != 0;
> + }
The way the code worked before, the mask depended on whether the
instruction was a D-form (or DS-form or other variant) instruction,
where you can tell loads and stores apart by looking at the major
opcode, or an X-form instruction, where you look at the minor opcode.
Now we are only looking at the minor opcode if it is not a prefixed
instruction. Are there no X-form prefixed loads or stores?
Paul.
^ permalink raw reply
* Re: [PATCH 2/2] powerpc/vdso32: link vdso64 with linker
From: Christophe Leroy @ 2020-09-02 6:46 UTC (permalink / raw)
To: Nick Desaulniers, Michael Ellerman, Nicholas Piggin
Cc: Christophe Leroy, Joe Lawrence, Kees Cook, Fangrui Song,
linux-kernel, clang-built-linux, Paul Mackerras, linuxppc-dev
In-Reply-To: <20200901222523.1941988-3-ndesaulniers@google.com>
On 09/01/2020 10:25 PM, Nick Desaulniers wrote:
> Rather than invoke the compiler as the driver, use the linker. That way
> we can check --orphan-handling=warn support correctly, as cc-ldoption
> was removed in
> commit 055efab3120b ("kbuild: drop support for cc-ldoption").
>
> Requires dropping the .got section. I couldn't find how it was used in
> the vdso32.
ld crashes:
LD arch/powerpc/kernel/vdso32/vdso32.so.dbg
/bin/sh: line 1: 23780 Segmentation fault (core dumped)
ppc-linux-ld -EB -m elf32ppc -shared -soname linux-vdso32.so.1
--eh-frame-hdr --orphan-handling=warn -T
arch/powerpc/kernel/vdso32/vdso32.lds
arch/powerpc/kernel/vdso32/sigtramp.o
arch/powerpc/kernel/vdso32/gettimeofday.o
arch/powerpc/kernel/vdso32/datapage.o
arch/powerpc/kernel/vdso32/cacheflush.o
arch/powerpc/kernel/vdso32/note.o arch/powerpc/kernel/vdso32/getcpu.o -o
arch/powerpc/kernel/vdso32/vdso32.so.dbg
make[4]: *** [arch/powerpc/kernel/vdso32/vdso32.so.dbg] Error 139
[root@localhost linux-powerpc]# ppc-linux-ld --version
GNU ld (GNU Binutils) 2.26.20160125
Christophe
>
> Fixes: commit f2af201002a8 ("powerpc/build: vdso linker warning for orphan sections")
> Link: https://lore.kernel.org/lkml/CAKwvOdnn3wxYdJomvnveyD_njwRku3fABWT_bS92duihhywLJQ@mail.gmail.com/
> Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> ---
> Not sure removing .got is a good idea or not. Otherwise I observe the
> following link error:
> powerpc-linux-gnu-ld: warning: orphan section `.got' from `arch/powerpc/kernel/vdso32/sigtramp.o' being placed in section `.got'
> powerpc-linux-gnu-ld: _GLOBAL_OFFSET_TABLE_ not defined in linker created .got
> powerpc-linux-gnu-ld: final link failed: bad value
>
> sigtramp.c doesn't mention anything from the GOT AFAICT, and doesn't
> look like it contains relocations that do, so I'm not sure where
> references to _GLOBAL_OFFSET_TABLE_ are coming from.
>
> arch/powerpc/kernel/vdso32/Makefile | 7 +++++--
> arch/powerpc/kernel/vdso32/vdso32.lds.S | 3 ++-
> 2 files changed, 7 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
> index 87ab1152d5ce..611a5951945a 100644
> --- a/arch/powerpc/kernel/vdso32/Makefile
> +++ b/arch/powerpc/kernel/vdso32/Makefile
> @@ -27,6 +27,9 @@ UBSAN_SANITIZE := n
> ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
> -Wl,-soname=linux-vdso32.so.1 -Wl,--hash-style=both
> asflags-y := -D__VDSO32__ -s
> +ldflags-y := -shared -soname linux-vdso32.so.1 \
> + $(call ld-option, --eh-frame-hdr) \
> + $(call ld-option, --orphan-handling=warn) -T
>
> obj-y += vdso32_wrapper.o
> extra-y += vdso32.lds
> @@ -49,8 +52,8 @@ $(obj-vdso32): %.o: %.S FORCE
> $(call if_changed_dep,vdso32as)
>
> # actual build commands
> -quiet_cmd_vdso32ld = VDSO32L $@
> - cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
> +quiet_cmd_vdso32ld = LD $@
> + cmd_vdso32ld = $(cmd_ld)
> quiet_cmd_vdso32as = VDSO32A $@
> cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $<
>
> diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> index 4c985467a668..0ccdebad18b8 100644
> --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
> +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
> @@ -61,7 +61,6 @@ SECTIONS
> .fixup : { *(.fixup) }
>
> .dynamic : { *(.dynamic) } :text :dynamic
> - .got : { *(.got) } :text
> .plt : { *(.plt) }
>
> _end = .;
> @@ -108,7 +107,9 @@ SECTIONS
> .debug_varnames 0 : { *(.debug_varnames) }
>
> /DISCARD/ : {
> + *(.got)
> *(.note.GNU-stack)
> + *(.branch_lt)
> *(.data .data.* .gnu.linkonce.d.* .sdata*)
> *(.bss .sbss .dynbss .dynsbss)
> *(.glink .iplt .plt .rela*)
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox