* [PATCH v6 1/5] KVM: PPC: Book3S HV: Fix function definition in book3s_hv_uvmem.c
From: Ram Pai @ 2020-07-27 18:07 UTC (permalink / raw)
To: kvm-ppc, linuxppc-dev
Cc: ldufour, linuxram, cclaudio, bharata, sathnaga, aneesh.kumar,
sukadev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>
Without this fix, git is confused. It generates wrong
function context for code changes in subsequent patches.
Weird, but true.
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Bharata B Rao <bharata@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Claudio Carvalho <cclaudio@linux.ibm.com>
Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
arch/powerpc/kvm/book3s_hv_uvmem.c | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 09d8119..e6f76bc 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -382,8 +382,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
* Alloc a PFN from private device memory pool and copy page from normal
* memory to secure memory using UV_PAGE_IN uvcall.
*/
-static int
-kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
+static int kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
unsigned long end, unsigned long gpa, struct kvm *kvm,
unsigned long page_shift, bool *downgrade)
{
@@ -450,8 +449,8 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
* In the former case, uses dev_pagemap_ops.migrate_to_ram handler
* to unmap the device page from QEMU's page tables.
*/
-static unsigned long
-kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift)
+static unsigned long kvmppc_share_page(struct kvm *kvm, unsigned long gpa,
+ unsigned long page_shift)
{
int ret = H_PARAMETER;
@@ -500,9 +499,9 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
* H_PAGE_IN_SHARED flag makes the page shared which means that the same
* memory in is visible from both UV and HV.
*/
-unsigned long
-kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
- unsigned long flags, unsigned long page_shift)
+unsigned long kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa,
+ unsigned long flags,
+ unsigned long page_shift)
{
bool downgrade = false;
unsigned long start, end;
@@ -559,10 +558,10 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm)
* Provision a new page on HV side and copy over the contents
* from secure memory using UV_PAGE_OUT uvcall.
*/
-static int
-kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long page_shift,
- struct kvm *kvm, unsigned long gpa)
+static int kvmppc_svm_page_out(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end, unsigned long page_shift,
+ struct kvm *kvm, unsigned long gpa)
{
unsigned long src_pfn, dst_pfn = 0;
struct migrate_vma mig;
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Athira Rajeev @ 2020-07-27 17:21 UTC (permalink / raw)
To: Michael Ellerman; +Cc: Aneesh Kumar K . V, linuxppc-dev, Nicholas Piggin
In-Reply-To: <87k0yp6sqh.fsf@mpe.ellerman.id.au>
> On 27-Jul-2020, at 6:05 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>>> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>> caller") removed the local_irq_disable from hash_preload, but it was
>>> required for more than just the page table walk: the hash pte busy bit is
>>> effectively a lock which may be taken in interrupt context, and the local
>>> update flag test must not be preempted before it's used.
>>>
>>> This solves apparent lockups with perf interrupting __hash_page_64K. If
>>> get_perf_callchain then also takes a hash fault on the same page while it
>>> is already locked, it will loop forever taking hash faults, which looks like
>>> this:
>>>
>>> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
>>> pc: c000000000072dc8: hash_page_mm+0x8/0x800
>>> lr: c00000000000c5a4: do_hash_page+0x24/0x38
>>> sp: c0002ac1cc69ac70
>>> msr: 8000000000081033
>>> current = 0xc0002ac1cc602e00
>>> paca = 0xc00000001de1f280 irqmask: 0x03 irq_happened: 0x01
>>> pid = 20118, comm = pread2_processe
>>> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
>>> 49e:mon> t
>>> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
>>> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
>>> [link register ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
>>> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
>>> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
>>> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
>>> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
>>> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
>>> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
>>> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
>>> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
>>> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
>>> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
>>> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
>>> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
>>> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
>>> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
>>> [link register ] c0000000000846f0 __hash_page_64K+0x210/0x540
>>> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
>>> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
>>> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
>>> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
>>> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
>>> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
>>> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
>>> --- Exception: 300 (Data Access) at 00007fff8c861fe8
>>> SP (7ffff6b19660) is in userspace
>>>
>>> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>> caller")
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>
>>
>> Hi,
>>
>> Tested with the patch and it fixes the lockups I was seeing with my test run.
>> Thanks for the fix.
>>
>> Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>
> Thanks for testing.
>
> What test are you running?
Hi Michael
I was running “perf record” and Unixbench tests ( https://github.com/kdlucas/byte-unixbench ) in parallel where we were getting soft lockups
1. Perf command run:
# perf record -a -g -c 10000000 -o <data_file> sleep 60
2. Unixbench tests
# Run -q -c <nr_threads> spawn
Wtth the fix, perf completes successfully.
Thanks
Athira
>
> cheers
^ permalink raw reply
* [PATCH V5 4/4] tools/perf: Add perf tools support for extended regs in power10
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>
Added support for supported regs which are new in power10
( MMCR3, SIER2, SIER3 ) to sample_reg_mask in the tool side
to use with `-I?` option. Also added PVR check to send extended
mask for power10 at kernel while capturing extended regs in
each sample.
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
tools/arch/powerpc/include/uapi/asm/perf_regs.h | 6 ++++++
tools/perf/arch/powerpc/include/perf_regs.h | 3 +++
tools/perf/arch/powerpc/util/perf_regs.c | 6 ++++++
3 files changed, 15 insertions(+)
diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index 225c64c..bdf5f10 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -52,6 +52,9 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_MMCR0,
PERF_REG_POWERPC_MMCR1,
PERF_REG_POWERPC_MMCR2,
+ PERF_REG_POWERPC_MMCR3,
+ PERF_REG_POWERPC_SIER2,
+ PERF_REG_POWERPC_SIER3,
/* Max regs without the extended regs */
PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
};
@@ -60,6 +63,9 @@ enum perf_event_powerpc_regs {
/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
+#define PERF_REG_PMU_MASK_31 (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
#define PERF_REG_MAX_ISA_300 (PERF_REG_POWERPC_MMCR2 + 1)
+#define PERF_REG_MAX_ISA_31 (PERF_REG_POWERPC_SIER3 + 1)
#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index 46ed00d..63f3ac9 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -68,6 +68,9 @@
[PERF_REG_POWERPC_MMCR0] = "mmcr0",
[PERF_REG_POWERPC_MMCR1] = "mmcr1",
[PERF_REG_POWERPC_MMCR2] = "mmcr2",
+ [PERF_REG_POWERPC_MMCR3] = "mmcr3",
+ [PERF_REG_POWERPC_SIER2] = "sier2",
+ [PERF_REG_POWERPC_SIER3] = "sier3",
};
static inline const char *perf_reg_name(int id)
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index d64ba0c..2b6d470 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -14,6 +14,7 @@
#include <linux/kernel.h>
#define PVR_POWER9 0x004E
+#define PVR_POWER10 0x0080
const struct sample_reg sample_reg_masks[] = {
SMPL_REG(r0, PERF_REG_POWERPC_R0),
@@ -64,6 +65,9 @@
SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
+ SMPL_REG(mmcr3, PERF_REG_POWERPC_MMCR3),
+ SMPL_REG(sier2, PERF_REG_POWERPC_SIER2),
+ SMPL_REG(sier3, PERF_REG_POWERPC_SIER3),
SMPL_REG_END
};
@@ -194,6 +198,8 @@ uint64_t arch__intr_reg_mask(void)
version = (((mfspr(SPRN_PVR)) >> 16) & 0xFFFF);
if (version == PVR_POWER9)
extended_mask = PERF_REG_PMU_MASK_300;
+ else if (version == PVR_POWER10)
+ extended_mask = PERF_REG_PMU_MASK_31;
else
return mask;
--
1.8.3.1
^ permalink raw reply related
* [PATCH V5 3/4] tools/perf: Add perf tools support for extended register capability in powerpc
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Add extended regs to sample_reg_mask in the tool side to use
with `-I?` option. Perf tools side uses extended mask to display
the platform supported register names (with -I? option) to the user
and also send this mask to the kernel to capture the extended registers
in each sample. Hence decide the mask value based on the processor
version.
Currently definitions for `mfspr`, `SPRN_PVR` are part of
`arch/powerpc/util/header.c`. Move this to a header file so that
these definitions can be re-used in other source files as well.
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Decide extended mask at run time based on platform]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++++++-
tools/perf/arch/powerpc/include/perf_regs.h | 5 ++-
| 9 +----
tools/perf/arch/powerpc/util/perf_regs.c | 49 +++++++++++++++++++++++++
| 15 ++++++++
5 files changed, 82 insertions(+), 10 deletions(-)
create mode 100644 tools/perf/arch/powerpc/util/utils_header.h
diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..225c64c 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
- PERF_REG_POWERPC_MAX,
+ /* Extended registers */
+ PERF_REG_POWERPC_MMCR0,
+ PERF_REG_POWERPC_MMCR1,
+ PERF_REG_POWERPC_MMCR2,
+ /* Max regs without the extended regs */
+ PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
};
+
+#define PERF_REG_PMU_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+
+#define PERF_REG_MAX_ISA_300 (PERF_REG_POWERPC_MMCR2 + 1)
#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h
index e18a355..46ed00d 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -64,7 +64,10 @@
[PERF_REG_POWERPC_DAR] = "dar",
[PERF_REG_POWERPC_DSISR] = "dsisr",
[PERF_REG_POWERPC_SIER] = "sier",
- [PERF_REG_POWERPC_MMCRA] = "mmcra"
+ [PERF_REG_POWERPC_MMCRA] = "mmcra",
+ [PERF_REG_POWERPC_MMCR0] = "mmcr0",
+ [PERF_REG_POWERPC_MMCR1] = "mmcr1",
+ [PERF_REG_POWERPC_MMCR2] = "mmcr2",
};
static inline const char *perf_reg_name(int id)
--git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
index d487007..1a95017 100644
--- a/tools/perf/arch/powerpc/util/header.c
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -7,17 +7,10 @@
#include <string.h>
#include <linux/stringify.h>
#include "header.h"
+#include "utils_header.h"
#include "metricgroup.h"
#include <api/fs/fs.h>
-#define mfspr(rn) ({unsigned long rval; \
- asm volatile("mfspr %0," __stringify(rn) \
- : "=r" (rval)); rval; })
-
-#define SPRN_PVR 0x11F /* Processor Version Register */
-#define PVR_VER(pvr) (((pvr) >> 16) & 0xFFFF) /* Version field */
-#define PVR_REV(pvr) (((pvr) >> 0) & 0xFFFF) /* Revison field */
-
int
get_cpuid(char *buffer, size_t sz)
{
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index 0a52429..d64ba0c 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -6,9 +6,15 @@
#include "../../../util/perf_regs.h"
#include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/header.h"
+#include "../../../perf-sys.h"
+#include "utils_header.h"
#include <linux/kernel.h>
+#define PVR_POWER9 0x004E
+
const struct sample_reg sample_reg_masks[] = {
SMPL_REG(r0, PERF_REG_POWERPC_R0),
SMPL_REG(r1, PERF_REG_POWERPC_R1),
@@ -55,6 +61,9 @@
SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
SMPL_REG(sier, PERF_REG_POWERPC_SIER),
SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
+ SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
+ SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
+ SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
SMPL_REG_END
};
@@ -163,3 +172,43 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)
return SDT_ARG_VALID;
}
+
+uint64_t arch__intr_reg_mask(void)
+{
+ struct perf_event_attr attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .sample_type = PERF_SAMPLE_REGS_INTR,
+ .precise_ip = 1,
+ .disabled = 1,
+ .exclude_kernel = 1,
+ };
+ int fd;
+ u32 version;
+ u64 extended_mask = 0, mask = PERF_REGS_MASK;
+
+ /*
+ * Get the PVR value to set the extended
+ * mask specific to platform.
+ */
+ version = (((mfspr(SPRN_PVR)) >> 16) & 0xFFFF);
+ if (version == PVR_POWER9)
+ extended_mask = PERF_REG_PMU_MASK_300;
+ else
+ return mask;
+
+ attr.sample_regs_intr = extended_mask;
+ attr.sample_period = 1;
+ event_attr_init(&attr);
+
+ /*
+ * check if the pmu supports perf extended regs, before
+ * returning the register mask to sample.
+ */
+ fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+ if (fd != -1) {
+ close(fd);
+ mask |= extended_mask;
+ }
+ return mask;
+}
--git a/tools/perf/arch/powerpc/util/utils_header.h b/tools/perf/arch/powerpc/util/utils_header.h
new file mode 100644
index 0000000..5788eb1
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/utils_header.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PERF_UTIL_HEADER_H
+#define __PERF_UTIL_HEADER_H
+
+#include <linux/stringify.h>
+
+#define mfspr(rn) ({unsigned long rval; \
+ asm volatile("mfspr %0," __stringify(rn) \
+ : "=r" (rval)); rval; })
+
+#define SPRN_PVR 0x11F /* Processor Version Register */
+#define PVR_VER(pvr) (((pvr) >> 16) & 0xFFFF) /* Version field */
+#define PVR_REV(pvr) (((pvr) >> 0) & 0xFFFF) /* Revison field */
+
+#endif /* __PERF_UTIL_HEADER_H */
--
1.8.3.1
^ permalink raw reply related
* [PATCH V5 2/4] powerpc/perf: Add extended regs support for power10 platform
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>
Include capability flag `PERF_PMU_CAP_EXTENDED_REGS` for power10
and expose MMCR3, SIER2, SIER3 registers as part of extended regs.
Also introduce `PERF_REG_PMU_MASK_31` to define extended mask
value at runtime for power10
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
[Fix build failure on PPC32 platform]
Suggested-by: Ryan Grimm <grimm@linux.ibm.com>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/include/uapi/asm/perf_regs.h | 6 ++++++
arch/powerpc/perf/perf_regs.c | 12 +++++++++++-
arch/powerpc/perf/power10-pmu.c | 6 ++++++
3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index 225c64c..bdf5f10 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -52,6 +52,9 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_MMCR0,
PERF_REG_POWERPC_MMCR1,
PERF_REG_POWERPC_MMCR2,
+ PERF_REG_POWERPC_MMCR3,
+ PERF_REG_POWERPC_SIER2,
+ PERF_REG_POWERPC_SIER3,
/* Max regs without the extended regs */
PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
};
@@ -60,6 +63,9 @@ enum perf_event_powerpc_regs {
/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_31 */
+#define PERF_REG_PMU_MASK_31 (((1ULL << (PERF_REG_POWERPC_SIER3 + 1)) - 1) - PERF_REG_PMU_MASK)
#define PERF_REG_MAX_ISA_300 (PERF_REG_POWERPC_MMCR2 + 1)
+#define PERF_REG_MAX_ISA_31 (PERF_REG_POWERPC_SIER3 + 1)
#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index 9301e68..8e53f2f 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -81,6 +81,14 @@ static u64 get_ext_regs_value(int idx)
return mfspr(SPRN_MMCR1);
case PERF_REG_POWERPC_MMCR2:
return mfspr(SPRN_MMCR2);
+#ifdef CONFIG_PPC64
+ case PERF_REG_POWERPC_MMCR3:
+ return mfspr(SPRN_MMCR3);
+ case PERF_REG_POWERPC_SIER2:
+ return mfspr(SPRN_SIER2);
+ case PERF_REG_POWERPC_SIER3:
+ return mfspr(SPRN_SIER3);
+#endif
default: return 0;
}
}
@@ -89,7 +97,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
{
u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
- if (cpu_has_feature(CPU_FTR_ARCH_300))
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ perf_reg_extended_max = PERF_REG_MAX_ISA_31;
+ else if (cpu_has_feature(CPU_FTR_ARCH_300))
perf_reg_extended_max = PERF_REG_MAX_ISA_300;
if (idx == PERF_REG_POWERPC_SIER &&
diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c
index f7cff7f..8314865 100644
--- a/arch/powerpc/perf/power10-pmu.c
+++ b/arch/powerpc/perf/power10-pmu.c
@@ -87,6 +87,8 @@
#define POWER10_MMCRA_IFM3 0x00000000C0000000UL
#define POWER10_MMCRA_BHRB_MASK 0x00000000C0000000UL
+extern u64 PERF_REG_EXTENDED_MASK;
+
/* Table of alternatives, sorted by column 0 */
static const unsigned int power10_event_alternatives[][MAX_ALT] = {
{ PM_RUN_CYC_ALT, PM_RUN_CYC },
@@ -397,6 +399,7 @@ static void power10_config_bhrb(u64 pmu_bhrb_filter)
.cache_events = &power10_cache_events,
.attr_groups = power10_pmu_attr_groups,
.bhrb_nr = 32,
+ .capabilities = PERF_PMU_CAP_EXTENDED_REGS,
};
int init_power10_pmu(void)
@@ -408,6 +411,9 @@ int init_power10_pmu(void)
strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power10"))
return -ENODEV;
+ /* Set the PERF_REG_EXTENDED_MASK here */
+ PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_31;
+
rc = register_power_pmu(&power10_pmu);
if (rc)
return rc;
--
1.8.3.1
^ permalink raw reply related
* [PATCH V5 1/4] powerpc/perf: Add support for outputting extended regs in perf intr_regs
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
In-Reply-To: <1595870184-1460-1-git-send-email-atrajeev@linux.vnet.ibm.com>
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Add support for perf extended register capability in powerpc.
The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
PMU which support extended registers. The generic code define the mask
of extended registers as 0 for non supported architectures.
Patch adds extended regs support for power9 platform by
exposing MMCR0, MMCR1 and MMCR2 registers.
REG_RESERVED mask needs update to include extended regs.
`PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers,
is defined at runtime in the kernel based on platform since the supported
registers may differ from one processor version to another and hence the
MASK value.
with patch
----------
available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2
PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
... intr regs: mask 0xffffffffffff ABI 64-bit
.... r0 0xc00000000012b77c
.... r1 0xc000003fe5e03930
.... r2 0xc000000001b0e000
.... r3 0xc000003fdcddf800
.... r4 0xc000003fc7880000
.... r5 0x9c422724be
.... r6 0xc000003fe5e03908
.... r7 0xffffff63bddc8706
.... r8 0x9e4
.... r9 0x0
.... r10 0x1
.... r11 0x0
.... r12 0xc0000000001299c0
.... r13 0xc000003ffffc4800
.... r14 0x0
.... r15 0x7fffdd8b8b00
.... r16 0x0
.... r17 0x7fffdd8be6b8
.... r18 0x7e7076607730
.... r19 0x2f
.... r20 0xc00000001fc26c68
.... r21 0xc0002041e4227e00
.... r22 0xc00000002018fb60
.... r23 0x1
.... r24 0xc000003ffec4d900
.... r25 0x80000000
.... r26 0x0
.... r27 0x1
.... r28 0x1
.... r29 0xc000000001be1260
.... r30 0x6008010
.... r31 0xc000003ffebb7218
.... nip 0xc00000000012b910
.... msr 0x9000000000009033
.... orig_r3 0xc00000000012b86c
.... ctr 0xc0000000001299c0
.... link 0xc00000000012b77c
.... xer 0x0
.... ccr 0x28002222
.... softe 0x1
.... trap 0xf00
.... dar 0x0
.... dsisr 0x80000000000
.... sier 0x0
.... mmcra 0x80000000000
.... mmcr0 0x82008090
.... mmcr1 0x1e000000
.... mmcr2 0x0
... thread: perf:4784
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
[Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ]
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
[Fix build issue using CONFIG_PERF_EVENTS without CONFIG_PPC_PERF_CTRS]
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Tested-by: Nageswara R Sastry <nasastry@in.ibm.com>
Reviewed-and-tested-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
---
arch/powerpc/include/asm/perf_event.h | 3 +++
arch/powerpc/include/asm/perf_event_server.h | 5 ++++
arch/powerpc/include/uapi/asm/perf_regs.h | 14 +++++++++++-
arch/powerpc/perf/core-book3s.c | 1 +
arch/powerpc/perf/perf_regs.c | 34 +++++++++++++++++++++++++---
arch/powerpc/perf/power9-pmu.c | 6 +++++
6 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index 1e8b2e1..daec64d 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -40,4 +40,7 @@
/* To support perf_regs sier update */
extern bool is_sier_available(void);
+/* To define perf extended regs mask value */
+extern u64 PERF_REG_EXTENDED_MASK;
+#define PERF_REG_EXTENDED_MASK PERF_REG_EXTENDED_MASK
#endif
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 86c9eb06..f6acabb 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -62,6 +62,11 @@ struct power_pmu {
int *blacklist_ev;
/* BHRB entries in the PMU */
int bhrb_nr;
+ /*
+ * set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+ * the pmu supports extended perf regs capability
+ */
+ int capabilities;
};
/*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..225c64c 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
- PERF_REG_POWERPC_MAX,
+ /* Extended registers */
+ PERF_REG_POWERPC_MMCR0,
+ PERF_REG_POWERPC_MMCR1,
+ PERF_REG_POWERPC_MMCR2,
+ /* Max regs without the extended regs */
+ PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
};
+
+#define PERF_REG_PMU_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) - PERF_REG_PMU_MASK)
+
+#define PERF_REG_MAX_ISA_300 (PERF_REG_POWERPC_MMCR2 + 1)
#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index e29c846..65a0b76 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2317,6 +2317,7 @@ int register_power_pmu(struct power_pmu *pmu)
pmu->name);
power_pmu.attr_groups = ppmu->attr_groups;
+ power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS);
#ifdef MSR_HV
/*
diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c
index a213a0a..9301e68 100644
--- a/arch/powerpc/perf/perf_regs.c
+++ b/arch/powerpc/perf/perf_regs.c
@@ -13,9 +13,11 @@
#include <asm/ptrace.h>
#include <asm/perf_regs.h>
+u64 PERF_REG_EXTENDED_MASK;
+
#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
-#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1))
+#define REG_RESERVED (~(PERF_REG_EXTENDED_MASK | PERF_REG_PMU_MASK))
static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = {
PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]),
@@ -69,10 +71,26 @@
PT_REGS_OFFSET(PERF_REG_POWERPC_MMCRA, dsisr),
};
+/* Function to return the extended register values */
+static u64 get_ext_regs_value(int idx)
+{
+ switch (idx) {
+ case PERF_REG_POWERPC_MMCR0:
+ return mfspr(SPRN_MMCR0);
+ case PERF_REG_POWERPC_MMCR1:
+ return mfspr(SPRN_MMCR1);
+ case PERF_REG_POWERPC_MMCR2:
+ return mfspr(SPRN_MMCR2);
+ default: return 0;
+ }
+}
+
u64 perf_reg_value(struct pt_regs *regs, int idx)
{
- if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX))
- return 0;
+ u64 perf_reg_extended_max = PERF_REG_POWERPC_MAX;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ perf_reg_extended_max = PERF_REG_MAX_ISA_300;
if (idx == PERF_REG_POWERPC_SIER &&
(IS_ENABLED(CONFIG_FSL_EMB_PERF_EVENT) ||
@@ -85,6 +103,16 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
IS_ENABLED(CONFIG_PPC32)))
return 0;
+ if (idx >= PERF_REG_POWERPC_MAX && idx < perf_reg_extended_max)
+ return get_ext_regs_value(idx);
+
+ /*
+ * If the idx is referring to value beyond the
+ * supported registers, return 0 with a warning
+ */
+ if (WARN_ON_ONCE(idx >= perf_reg_extended_max))
+ return 0;
+
return regs_get_register(regs, pt_regs_offset[idx]);
}
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 05dae38..2a57e93 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -90,6 +90,8 @@ enum {
#define POWER9_MMCRA_IFM3 0x00000000C0000000UL
#define POWER9_MMCRA_BHRB_MASK 0x00000000C0000000UL
+extern u64 PERF_REG_EXTENDED_MASK;
+
/* Nasty Power9 specific hack */
#define PVR_POWER9_CUMULUS 0x00002000
@@ -434,6 +436,7 @@ static void power9_config_bhrb(u64 pmu_bhrb_filter)
.cache_events = &power9_cache_events,
.attr_groups = power9_pmu_attr_groups,
.bhrb_nr = 32,
+ .capabilities = PERF_PMU_CAP_EXTENDED_REGS,
};
int init_power9_pmu(void)
@@ -457,6 +460,9 @@ int init_power9_pmu(void)
}
}
+ /* Set the PERF_REG_EXTENDED_MASK here */
+ PERF_REG_EXTENDED_MASK = PERF_REG_PMU_MASK_300;
+
rc = register_power_pmu(&power9_pmu);
if (rc)
return rc;
--
1.8.3.1
^ permalink raw reply related
* [PATCH V5 0/4] powerpc/perf: Add support for perf extended regs in powerpc
From: Athira Rajeev @ 2020-07-27 17:16 UTC (permalink / raw)
To: mpe, acme, jolsa; +Cc: ravi.bangoria, mikey, maddy, linuxppc-dev, kjain
Patch set to add support for perf extended register capability in
powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to
indicate the PMU which support extended registers. The generic code
define the mask of extended registers as 0 for non supported architectures.
Patches 1 and 2 are the kernel side changes needed to include
base support for extended regs in powerpc and in power10.
Patches 3 and 4 are the perf tools side changes needed to support the
extended registers.
patch 1/4 defines the PERF_PMU_CAP_EXTENDED_REGS mask to output the
values of mmcr0,mmcr1,mmcr2 for POWER9. Defines `PERF_REG_EXTENDED_MASK`
at runtime which contains mask value of the supported registers under
extended regs.
patch 2/4 adds the extended regs support for power10 and exposes
MMCR3, SIER2, SIER3 registers as part of extended regs.
Patch 3/4 and 4/4 adds extended regs to sample_reg_mask in the tool
side to use with `-I?` option for power9 and power10 respectively.
Ravi bangoria found an issue with `perf record -I` while testing the
changes. The same issue is currently being worked on here:
https://lkml.org/lkml/2020/7/19/413 and will be resolved once fix
from Jin Yao is merged.
This patch series is based on powerpc/next
Changelog:
Changes from v4 -> v5
- initialize `perf_reg_extended_max` to work on
all platforms as suggested by Ravi Bangoria
- Added Reviewed-and-Tested-by from Ravi Bangoria
Changes from v3 -> v4
- Split the series and send extended regs as separate patch set here.
Link to previous series :
https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=190462&state=*
Other PMU patches are already merged in powerpc/next.
- Fixed kernel build issue when using config having
CONFIG_PERF_EVENTS set and without CONFIG_PPC_PERF_CTRS
reported by kernel build bot.
- Included Reviewed-by from Kajol Jain.
- Addressed review comments from Ravi Bangoria to initialize `perf_reg_extended_max`
and define it in lowercase since it is local variable.
Anju T Sudhakar (2):
powerpc/perf: Add support for outputting extended regs in perf
intr_regs
tools/perf: Add perf tools support for extended register capability in
powerpc
Athira Rajeev (2):
powerpc/perf: Add extended regs support for power10 platform
tools/perf: Add perf tools support for extended regs in power10
arch/powerpc/include/asm/perf_event.h | 3 ++
arch/powerpc/include/asm/perf_event_server.h | 5 +++
arch/powerpc/include/uapi/asm/perf_regs.h | 20 ++++++++-
arch/powerpc/perf/core-book3s.c | 1 +
arch/powerpc/perf/perf_regs.c | 44 ++++++++++++++++++--
arch/powerpc/perf/power10-pmu.c | 6 +++
arch/powerpc/perf/power9-pmu.c | 6 +++
tools/arch/powerpc/include/uapi/asm/perf_regs.h | 20 ++++++++-
tools/perf/arch/powerpc/include/perf_regs.h | 8 +++-
tools/perf/arch/powerpc/util/header.c | 9 +---
tools/perf/arch/powerpc/util/perf_regs.c | 55 +++++++++++++++++++++++++
tools/perf/arch/powerpc/util/utils_header.h | 15 +++++++
12 files changed, 178 insertions(+), 14 deletions(-)
create mode 100644 tools/perf/arch/powerpc/util/utils_header.h
--
1.8.3.1
^ permalink raw reply
* [PATCH -next] powerpc/powernv/sriov: Remove unused but set variable 'phb'
From: Wei Yongjun @ 2020-07-27 17:11 UTC (permalink / raw)
To: Hulk Robot, Michael Ellerman, Oliver O'Halloran,
Alexey Kardashevskiy
Cc: linuxppc-dev, Wei Yongjun, linux-kernel
Gcc report warning as follows:
arch/powerpc/platforms/powernv/pci-sriov.c:602:25: warning:
variable 'phb' set but not used [-Wunused-but-set-variable]
602 | struct pnv_phb *phb;
| ^~~
This variable is not used, so this commit removing it.
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
---
arch/powerpc/platforms/powernv/pci-sriov.c | 2 --
1 file changed, 2 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
index 8404d8c3901d..7894745fd4f8 100644
--- a/arch/powerpc/platforms/powernv/pci-sriov.c
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -599,10 +599,8 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
static void pnv_pci_sriov_disable(struct pci_dev *pdev)
{
u16 num_vfs, base_pe;
- struct pnv_phb *phb;
struct pnv_iov_data *iov;
- phb = pci_bus_to_pnvhb(pdev->bus);
iov = pnv_iov_get(pdev);
num_vfs = iov->num_vfs;
base_pe = iov->vf_pe_arr[0].pe_number;
^ permalink raw reply related
* Re: [PATCH 0/5] cpuidle-pseries: Parse extended CEDE information for idle.
From: Rafael J. Wysocki @ 2020-07-27 14:14 UTC (permalink / raw)
To: Gautham R. Shenoy
Cc: Nathan Lynch, Michael Neuling, Vaidyanathan Srinivasan, Linux PM,
Linux Kernel Mailing List, Nicholas Piggin, linuxppc-dev
In-Reply-To: <20200707113235.GM14120@in.ibm.com>
On Tue, Jul 7, 2020 at 1:32 PM Gautham R Shenoy <ego@linux.vnet.ibm.com> wrote:
>
> Hi,
>
> On Tue, Jul 07, 2020 at 04:41:34PM +0530, Gautham R. Shenoy wrote:
> > From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> >
> > Hi,
> >
> >
> >
> >
> > Gautham R. Shenoy (5):
> > cpuidle-pseries: Set the latency-hint before entering CEDE
> > cpuidle-pseries: Add function to parse extended CEDE records
> > cpuidle-pseries : Fixup exit latency for CEDE(0)
> > cpuidle-pseries : Include extended CEDE states in cpuidle framework
> > cpuidle-pseries: Block Extended CEDE(1) which adds no additional
> > value.
>
> Forgot to mention that these patches are on top of Nathan's series to
> remove extended CEDE offline and bogus topology update code :
> https://lore.kernel.org/linuxppc-dev/20200612051238.1007764-1-nathanl@linux.ibm.com/
OK, so this is targeted at the powerpc maintainers, isn't it?
^ permalink raw reply
* Re: [PATCH v2 4/5] powerpc/mm: Remove custom stack expansion checking
From: Daniel Axtens @ 2020-07-27 13:48 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel
In-Reply-To: <20200724092528.1578671-4-mpe@ellerman.id.au>
Hi Michael,
I tested v1 of this. I ran the test from the bug with a range of stack
sizes, in a loop, for several hours and didn't see any crashes/signal
delivery failures.
I retested v2 for a few minutes just to be sure, and I ran stress-ng's
stack, stackmmap and bad-altstack stressors to make sure no obvious
kernel bugs were exposed. Nothing crashed.
All tests done on a P8 LE guest under KVM.
On that basis:
Tested-by: Daniel Axtens <dja@axtens.net>
The more I look at this the less qualified I feel to Review it, but
certainly it looks better than my ugly hack from late last year.
Kind regards,
Daniel
> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The logic aims to prevent userspace from doing bad accesses below the
> stack pointer. However as long as the stack is < 1MB in size, we allow
> all accesses without further checks. Adding some debug I see that I
> can do a full kernel build and LTP run, and not a single process has
> used more than 1MB of stack. So for the majority of processes the
> logic never even fires.
>
> We also recently found a nasty bug in this code which could cause
> userspace programs to be killed during signal delivery. It went
> unnoticed presumably because most processes use < 1MB of stack.
>
> The generic mm code has also grown support for stack guard pages since
> this code was originally written, so the most heinous case of the
> stack expanding into other mappings is now handled for us.
>
> Finally although some other arches have special logic in this path,
> from what I can tell none of x86, arm64, arm and s390 impose any extra
> checks other than those in expand_stack().
>
> So drop our complicated logic and like other architectures just let
> the stack expand as long as its within the rlimit.
>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
> arch/powerpc/mm/fault.c | 109 ++--------------------------------------
> 1 file changed, 5 insertions(+), 104 deletions(-)
>
> v2: no change just rebased.
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 3ebb1792e636..925a7231abb3 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -42,39 +42,7 @@
> #include <asm/kup.h>
> #include <asm/inst.h>
>
> -/*
> - * Check whether the instruction inst is a store using
> - * an update addressing form which will update r1.
> - */
> -static bool store_updates_sp(struct ppc_inst inst)
> -{
> - /* check for 1 in the rA field */
> - if (((ppc_inst_val(inst) >> 16) & 0x1f) != 1)
> - return false;
> - /* check major opcode */
> - switch (ppc_inst_primary_opcode(inst)) {
> - case OP_STWU:
> - case OP_STBU:
> - case OP_STHU:
> - case OP_STFSU:
> - case OP_STFDU:
> - return true;
> - case OP_STD: /* std or stdu */
> - return (ppc_inst_val(inst) & 3) == 1;
> - case OP_31:
> - /* check minor opcode */
> - switch ((ppc_inst_val(inst) >> 1) & 0x3ff) {
> - case OP_31_XOP_STDUX:
> - case OP_31_XOP_STWUX:
> - case OP_31_XOP_STBUX:
> - case OP_31_XOP_STHUX:
> - case OP_31_XOP_STFSUX:
> - case OP_31_XOP_STFDUX:
> - return true;
> - }
> - }
> - return false;
> -}
> +
> /*
> * do_page_fault error handling helpers
> */
> @@ -267,57 +235,6 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
> return false;
> }
>
> -// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
> -#define SIGFRAME_MAX_SIZE (4096 + 128)
> -
> -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> - struct vm_area_struct *vma, unsigned int flags,
> - bool *must_retry)
> -{
> - /*
> - * N.B. The POWER/Open ABI allows programs to access up to
> - * 288 bytes below the stack pointer.
> - * The kernel signal delivery code writes a bit over 4KB
> - * below the stack pointer (r1) before decrementing it.
> - * The exec code can write slightly over 640kB to the stack
> - * before setting the user r1. Thus we allow the stack to
> - * expand to 1MB without further checks.
> - */
> - if (address + 0x100000 < vma->vm_end) {
> - struct ppc_inst __user *nip = (struct ppc_inst __user *)regs->nip;
> - /* get user regs even if this fault is in kernel mode */
> - struct pt_regs *uregs = current->thread.regs;
> - if (uregs == NULL)
> - return true;
> -
> - /*
> - * A user-mode access to an address a long way below
> - * the stack pointer is only valid if the instruction
> - * is one which would update the stack pointer to the
> - * address accessed if the instruction completed,
> - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
> - * (or the byte, halfword, float or double forms).
> - *
> - * If we don't check this then any write to the area
> - * between the last mapped region and the stack will
> - * expand the stack rather than segfaulting.
> - */
> - if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
> - return false;
> -
> - if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> - access_ok(nip, sizeof(*nip))) {
> - struct ppc_inst inst;
> -
> - if (!probe_user_read_inst(&inst, nip))
> - return !store_updates_sp(inst);
> - *must_retry = true;
> - }
> - return true;
> - }
> - return false;
> -}
> -
> #ifdef CONFIG_PPC_MEM_KEYS
> static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey,
> struct vm_area_struct *vma)
> @@ -483,7 +400,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
> int is_user = user_mode(regs);
> int is_write = page_fault_is_write(error_code);
> vm_fault_t fault, major = 0;
> - bool must_retry = false;
> bool kprobe_fault = kprobe_page_fault(regs, 11);
>
> if (unlikely(debugger_fault_handler(regs) || kprobe_fault))
> @@ -572,30 +488,15 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
> vma = find_vma(mm, address);
> if (unlikely(!vma))
> return bad_area(regs, address);
> - if (likely(vma->vm_start <= address))
> - goto good_area;
> - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> - return bad_area(regs, address);
>
> - /* The stack is being expanded, check if it's valid */
> - if (unlikely(bad_stack_expansion(regs, address, vma, flags,
> - &must_retry))) {
> - if (!must_retry)
> + if (unlikely(vma->vm_start > address)) {
> + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
> return bad_area(regs, address);
>
> - mmap_read_unlock(mm);
> - if (fault_in_pages_readable((const char __user *)regs->nip,
> - sizeof(unsigned int)))
> - return bad_area_nosemaphore(regs, address);
> - goto retry;
> + if (unlikely(expand_stack(vma, address)))
> + return bad_area(regs, address);
> }
>
> - /* Try to expand it */
> - if (unlikely(expand_stack(vma, address)))
> - return bad_area(regs, address);
> -
> -good_area:
> -
> #ifdef CONFIG_PPC_MEM_KEYS
> if (unlikely(access_pkey_error(is_write, is_exec,
> (error_code & DSISR_KEYFAULT), vma)))
> --
> 2.25.1
^ permalink raw reply
* Re: [PATCH v3 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states
From: Rafael J. Wysocki @ 2020-07-27 13:42 UTC (permalink / raw)
To: Pratik Rajesh Sampat
Cc: Gautham R. Shenoy, pratik.r.sampat, Linux PM, Daniel Lezcano,
Rafael J. Wysocki, linuxppc-dev, Nicholas Piggin, Paul Mackerras,
linux-kselftest, Shuah Khan, srivatsa, Linux Kernel Mailing List
In-Reply-To: <20200721124300.65615-2-psampat@linux.ibm.com>
On Tue, Jul 21, 2020 at 2:43 PM Pratik Rajesh Sampat
<psampat@linux.ibm.com> wrote:
>
> Fire directed smp_call_function_single IPIs from a specified source
> CPU to the specified target CPU to reduce the noise we have to wade
> through in the trace log.
And what's the purpose of it?
> The module is based on the idea written by Srivatsa Bhat and maintained
> by Vaidyanathan Srinivasan internally.
>
> Queue HR timer and measure jitter. Wakeup latency measurement for idle
> states using hrtimer. Echo a value in ns to timer_test_function and
> watch trace. A HRtimer will be queued and when it fires the expected
> wakeup vs actual wakeup is computes and delay printed in ns.
>
> Implemented as a module which utilizes debugfs so that it can be
> integrated with selftests.
>
> To include the module, check option and include as module
> kernel hacking -> Cpuidle latency selftests
>
> [srivatsa.bhat@linux.vnet.ibm.com: Initial implementation in
> cpidle/sysfs]
>
> [svaidy@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
> and fix some of the time calculation]
>
> [ego@linux.vnet.ibm.com: Fix some whitespace and tab errors and
> increase the resolution of IPI wakeup]
>
> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
> ---
> drivers/cpuidle/Makefile | 1 +
> drivers/cpuidle/test-cpuidle_latency.c | 150 +++++++++++++++++++++++++
> lib/Kconfig.debug | 10 ++
> 3 files changed, 161 insertions(+)
> create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
>
> diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
> index f07800cbb43f..2ae05968078c 100644
> --- a/drivers/cpuidle/Makefile
> +++ b/drivers/cpuidle/Makefile
> @@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
> obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o
> obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o
> obj-$(CONFIG_HALTPOLL_CPUIDLE) += cpuidle-haltpoll.o
> +obj-$(CONFIG_IDLE_LATENCY_SELFTEST) += test-cpuidle_latency.o
>
> ##################################################################################
> # ARM SoC drivers
> diff --git a/drivers/cpuidle/test-cpuidle_latency.c b/drivers/cpuidle/test-cpuidle_latency.c
> new file mode 100644
> index 000000000000..61574665e972
> --- /dev/null
> +++ b/drivers/cpuidle/test-cpuidle_latency.c
> @@ -0,0 +1,150 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Module-based API test facility for cpuidle latency using IPIs and timers
I'd like to see a more detailed description of what it does and how it
works here.
> + */
> +
> +#include <linux/debugfs.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +
> +/* IPI based wakeup latencies */
> +struct latency {
> + unsigned int src_cpu;
> + unsigned int dest_cpu;
> + ktime_t time_start;
> + ktime_t time_end;
> + u64 latency_ns;
> +} ipi_wakeup;
> +
> +static void measure_latency(void *info)
> +{
> + struct latency *v;
> + ktime_t time_diff;
> +
> + v = (struct latency *)info;
> + v->time_end = ktime_get();
> + time_diff = ktime_sub(v->time_end, v->time_start);
> + v->latency_ns = ktime_to_ns(time_diff);
> +}
> +
> +void run_smp_call_function_test(unsigned int cpu)
> +{
> + ipi_wakeup.src_cpu = smp_processor_id();
> + ipi_wakeup.dest_cpu = cpu;
> + ipi_wakeup.time_start = ktime_get();
> + smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
> +}
> +
> +/* Timer based wakeup latencies */
> +struct timer_data {
> + unsigned int src_cpu;
> + u64 timeout;
> + ktime_t time_start;
> + ktime_t time_end;
> + struct hrtimer timer;
> + u64 timeout_diff_ns;
> +} timer_wakeup;
> +
> +static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
> +{
> + struct timer_data *w;
> + ktime_t time_diff;
> +
> + w = container_of(hrtimer, struct timer_data, timer);
> + w->time_end = ktime_get();
> +
> + time_diff = ktime_sub(w->time_end, w->time_start);
> + time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
> + w->timeout_diff_ns = ktime_to_ns(time_diff);
> + return HRTIMER_NORESTART;
> +}
> +
> +static void run_timer_test(unsigned int ns)
> +{
> + hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
> + HRTIMER_MODE_REL);
> + timer_wakeup.timer.function = timer_called;
> + timer_wakeup.time_start = ktime_get();
> + timer_wakeup.src_cpu = smp_processor_id();
> + timer_wakeup.timeout = ns;
> +
> + hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
> + HRTIMER_MODE_REL_PINNED);
> +}
> +
> +static struct dentry *dir;
> +
> +static int cpu_read_op(void *data, u64 *value)
> +{
> + *value = ipi_wakeup.dest_cpu;
> + return 0;
> +}
> +
> +static int cpu_write_op(void *data, u64 value)
> +{
> + run_smp_call_function_test(value);
> + return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
> +
> +static int timeout_read_op(void *data, u64 *value)
> +{
> + *value = timer_wakeup.timeout;
> + return 0;
> +}
> +
> +static int timeout_write_op(void *data, u64 value)
> +{
> + run_timer_test(value);
> + return 0;
> +}
> +DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, "%llu\n");
> +
> +static int __init latency_init(void)
> +{
> + struct dentry *temp;
> +
> + dir = debugfs_create_dir("latency_test", 0);
> + if (!dir) {
> + pr_alert("latency_test: failed to create /sys/kernel/debug/latency_test\n");
> + return -1;
> + }
> + temp = debugfs_create_file("ipi_cpu_dest",
> + 0666,
> + dir,
> + NULL,
> + &ipi_ops);
> + if (!temp) {
> + pr_alert("latency_test: failed to create /sys/kernel/debug/ipi_cpu_dest\n");
> + return -1;
> + }
> + debugfs_create_u64("ipi_latency_ns", 0444, dir, &ipi_wakeup.latency_ns);
> + debugfs_create_u32("ipi_cpu_src", 0444, dir, &ipi_wakeup.src_cpu);
> +
> + temp = debugfs_create_file("timeout_expected_ns",
> + 0666,
> + dir,
> + NULL,
> + &timeout_ops);
> + if (!temp) {
> + pr_alert("latency_test: failed to create /sys/kernel/debug/timeout_expected_ns\n");
> + return -1;
> + }
> + debugfs_create_u64("timeout_diff_ns", 0444, dir, &timer_wakeup.timeout_diff_ns);
> + debugfs_create_u32("timeout_cpu_src", 0444, dir, &timer_wakeup.src_cpu);
> + pr_info("Latency Test module loaded\n");
> + return 0;
> +}
> +
> +static void __exit latency_cleanup(void)
> +{
> + pr_info("Cleaning up Latency Test module.\n");
> + debugfs_remove_recursive(dir);
> +}
> +
> +module_init(latency_init);
> +module_exit(latency_cleanup);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("IBM Corporation");
> +MODULE_DESCRIPTION("Measuring idle latency for IPIs and Timers");
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index d74ac0fd6b2d..e2283790245a 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1375,6 +1375,16 @@ config DEBUG_KOBJECT
> If you say Y here, some extra kobject debugging messages will be sent
> to the syslog.
>
> +config IDLE_LATENCY_SELFTEST
> + tristate "Cpuidle latency selftests"
> + depends on CPU_IDLE
> + help
> + This option provides a kernel module that runs tests using the IPI and
> + timers to measure latency.
What latency does it measure?
> +
> + Say M if you want these self tests to build as a module.
> + Say N if you are unsure.
> +
> config DEBUG_KOBJECT_RELEASE
> bool "kobject release debugging"
> depends on DEBUG_OBJECTS_TIMERS
> --
> 2.25.4
>
^ permalink raw reply
* Re: [PATCH] lockdep: Fix TRACE_IRQFLAGS vs NMIs
From: Ingo Molnar @ 2020-07-27 13:17 UTC (permalink / raw)
To: peterz
Cc: linux-arch, linux-sh, jcmvbkbc, Will Deacon, x86, linux-kernel,
npiggin, borntraeger, linuxppc-dev
In-Reply-To: <20200727124852.GK119549@hirez.programming.kicks-ass.net>
* peterz@infradead.org <peterz@infradead.org> wrote:
>
> Prior to commit 859d069ee1dd ("lockdep: Prepare for NMI IRQ state
> tracking") IRQ state tracking was disabled in NMIs due to nmi_enter()
> doing lockdep_off() -- with the obvious requirement that NMI entry
> call nmi_enter() before trace_hardirqs_off().
>
> [ afaict, PowerPC and SH violate this order on their NMI entry ]
>
> However, that commit explicitly changed lockdep_hardirqs_*() to ignore
> lockdep_off() and breaks every architecture that has irq-tracing in
> it's NMI entry that hasn't been fixed up (x86 being the only fixed one
> at this point).
>
> The reason for this change is that by ignoring lockdep_off() we can:
>
> - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*()
> which was going to to give header-recursion issues with the
> seqlock rework.
>
> - allow these lockdep_assert_*() macros to function in NMI context.
>
> Restore the previous state of things and allow an architecture to
> opt-in to the NMI IRQ tracking support, however instead of relying on
> lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so
> over-all entry ordering doesn't need to change.
>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
> arch/x86/Kconfig.debug | 3 +++
> kernel/locking/lockdep.c | 8 +++++++-
> lib/Kconfig.debug | 6 ++++++
> 3 files changed, 16 insertions(+), 1 deletion(-)
Tree management side note: to apply this I've created a new
tip:locking/nmi branch, which is based off the existing NMI vs. IRQ
tracing commits included in locking/core:
ed00495333cc: ("locking/lockdep: Fix TRACE_IRQFLAGS vs. NMIs")
ba1f2b2eaa2a: ("x86/entry: Fix NMI vs IRQ state tracking")
859d069ee1dd: ("lockdep: Prepare for NMI IRQ state tracking")
248591f5d257: ("kcsan: Make KCSAN compatible with new IRQ state tracking")
e1bcad609f5a: ("Merge branch 'tip/x86/entry'")
b037b09b9058: ("x86/entry: Rename idtentry_enter/exit_cond_rcu() to idtentry_enter/exit()")
dcb7fd82c75e: ("Linux 5.8-rc4")
This locking/nmi branch can then be merged into irq/entry (there's a
bunch of conflicts between them), without coupling all of v5.9's
locking changes to Thomas's generic entry work.
Thanks,
Ingo
^ permalink raw reply
* [PATCH] lockdep: Fix TRACE_IRQFLAGS vs NMIs
From: peterz @ 2020-07-27 12:48 UTC (permalink / raw)
To: Ingo Molnar, Will Deacon
Cc: linux-arch, linux-sh, jcmvbkbc, x86, linux-kernel, npiggin,
borntraeger, linuxppc-dev
Prior to commit 859d069ee1dd ("lockdep: Prepare for NMI IRQ state
tracking") IRQ state tracking was disabled in NMIs due to nmi_enter()
doing lockdep_off() -- with the obvious requirement that NMI entry
call nmi_enter() before trace_hardirqs_off().
[ afaict, PowerPC and SH violate this order on their NMI entry ]
However, that commit explicitly changed lockdep_hardirqs_*() to ignore
lockdep_off() and breaks every architecture that has irq-tracing in
it's NMI entry that hasn't been fixed up (x86 being the only fixed one
at this point).
The reason for this change is that by ignoring lockdep_off() we can:
- get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*()
which was going to to give header-recursion issues with the
seqlock rework.
- allow these lockdep_assert_*() macros to function in NMI context.
Restore the previous state of things and allow an architecture to
opt-in to the NMI IRQ tracking support, however instead of relying on
lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so
over-all entry ordering doesn't need to change.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
arch/x86/Kconfig.debug | 3 +++
kernel/locking/lockdep.c | 8 +++++++-
lib/Kconfig.debug | 6 ++++++
3 files changed, 16 insertions(+), 1 deletion(-)
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -3,6 +3,9 @@
config TRACE_IRQFLAGS_SUPPORT
def_bool y
+config TRACE_IRQFLAGS_NMI_SUPPORT
+ def_bool y
+
config EARLY_PRINTK_USB
bool
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigne
* and not rely on hardware state like normal interrupts.
*/
if (unlikely(in_nmi())) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+
/*
* Skip:
* - recursion check, because NMI can hit lockdep;
@@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsign
* they will restore the software state. This ensures the software
* state is consistent inside NMIs as well.
*/
- if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)))
+ if (in_nmi()) {
+ if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+ return;
+ } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
return;
/*
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1325,11 +1325,17 @@ config WW_MUTEX_SELFTEST
endmenu # lock debugging
config TRACE_IRQFLAGS
+ depends on TRACE_IRQFLAGS_SUPPORT
bool
help
Enables hooks to interrupt enabling and disabling for
either tracing or lock debugging.
+config TRACE_IRQFLAGS_NMI
+ def_bool y
+ depends on TRACE_IRQFLAGS
+ depends on TRACE_IRQFLAGS_NMI_SUPPORT
+
config STACKTRACE
bool "Stack backtrace support"
depends on STACKTRACE_SUPPORT
^ permalink raw reply
* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Michael Ellerman @ 2020-07-27 12:35 UTC (permalink / raw)
To: Athira Rajeev, Nicholas Piggin; +Cc: Aneesh Kumar K . V, linuxppc-dev
In-Reply-To: <4925309C-A338-4C0F-90E3-4522643021CB@linux.vnet.ibm.com>
Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>> caller") removed the local_irq_disable from hash_preload, but it was
>> required for more than just the page table walk: the hash pte busy bit is
>> effectively a lock which may be taken in interrupt context, and the local
>> update flag test must not be preempted before it's used.
>>
>> This solves apparent lockups with perf interrupting __hash_page_64K. If
>> get_perf_callchain then also takes a hash fault on the same page while it
>> is already locked, it will loop forever taking hash faults, which looks like
>> this:
>>
>> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
>> pc: c000000000072dc8: hash_page_mm+0x8/0x800
>> lr: c00000000000c5a4: do_hash_page+0x24/0x38
>> sp: c0002ac1cc69ac70
>> msr: 8000000000081033
>> current = 0xc0002ac1cc602e00
>> paca = 0xc00000001de1f280 irqmask: 0x03 irq_happened: 0x01
>> pid = 20118, comm = pread2_processe
>> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
>> 49e:mon> t
>> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
>> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
>> [link register ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
>> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
>> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
>> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
>> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
>> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
>> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
>> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
>> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
>> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
>> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
>> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
>> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
>> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
>> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
>> [link register ] c0000000000846f0 __hash_page_64K+0x210/0x540
>> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
>> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
>> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
>> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
>> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
>> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
>> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
>> --- Exception: 300 (Data Access) at 00007fff8c861fe8
>> SP (7ffff6b19660) is in userspace
>>
>> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>> caller")
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>
>
> Hi,
>
> Tested with the patch and it fixes the lockups I was seeing with my test run.
> Thanks for the fix.
>
> Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Thanks for testing.
What test are you running?
cheers
^ permalink raw reply
* Re: [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
From: Michael Ellerman @ 2020-07-27 12:28 UTC (permalink / raw)
To: Gabriel Paubert; +Cc: linuxppc-dev, linux-kernel, dja
In-Reply-To: <20200727082331.GA2110@lt-gp.iram.es>
Gabriel Paubert <paubert@iram.es> writes:
> On Fri, Jul 24, 2020 at 07:25:25PM +1000, Michael Ellerman wrote:
>> We have powerpc specific logic in our page fault handling to decide if
>> an access to an unmapped address below the stack pointer should expand
>> the stack VMA.
>>
>> The code was originally added in 2004 "ported from 2.4". The rough
>> logic is that the stack is allowed to grow to 1MB with no extra
>> checking. Over 1MB the access must be within 2048 bytes of the stack
>> pointer, or be from a user instruction that updates the stack pointer.
>>
>> The 2048 byte allowance below the stack pointer is there to cover the
>> 288 byte "red zone" as well as the "about 1.5kB" needed by the signal
>> delivery code.
>>
>> Unfortunately since then the signal frame has expanded, and is now
>> 4224 bytes on 64-bit kernels with transactional memory enabled.
>
> Are there really users of transactional memory in the wild?
Not many that I've heard of, but some.
Though anything that does use it needs to be written to fallback to
regular locking if TM is not available anyway.
> Just asking because Power10 removes TM, and Power9 has had some issues
> with it AFAICT.
It varies on different Power9 chip levels. For guests it should work.
> Getting rid of it (if possible) would result in smaller signal frames,
> with simpler signal delivery code (probably slightly faster also).
All the kernel code should be behind CONFIG_PPC_TRANSACTIONAL_MEM.
Deciding to disable that is really a distro decision.
In upstream we tend not to drop support for existing hardware while
people are still using it. But we could make a special case for TM,
because it's quite intrusive. I think we'd wait for a major distro to
ship without TM enabled before we did that though.
cheers
^ permalink raw reply
* Re: [PATCH] powerpc/test_emulate_sstep: Fix build error
From: Michael Ellerman @ 2020-07-27 12:07 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev
In-Reply-To: <20200724004109.1461709-1-mpe@ellerman.id.au>
On Fri, 24 Jul 2020 10:41:09 +1000, Michael Ellerman wrote:
> ppc64_book3e_allmodconfig fails with:
>
> arch/powerpc/lib/test_emulate_step.c: In function 'test_pld':
> arch/powerpc/lib/test_emulate_step.c:113:7: error: implicit declaration of function 'cpu_has_feature'
> 113 | if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
> | ^~~~~~~~~~~~~~~
>
> [...]
Applied to powerpc/next.
[1/1] powerpc/test_emulate_sstep: Fix build error
https://git.kernel.org/powerpc/c/70cc062c47e7851335ff4c44ba9b362174baf7d4
cheers
^ permalink raw reply
* Re: [PATCH] powerpc/sstep: Fix incorrect CONFIG symbol in scv handling
From: Michael Ellerman @ 2020-07-27 12:07 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev; +Cc: christophe.leroy
In-Reply-To: <20200724131609.1640533-1-mpe@ellerman.id.au>
On Fri, 24 Jul 2020 23:16:09 +1000, Michael Ellerman wrote:
> When I "fixed" the ppc64e build in Nick's recent patch, I typoed the
> CONFIG symbol, resulting in one that doesn't exist. Fix it to use the
> correct symbol.
Applied to powerpc/next.
[1/1] powerpc/sstep: Fix incorrect CONFIG symbol in scv handling
https://git.kernel.org/powerpc/c/826b07b190c8ca69ce674f13b4dc9be2bc536fcd
cheers
^ permalink raw reply
* [powerpc:next] BUILD SUCCESS 86052e407e8e1964c81965de25832258875a0e6d
From: kernel test robot @ 2020-07-27 11:48 UTC (permalink / raw)
To: Michael Ellerman; +Cc: linuxppc-dev
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
branch HEAD: 86052e407e8e1964c81965de25832258875a0e6d powerpc/powernv/pci.h: delete duplicated word
elapsed time: 1289m
configs tested: 70
configs skipped: 1
The following configs have been built successfully.
More configs may be tested in the coming days.
arm defconfig
arm64 allyesconfig
arm64 defconfig
arm allyesconfig
arm allmodconfig
i386 allyesconfig
i386 defconfig
ia64 allmodconfig
ia64 defconfig
ia64 allyesconfig
m68k allmodconfig
m68k defconfig
m68k allyesconfig
m68k sun3_defconfig
nios2 defconfig
arc allyesconfig
nds32 allnoconfig
c6x allyesconfig
nios2 allyesconfig
openrisc defconfig
nds32 defconfig
csky defconfig
alpha defconfig
alpha allyesconfig
xtensa allyesconfig
h8300 allyesconfig
arc defconfig
sh allmodconfig
xtensa defconfig
parisc defconfig
s390 allyesconfig
parisc allyesconfig
s390 defconfig
sparc allyesconfig
sparc defconfig
mips allyesconfig
mips allmodconfig
powerpc defconfig
powerpc allyesconfig
powerpc allmodconfig
powerpc allnoconfig
i386 randconfig-a003-20200727
i386 randconfig-a005-20200727
i386 randconfig-a004-20200727
i386 randconfig-a006-20200727
i386 randconfig-a002-20200727
i386 randconfig-a001-20200727
x86_64 randconfig-a005-20200727
x86_64 randconfig-a004-20200727
x86_64 randconfig-a003-20200727
x86_64 randconfig-a006-20200727
x86_64 randconfig-a002-20200727
x86_64 randconfig-a001-20200727
i386 randconfig-a016-20200727
i386 randconfig-a013-20200727
i386 randconfig-a012-20200727
i386 randconfig-a015-20200727
i386 randconfig-a011-20200727
i386 randconfig-a014-20200727
riscv allyesconfig
riscv allnoconfig
riscv defconfig
riscv allmodconfig
sparc64 defconfig
x86_64 rhel-7.6-kselftests
x86_64 rhel-8.3
x86_64 kexec
x86_64 rhel
x86_64 allyesconfig
x86_64 defconfig
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply
* Re: [PATCH v2] powerpc/book3s64/radix: Add kernel command line option to disable radix GTSE
From: Bharata B Rao @ 2020-07-27 11:36 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: linuxppc-dev
In-Reply-To: <20200727085908.420806-1-aneesh.kumar@linux.ibm.com>
On Mon, Jul 27, 2020 at 02:29:08PM +0530, Aneesh Kumar K.V wrote:
> This adds a kernel command line option that can be used to disable GTSE support.
> Disabling GTSE implies kernel will make hcalls to invalidate TLB entries.
>
> This was done so that we can do VM migration between configs that enable/disable
> GTSE support via hypervisor. To migrate a VM from a system that supports
> GTSE to a system that doesn't, we can boot the guest with
> radix_hcall_invalidate=on, thereby forcing the guest to use hcalls for TLB
> invalidates.
>
> The check for hcall availability is done in pSeries_setup_arch so that
> the panic message appears on the console. This should only happen on
> a hypervisor that doesn't force the guest to hash translation even
> though it can't handle the radix GTSE=0 request via CAS. With
> radix_hcall_invalidate=on if the hypervisor doesn't support hcall_rpt_invalidate
> hcall it should force the LPAR to hash translation.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Tested
1. radix_hcall_invalidate=on with KVM implementation of H_RPT_INVALIDATE hcall,
the tlb flush calls get off-loaded to the hcall.
2. radix_hcall_invalidate=on w/o H_RPT_INVALIDATE hcall, the guest kernel
panics as per design.
Tested-by: Bharata B Rao <bharata@linux.ibm.com>
^ permalink raw reply
* Re: [PATCH v3 09/10] powerpc/smp: Create coregroup domain
From: Srikar Dronamraju @ 2020-07-27 11:18 UTC (permalink / raw)
To: Gautham R Shenoy
Cc: Nathan Lynch, Michael Neuling, Peter Zijlstra, LKML,
Nicholas Piggin, Valentin Schneider, Oliver O'Halloran,
Jordan Niethe, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200727043941.GA18303@in.ibm.com>
* Gautham R Shenoy <ego@linux.vnet.ibm.com> [2020-07-27 10:09:41]:
> >
> > static void fixup_topology(void)
> > {
> > + if (!has_coregroup_support())
> > + powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
> > +
> > if (shared_caches) {
> > pr_info("Using shared cache scheduler topology\n");
> > powerpc_topology[bigcore_idx].mask = shared_cache_mask;
>
>
> Suppose we consider a topology which does not have coregroup_support,
> but has shared_caches. In that case, we would want our coregroup
> domain to degenerate.
>
> From the above code, after the fixup, our topology will look as
> follows:
>
> static struct sched_domain_topology_level powerpc_topology[] = {
> { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
> { cpu_bigcore_mask, SD_INIT_NAME(MC) },
> { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> { NULL, },
>
> So, in this case, the core-group domain (identified by MC) will
> degenerate only if cpu_bigcore_mask() and shared_cache_mask() return
> the same value. This may work for existing platforms, because either
> shared_caches don't exist, or when they do, cpu_bigcore_mask and
> shared_cache_mask return the same set of CPUs. But this may or may not
> continue to hold good in the future.
>
> Furthermore, if that is always going to be the case that in the
> presence of shared_caches the cpu_bigcore_mask() and
> shared_cache_mask() will always be the same, then why even define two
> separate masks and not just have only the cpu_bigcore_mask() ?
>
Your two statements are contradicting. In the former you are saying we
should be future proof and in the latter, you are asking for why add if they
are both going to be the same.
> The correct way would be to set the powerpc_topology[mc_idx].mask to
> powerpc_topology[bigcore_idx].mask *after* we have fixedup the
> big_core level.
The reason I modified it in v4 is not for degeneration or for future case
but for the current PowerNV/SMT 4 case. I could have as well detected the
the same and modified bigcore but thought fixup at one place would be
better.
--
Thanks and Regards
Srikar Dronamraju
^ permalink raw reply
* Re: [PATCH v2 01/14] powerpc/eeh: Remove eeh_dev_phb_init_dynamic()
From: Michael Ellerman @ 2020-07-27 10:53 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev, Oliver O'Halloran
In-Reply-To: <159583477725.602200.17371356742597086381.b4-ty@ellerman.id.au>
Michael Ellerman <patch-notifications@ellerman.id.au> writes:
> On Wed, 22 Jul 2020 14:26:15 +1000, Oliver O'Halloran wrote:
>> This function is a one line wrapper around eeh_phb_pe_create() and despite
>> the name it doesn't create any eeh_dev structures. Replace it with direct
>> calls to eeh_phb_pe_create() since that does what it says on the tin
>> and removes a layer of indirection.
>
> Applied to powerpc/next.
>
> [01/14] powerpc/eeh: Remove eeh_dev_phb_init_dynamic()
> https://git.kernel.org/powerpc/c/475028efc708880e16e61cc4cbbc00af784cb39b
Something weird happened with the "thanks" script. Pretty sure I applied v3.
I think I applied this version previously and the script just matched
the subjects?
Anyway, ignore this mail.
cheers
^ permalink raw reply
* Re: [PATCH v2 2/5] powerpc: Allow 4224 bytes of stack expansion for the signal frame
From: Daniel Axtens @ 2020-07-27 10:50 UTC (permalink / raw)
To: Michael Ellerman, linuxppc-dev; +Cc: linux-kernel
In-Reply-To: <20200724092528.1578671-2-mpe@ellerman.id.au>
Hi Michael,
I have tested this with the test from the bug and it now seems to pass
fine. On that basis:
Tested-by: Daniel Axtens <dja@axtens.net>
Thank you for coming up with a better solution than my gross hack!
Kind regards,
Daniel
> We have powerpc specific logic in our page fault handling to decide if
> an access to an unmapped address below the stack pointer should expand
> the stack VMA.
>
> The code was originally added in 2004 "ported from 2.4". The rough
> logic is that the stack is allowed to grow to 1MB with no extra
> checking. Over 1MB the access must be within 2048 bytes of the stack
> pointer, or be from a user instruction that updates the stack pointer.
>
> The 2048 byte allowance below the stack pointer is there to cover the
> 288 byte "red zone" as well as the "about 1.5kB" needed by the signal
> delivery code.
>
> Unfortunately since then the signal frame has expanded, and is now
> 4224 bytes on 64-bit kernels with transactional memory enabled. This
> means if a process has consumed more than 1MB of stack, and its stack
> pointer lies less than 4224 bytes from the next page boundary, signal
> delivery will fault when trying to expand the stack and the process
> will see a SEGV.
>
> The total size of the signal frame is the size of struct rt_sigframe
> (which includes the red zone) plus __SIGNAL_FRAMESIZE (128 bytes on
> 64-bit).
>
> The 2048 byte allowance was correct until 2008 as the signal frame
> was:
>
> struct rt_sigframe {
> struct ucontext uc; /* 0 1440 */
> /* --- cacheline 11 boundary (1408 bytes) was 32 bytes ago --- */
> long unsigned int _unused[2]; /* 1440 16 */
> unsigned int tramp[6]; /* 1456 24 */
> struct siginfo * pinfo; /* 1480 8 */
> void * puc; /* 1488 8 */
> struct siginfo info; /* 1496 128 */
> /* --- cacheline 12 boundary (1536 bytes) was 88 bytes ago --- */
> char abigap[288]; /* 1624 288 */
>
> /* size: 1920, cachelines: 15, members: 7 */
> /* padding: 8 */
> };
>
> 1920 + 128 = 2048
>
> Then in commit ce48b2100785 ("powerpc: Add VSX context save/restore,
> ptrace and signal support") (Jul 2008) the signal frame expanded to
> 2304 bytes:
>
> struct rt_sigframe {
> struct ucontext uc; /* 0 1696 */ <--
> /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
> long unsigned int _unused[2]; /* 1696 16 */
> unsigned int tramp[6]; /* 1712 24 */
> struct siginfo * pinfo; /* 1736 8 */
> void * puc; /* 1744 8 */
> struct siginfo info; /* 1752 128 */
> /* --- cacheline 14 boundary (1792 bytes) was 88 bytes ago --- */
> char abigap[288]; /* 1880 288 */
>
> /* size: 2176, cachelines: 17, members: 7 */
> /* padding: 8 */
> };
>
> 2176 + 128 = 2304
>
> At this point we should have been exposed to the bug, though as far as
> I know it was never reported. I no longer have a system old enough to
> easily test on.
>
> Then in 2010 commit 320b2b8de126 ("mm: keep a guard page below a
> grow-down stack segment") caused our stack expansion code to never
> trigger, as there was always a VMA found for a write up to PAGE_SIZE
> below r1.
>
> That meant the bug was hidden as we continued to expand the signal
> frame in commit 2b0a576d15e0 ("powerpc: Add new transactional memory
> state to the signal context") (Feb 2013):
>
> struct rt_sigframe {
> struct ucontext uc; /* 0 1696 */
> /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
> struct ucontext uc_transact; /* 1696 1696 */ <--
> /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
> long unsigned int _unused[2]; /* 3392 16 */
> unsigned int tramp[6]; /* 3408 24 */
> struct siginfo * pinfo; /* 3432 8 */
> void * puc; /* 3440 8 */
> struct siginfo info; /* 3448 128 */
> /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
> char abigap[288]; /* 3576 288 */
>
> /* size: 3872, cachelines: 31, members: 8 */
> /* padding: 8 */
> /* last cacheline: 32 bytes */
> };
>
> 3872 + 128 = 4000
>
> And commit 573ebfa6601f ("powerpc: Increase stack redzone for 64-bit
> userspace to 512 bytes") (Feb 2014):
>
> struct rt_sigframe {
> struct ucontext uc; /* 0 1696 */
> /* --- cacheline 13 boundary (1664 bytes) was 32 bytes ago --- */
> struct ucontext uc_transact; /* 1696 1696 */
> /* --- cacheline 26 boundary (3328 bytes) was 64 bytes ago --- */
> long unsigned int _unused[2]; /* 3392 16 */
> unsigned int tramp[6]; /* 3408 24 */
> struct siginfo * pinfo; /* 3432 8 */
> void * puc; /* 3440 8 */
> struct siginfo info; /* 3448 128 */
> /* --- cacheline 27 boundary (3456 bytes) was 120 bytes ago --- */
> char abigap[512]; /* 3576 512 */ <--
>
> /* size: 4096, cachelines: 32, members: 8 */
> /* padding: 8 */
> };
>
> 4096 + 128 = 4224
>
> Then finally in 2017, commit 1be7107fbe18 ("mm: larger stack guard
> gap, between vmas") exposed us to the existing bug, because it changed
> the stack VMA to be the correct/real size, meaning our stack expansion
> code is now triggered.
>
> Fix it by increasing the allowance to 4224 bytes.
>
> Hard-coding 4224 is obviously unsafe against future expansions of the
> signal frame in the same way as the existing code. We can't easily use
> sizeof() because the signal frame structure is not in a header. We
> will either fix that, or rip out all the custom stack expansion
> checking logic entirely.
>
> Fixes: ce48b2100785 ("powerpc: Add VSX context save/restore, ptrace and signal support")
> Cc: stable@vger.kernel.org # v2.6.27+
> Reported-by: Tom Lane <tgl@sss.pgh.pa.us>
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>
> v2: Account for the extra 128 bytes of __SIGNAL_FRAMESIZE, making the
> total size 4224, as noticed by dja.
>
> See also https://bugzilla.kernel.org/show_bug.cgi?id=205183
> ---
> arch/powerpc/mm/fault.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 641fc5f3d7dd..3ebb1792e636 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -267,6 +267,9 @@ static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
> return false;
> }
>
> +// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
> +#define SIGFRAME_MAX_SIZE (4096 + 128)
> +
> static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> struct vm_area_struct *vma, unsigned int flags,
> bool *must_retry)
> @@ -274,7 +277,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> /*
> * N.B. The POWER/Open ABI allows programs to access up to
> * 288 bytes below the stack pointer.
> - * The kernel signal delivery code writes up to about 1.5kB
> + * The kernel signal delivery code writes a bit over 4KB
> * below the stack pointer (r1) before decrementing it.
> * The exec code can write slightly over 640kB to the stack
> * before setting the user r1. Thus we allow the stack to
> @@ -299,7 +302,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
> * between the last mapped region and the stack will
> * expand the stack rather than segfaulting.
> */
> - if (address + 2048 >= uregs->gpr[1])
> + if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
> return false;
>
> if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
> --
> 2.25.1
^ permalink raw reply
* [PATCH] powerpc/mm: Limit resize_hpt_for_hotplug() call to hash guests only
From: Bharata B Rao @ 2020-07-27 9:57 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Bharata B Rao, aneesh.kumar, david, Nathan Lynch
During memory hotplug and unplug, resize_hpt_for_hotplug() gets called
for both hash and radix guests but it should be called only for hash
guests. Though the call does nothing in the radix guest case, it is
cleaner to push this call into hash specific memory hotplug routines.
Reported-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
Tested with memory hotplug and unplug for hash and radix KVM guests.
arch/powerpc/include/asm/sparsemem.h | 6 ------
arch/powerpc/mm/book3s64/hash_utils.c | 8 +++++++-
arch/powerpc/mm/mem.c | 5 -----
3 files changed, 7 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
index c89b32443cff..1e6fa371cc38 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -17,12 +17,6 @@ extern int create_section_mapping(unsigned long start, unsigned long end,
int nid, pgprot_t prot);
extern int remove_section_mapping(unsigned long start, unsigned long end);
-#ifdef CONFIG_PPC_BOOK3S_64
-extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
-#else
-static inline int resize_hpt_for_hotplug(unsigned long new_mem_size) { return 0; }
-#endif
-
#ifdef CONFIG_NUMA
extern int hot_add_scn_to_nid(unsigned long scn_addr);
#else
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 9fdabea04990..30a4a91d9987 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -785,7 +785,7 @@ static unsigned long __init htab_get_table_size(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int resize_hpt_for_hotplug(unsigned long new_mem_size)
+static int resize_hpt_for_hotplug(unsigned long new_mem_size)
{
unsigned target_hpt_shift;
@@ -819,6 +819,8 @@ int hash__create_section_mapping(unsigned long start, unsigned long end,
return -1;
}
+ resize_hpt_for_hotplug(memblock_phys_mem_size());
+
rc = htab_bolt_mapping(start, end, __pa(start),
pgprot_val(prot), mmu_linear_psize,
mmu_kernel_ssize);
@@ -836,6 +838,10 @@ int hash__remove_section_mapping(unsigned long start, unsigned long end)
int rc = htab_remove_mapping(start, end, mmu_linear_psize,
mmu_kernel_ssize);
WARN_ON(rc < 0);
+
+ if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
+ pr_warn("Hash collision while resizing HPT\n");
+
return rc;
}
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c2c11eb8dcfc..9dafc636588f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -127,8 +127,6 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
int rc;
- resize_hpt_for_hotplug(memblock_phys_mem_size());
-
start = (unsigned long)__va(start);
rc = create_section_mapping(start, start + size, nid,
params->pgprot);
@@ -161,9 +159,6 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
* hit that section of memory
*/
vm_unmap_aliases();
-
- if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
- pr_warn("Hash collision while resizing HPT\n");
}
#endif
--
2.26.2
^ permalink raw reply related
* Re: [PATCH] KVM: PPC: Book3S HV: increase KVMPPC_NR_LPIDS on POWER8 and POWER9
From: Cédric Le Goater @ 2020-07-27 9:38 UTC (permalink / raw)
To: Paul Mackerras; +Cc: kvm-ppc, linuxppc-dev, Nicholas Piggin, kvm
In-Reply-To: <20200723062016.GE213782@thinks.paulus.ozlabs.org>
On 7/23/20 8:20 AM, Paul Mackerras wrote:
> On Mon, Jun 08, 2020 at 01:57:14PM +0200, Cédric Le Goater wrote:
>> POWER8 and POWER9 have 12-bit LPIDs. Change LPID_RSVD to support up to
>> (4096 - 2) guests on these processors. POWER7 is kept the same with a
>> limitation of (1024 - 2), but it might be time to drop KVM support for
>> POWER7.
>>
>> Tested with 2048 guests * 4 vCPUs on a witherspoon system with 512G
>> RAM and a bit of swap.
>>
>> Signed-off-by: Cédric Le Goater <clg@kaod.org>
>
> Thanks, patch applied to my kvm-ppc-next branch.
We have pushed the limits further on a 1TB system and reached the limit
of 4094 guests with 16 vCPUs.
With more vCPUs, the system starts to check-stop. We believe that the
pages used by the interrupt controller for the backing store of the
XIVE internal tables (END and NVT) allocated with GFP_KERNEL are
reclaimable.
I am thinking of changing the allocation flags with :
__GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC
because XIVE should be able to fail gracefully if the system is
low on mem. Is that correct ?
Thanks,
C.
^ permalink raw reply
* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Athira Rajeev @ 2020-07-27 9:32 UTC (permalink / raw)
To: Nicholas Piggin; +Cc: Aneesh Kumar K . V, linuxppc-dev
In-Reply-To: <20200727060947.10060-1-npiggin@gmail.com>
> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>
> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
> caller") removed the local_irq_disable from hash_preload, but it was
> required for more than just the page table walk: the hash pte busy bit is
> effectively a lock which may be taken in interrupt context, and the local
> update flag test must not be preempted before it's used.
>
> This solves apparent lockups with perf interrupting __hash_page_64K. If
> get_perf_callchain then also takes a hash fault on the same page while it
> is already locked, it will loop forever taking hash faults, which looks like
> this:
>
> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
> pc: c000000000072dc8: hash_page_mm+0x8/0x800
> lr: c00000000000c5a4: do_hash_page+0x24/0x38
> sp: c0002ac1cc69ac70
> msr: 8000000000081033
> current = 0xc0002ac1cc602e00
> paca = 0xc00000001de1f280 irqmask: 0x03 irq_happened: 0x01
> pid = 20118, comm = pread2_processe
> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
> 49e:mon> t
> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
> [link register ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
> [link register ] c0000000000846f0 __hash_page_64K+0x210/0x540
> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
> --- Exception: 300 (Data Access) at 00007fff8c861fe8
> SP (7ffff6b19660) is in userspace
>
> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> Reported-by: Anton Blanchard <anton@ozlabs.org>
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
> caller")
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Hi,
Tested with the patch and it fixes the lockups I was seeing with my test run.
Thanks for the fix.
Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> ---
> arch/powerpc/kernel/exceptions-64s.S | 14 +++++++++++---
> arch/powerpc/mm/book3s64/hash_utils.c | 25 +++++++++++++++++++++++++
> arch/powerpc/perf/core-book3s.c | 6 ++++++
> 3 files changed, 42 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 0fc8bad878b2..446e54c3f71e 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -3072,10 +3072,18 @@ do_hash_page:
> ori r0,r0,DSISR_BAD_FAULT_64S@l
> and. r0,r5,r0 /* weird error? */
> bne- handle_page_fault /* if not, try to insert a HPTE */
> +
> + /*
> + * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
> + * don't call hash_page, just fail the fault. This is required to
> + * prevent re-entrancy problems in the hash code, namely perf
> + * interrupts hitting while something holds H_PAGE_BUSY, and taking a
> + * hash fault. See the comment in hash_preload().
> + */
> ld r11, PACA_THREAD_INFO(r13)
> - lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
> - andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */
> - bne 77f /* then don't call hash_page now */
> + lwz r0,TI_PREEMPT(r11)
> + andis. r0,r0,NMI_MASK@h
> + bne 77f
>
> /*
> * r3 contains the trap number
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
> index 468169e33c86..9b9f92ad0e7a 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -1559,6 +1559,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
> pgd_t *pgdir;
> int rc, ssize, update_flags = 0;
> unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0);
> + unsigned long flags;
>
> BUG_ON(get_region_id(ea) != USER_REGION_ID);
>
> @@ -1592,6 +1593,28 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
> return;
> #endif /* CONFIG_PPC_64K_PAGES */
>
> + /*
> + * __hash_page_* must run with interrupts off, as it sets the
> + * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any
> + * time and may take a hash fault reading the user stack, see
> + * read_user_stack_slow() in the powerpc/perf code.
> + *
> + * If that takes a hash fault on the same page as we lock here, it
> + * will bail out when seeing H_PAGE_BUSY set, and retry the access
> + * leading to an infinite loop.
> + *
> + * Disabling interrupts here does not prevent perf interrupts, but it
> + * will prevent them taking hash faults (see the NMI test in
> + * do_hash_page), then read_user_stack's copy_from_user_nofault will
> + * fail and perf will fall back to read_user_stack_slow(), which
> + * walks the Linux page tables.
> + *
> + * Interrupts must also be off for the duration of the
> + * mm_is_thread_local test and update, to prevent preempt running the
> + * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
> + */
> + local_irq_save(flags);
> +
> /* Is that local to this CPU ? */
> if (mm_is_thread_local(mm))
> update_flags |= HPTE_LOCAL_UPDATE;
> @@ -1614,6 +1637,8 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
> mm_ctx_user_psize(&mm->context),
> mm_ctx_user_psize(&mm->context),
> pte_val(*ptep));
> +
> + local_irq_restore(flags);
> }
>
> /*
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index cd6a742ac6ef..01d70280d287 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -2179,6 +2179,12 @@ static void __perf_event_interrupt(struct pt_regs *regs)
>
> perf_read_regs(regs);
>
> + /*
> + * If perf interrupts hit in a local_irq_disable (soft-masked) region,
> + * we consider them as NMIs. This is required to prevent hash faults on
> + * user addresses when reading callchains. See the NMI test in
> + * do_hash_page.
> + */
> nmi = perf_intr_is_nmi(regs);
> if (nmi)
> nmi_enter();
> --
> 2.23.0
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox