LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 0/4] Prefixed instruction tests to cover negative cases
From: Balamuruhan S @ 2020-06-26  9:51 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev

This patchset adds support to test negative scenarios and adds testcase
for paddi with few fixes. It is based on powerpc/next and on top of
Jordan's tests for prefixed instructions patchsets,

https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-May/211394.html
https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-June/211768.html

Changes in v2:
-------------
Fix review comments from Sandipan and Jordan,
* use helper function to print word/prefix instructions
* reuse bits of `flags` to represent negative test scenario
* always set NIP instead of only setting for relative prefixed
  instructions

Balamuruhan S (4):
  powerpc test_emulate_step: enhancement to test negative scenarios
  powerpc test_emulate_step: add negative tests for prefixed addi
  powerpc sstep: introduce macros to retrieve Prefix instruction
    operands
  powerpc test_emulate_step: move extern declaration to sstep.h

 arch/powerpc/include/asm/sstep.h     |  6 ++++
 arch/powerpc/lib/sstep.c             | 12 ++++----
 arch/powerpc/lib/test_emulate_step.c | 42 ++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 17 deletions(-)


base-commit: 64677779e8962c20b580b471790fe42367750599
prerequisite-patch-id: 3fff52f42000e816e2e8b4f75a2bca651dec5efe
prerequisite-patch-id: 5d7904bf38248ec39ed0f6223500286b9eaf82a9
prerequisite-patch-id: 7236d3caa4dc6de6079ae893678223876a3bb364
prerequisite-patch-id: 733e7f9b5c6ade64b8a1c7458b5aefe6b7d6fcff
prerequisite-patch-id: 4793e7716f3f56577a49976539d06db37ba31a80
prerequisite-patch-id: ffb024c2590e7249190b0137acf267e821a816a7
prerequisite-patch-id: 86e64f47de2dc6e9a6e1404de12d7c91775c22c8
prerequisite-patch-id: 9fbe5c3af9590696c230944cdee3c45e01f44d6d
prerequisite-patch-id: 3f9c6238023c867e27d87de26487e7e665a9dc12
-- 
2.24.1


^ permalink raw reply

* [PATCH v2 1/4] powerpc test_emulate_step: enhancement to test negative scenarios
From: Balamuruhan S @ 2020-06-26  9:51 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200626095158.1031507-1-bala24@linux.ibm.com>

add provision to declare test is a negative scenario, verify
whether emulation fails and avoid executing it.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 30 +++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 0ca2b7cc8d8c..7c30a69c174f 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -118,6 +118,7 @@
 #define IGNORE_GPR(n)	(0x1UL << (n))
 #define IGNORE_XER	(0x1UL << 32)
 #define IGNORE_CCR	(0x1UL << 33)
+#define NEGATIVE_TEST	(0x1UL << 63)
 
 static void __init init_pt_regs(struct pt_regs *regs)
 {
@@ -1202,8 +1203,10 @@ static struct compute_test compute_tests[] = {
 };
 
 static int __init emulate_compute_instr(struct pt_regs *regs,
-					struct ppc_inst instr)
+					struct ppc_inst instr,
+					bool negative)
 {
+	int analysed;
 	extern s32 patch__exec_instr;
 	struct instruction_op op;
 
@@ -1212,13 +1215,17 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 
 	regs->nip = patch_site_addr(&patch__exec_instr);
 
-	if (analyse_instr(&op, regs, instr) != 1 ||
-	    GETTYPE(op.type) != COMPUTE) {
-		pr_info("execution failed, instruction = %s\n", ppc_inst_as_str(instr));
+	analysed = analyse_instr(&op, regs, instr);
+	if (analysed != 1 || GETTYPE(op.type) != COMPUTE) {
+		if (negative)
+			return -EFAULT;
+		pr_info("emulation failed, instruction = %s\n", ppc_inst_as_str(instr));
 		return -EFAULT;
 	}
-
-	emulate_update_regs(regs, &op);
+	if (analysed == 1 && negative)
+		pr_info("negative test failed, instruction = %s\n", ppc_inst_as_str(instr));
+	if (!negative)
+		emulate_update_regs(regs, &op);
 	return 0;
 }
 
@@ -1256,7 +1263,7 @@ static void __init run_tests_compute(void)
 	struct pt_regs *regs, exp, got;
 	unsigned int i, j, k;
 	struct ppc_inst instr;
-	bool ignore_gpr, ignore_xer, ignore_ccr, passed;
+	bool ignore_gpr, ignore_xer, ignore_ccr, passed, rc, negative;
 
 	for (i = 0; i < ARRAY_SIZE(compute_tests); i++) {
 		test = &compute_tests[i];
@@ -1270,6 +1277,7 @@ static void __init run_tests_compute(void)
 			instr = test->subtests[j].instr;
 			flags = test->subtests[j].flags;
 			regs = &test->subtests[j].regs;
+			negative = flags & NEGATIVE_TEST;
 			ignore_xer = flags & IGNORE_XER;
 			ignore_ccr = flags & IGNORE_CCR;
 			passed = true;
@@ -1284,8 +1292,12 @@ static void __init run_tests_compute(void)
 			exp.msr = MSR_KERNEL;
 			got.msr = MSR_KERNEL;
 
-			if (emulate_compute_instr(&got, instr) ||
-			    execute_compute_instr(&exp, instr)) {
+			rc = emulate_compute_instr(&got, instr, negative) != 0;
+			if (negative) {
+				/* skip executing instruction */
+				passed = rc;
+				goto print;
+			} else if (rc || execute_compute_instr(&exp, instr)) {
 				passed = false;
 				goto print;
 			}
-- 
2.24.1


^ permalink raw reply related

* [PATCH v2 2/4] powerpc test_emulate_step: add negative tests for prefixed addi
From: Balamuruhan S @ 2020-06-26  9:51 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200626095158.1031507-1-bala24@linux.ibm.com>

testcases for `paddi` instruction to cover the negative case,
if R is equal to 1 and RA is not equal to 0, the instruction
form is invalid.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 7c30a69c174f..0ee59301ef99 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1197,6 +1197,16 @@ static struct compute_test compute_tests[] = {
 				.regs = {
 					.gpr[21] = 0,
 				}
+			},
+			/* Invalid instruction form with R = 1 and RA != 0 */
+			{
+				.descr = "RA = R22(0), SI = 0, R = 1",
+				.instr = TEST_PADDI(21, 22, 0, 1),
+				.flags = NEGATIVE_TEST,
+				.regs = {
+					.gpr[21] = 0,
+					.gpr[22] = 0,
+				}
 			}
 		}
 	}
-- 
2.24.1


^ permalink raw reply related

* [PATCH v2 3/4] powerpc sstep: introduce macros to retrieve Prefix instruction operands
From: Balamuruhan S @ 2020-06-26  9:51 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200626095158.1031507-1-bala24@linux.ibm.com>

retrieve prefix instruction operands RA and pc relative bit R values
using macros and adopt it in sstep.c and test_emulate_step.c.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/include/asm/sstep.h |  4 ++++
 arch/powerpc/lib/sstep.c         | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 3b01c69a44aa..325975b4ef30 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -104,6 +104,10 @@ enum instruction_type {
 
 #define MKOP(t, f, s)	((t) | (f) | SIZE(s))
 
+/* Prefix instruction operands */
+#define GET_PREFIX_RA(i)	(((i) >> 16) & 0x1f)
+#define GET_PREFIX_R(i)		((i) & (1ul << 20))
+
 struct instruction_op {
 	int type;
 	int reg;
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5abe98216dc2..fb4c5767663d 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -200,8 +200,8 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
 	unsigned int  dd;
 	unsigned long ea, d0, d1, d;
 
-	prefix_r = instr & (1ul << 20);
-	ra = (suffix >> 16) & 0x1f;
+	prefix_r = GET_PREFIX_R(instr);
+	ra = GET_PREFIX_RA(suffix);
 
 	d0 = instr & 0x3ffff;
 	d1 = suffix & 0xffff;
@@ -1339,8 +1339,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 	switch (opcode) {
 #ifdef __powerpc64__
 	case 1:
-		prefix_r = word & (1ul << 20);
-		ra = (suffix >> 16) & 0x1f;
+		prefix_r = GET_PREFIX_R(word);
+		ra = GET_PREFIX_RA(suffix);
 		rd = (suffix >> 21) & 0x1f;
 		op->reg = rd;
 		op->val = regs->gpr[rd];
@@ -2715,8 +2715,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		}
 		break;
 	case 1: /* Prefixed instructions */
-		prefix_r = word & (1ul << 20);
-		ra = (suffix >> 16) & 0x1f;
+		prefix_r = GET_PREFIX_R(word);
+		ra = GET_PREFIX_RA(suffix);
 		op->update_reg = ra;
 		rd = (suffix >> 21) & 0x1f;
 		op->reg = rd;
-- 
2.24.1


^ permalink raw reply related

* [PATCH v2 4/4] powerpc test_emulate_step: move extern declaration to sstep.h
From: Balamuruhan S @ 2020-06-26  9:51 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200626095158.1031507-1-bala24@linux.ibm.com>

fix checkpatch.pl warnings by moving extern declaration from source
file to headerfile.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/include/asm/sstep.h     | 2 ++
 arch/powerpc/lib/test_emulate_step.c | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 325975b4ef30..c8e37ef060c1 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -108,6 +108,8 @@ enum instruction_type {
 #define GET_PREFIX_RA(i)	(((i) >> 16) & 0x1f)
 #define GET_PREFIX_R(i)		((i) & (1ul << 20))
 
+extern s32 patch__exec_instr;
+
 struct instruction_op {
 	int type;
 	int reg;
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 0ee59301ef99..c46bf6fc199b 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1217,7 +1217,6 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 					bool negative)
 {
 	int analysed;
-	extern s32 patch__exec_instr;
 	struct instruction_op op;
 
 	if (!regs || !ppc_inst_val(instr))
@@ -1243,7 +1242,6 @@ static int __init execute_compute_instr(struct pt_regs *regs,
 					struct ppc_inst instr)
 {
 	extern int exec_instr(struct pt_regs *regs);
-	extern s32 patch__exec_instr;
 
 	if (!regs || !ppc_inst_val(instr))
 		return -EINVAL;
-- 
2.24.1


^ permalink raw reply related

* Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier
From: Michal Suchánek @ 2020-06-26 10:20 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Jan Kara, linux-nvdimm, Aneesh Kumar K.V, Jeff Moyer, alistair,
	Dan Williams, linuxppc-dev
In-Reply-To: <alpine.LRH.2.02.2005220845200.17488@file01.intranet.prod.int.rdu2.redhat.com>

On Fri, May 22, 2020 at 09:01:17AM -0400, Mikulas Patocka wrote:
> 
> 
> On Fri, 22 May 2020, Aneesh Kumar K.V wrote:
> 
> > On 5/22/20 3:01 PM, Michal SuchÃ¡nek wrote:
> > > On Thu, May 21, 2020 at 02:52:30PM -0400, Mikulas Patocka wrote:
> > > > 
> > > > 
> > > > On Thu, 21 May 2020, Dan Williams wrote:
> > > > 
> > > > > On Thu, May 21, 2020 at 10:03 AM Aneesh Kumar K.V
> > > > > <aneesh.kumar@linux.ibm.com> wrote:
> > > > > > 
> > > > > > > Moving on to the patch itself--Aneesh, have you audited other
> > > > > > > persistent
> > > > > > > memory users in the kernel?  For example, drivers/md/dm-writecache.c
> > > > > > > does
> > > > > > > this:
> > > > > > > 
> > > > > > > static void writecache_commit_flushed(struct dm_writecache *wc, bool
> > > > > > > wait_for_ios)
> > > > > > > {
> > > > > > >        if (WC_MODE_PMEM(wc))
> > > > > > >                wmb(); <==========
> > > > > > >           else
> > > > > > >                   ssd_commit_flushed(wc, wait_for_ios);
> > > > > > > }
> > > > > > > 
> > > > > > > I believe you'll need to make modifications there.
> > > > > > > 
> > > > > > 
> > > > > > Correct. Thanks for catching that.
> > > > > > 
> > > > > > 
> > > > > > I don't understand dm much, wondering how this will work with
> > > > > > non-synchronous DAX device?
> > > > > 
> > > > > That's a good point. DM-writecache needs to be cognizant of things
> > > > > like virtio-pmem that violate the rule that persisent memory writes
> > > > > can be flushed by CPU functions rather than calling back into the
> > > > > driver. It seems we need to always make the flush case a dax_operation
> > > > > callback to account for this.
> > > > 
> > > > dm-writecache is normally sitting on the top of dm-linear, so it would
> > > > need to pass the wmb() call through the dm core and dm-linear target ...
> > > > that would slow it down ... I remember that you already did it this way
> > > > some times ago and then removed it.
> > > > 
> > > > What's the exact problem with POWER? Could the POWER system have two types
> > > > of persistent memory that need two different ways of flushing?
> > > 
> > > As far as I understand the discussion so far
> > > 
> > >   - on POWER $oldhardware uses $oldinstruction to ensure pmem consistency
> > >   - on POWER $newhardware uses $newinstruction to ensure pmem consistency
> > >     (compatible with $oldinstruction on $oldhardware)
> > 
> > Correct.
> > 
> > >   - on some platforms instead of barrier instruction a callback into the
> > >     driver is issued to ensure consistency 
> > 
> > This is virtio-pmem only at this point IIUC.
> > 
> > -aneesh
> 
> And does the virtio-pmem driver track which pages are dirty? Or does it 
> need to specify the range of pages to flush in the flush function?
> 
> > > None of this is reflected by the dm driver.
> 
> We could make a new dax method:
> void *(dax_get_flush_function)(void);
> 
> This would return a pointer to "wmb()" on x86 and something else on Power.
> 
> The method "dax_get_flush_function" would be called only once when 
> initializing the writecache driver (because the call would be slow because 
> it would have to go through the DM stack) and then, the returned function 
> would be called each time we need write ordering. The returned function 
> would do just "sfence; ret".

Hello,

as far as I understand the code virtio_pmem has a fush function defined
which indeed can make use of the region properties, such as memory
range. If such function exists you need quivalent of sync() - call into
the device in question. If it does not calling arch_pmem_flush_barrier()
instead of wmb() should suffice.

I am not aware of an interface to determine if the flush function exists
for a particular region.

Thanks

Michal

^ permalink raw reply

* [PATCH v3 1/2] powerpc/perf/hv-24x7: Add cpu hotplug support
From: Kajol Jain @ 2020-06-26 10:28 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: nathanl, ego, maddy, kjain, suka, anju
In-Reply-To: <20200626102824.270923-1-kjain@linux.ibm.com>

Patch here adds cpu hotplug functions to hv_24x7 pmu.
A new cpuhp_state "CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE" enum
is added.

The online callback function updates the cpumask only if its
empty. As the primary intention of adding hotplug support
is to designate a CPU to make HCALL to collect the
counter data.

The offline function test and clear corresponding cpu in a cpumask
and update cpumask to any other active cpu.

Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 arch/powerpc/perf/hv-24x7.c | 45 +++++++++++++++++++++++++++++++++++++
 include/linux/cpuhotplug.h  |  1 +
 2 files changed, 46 insertions(+)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index db213eb7cb02..ce4739e2b407 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -31,6 +31,8 @@ static int interface_version;
 /* Whether we have to aggregate result data for some domains. */
 static bool aggregate_result_elements;
 
+static cpumask_t hv_24x7_cpumask;
+
 static bool domain_is_valid(unsigned domain)
 {
 	switch (domain) {
@@ -1641,6 +1643,44 @@ static struct pmu h_24x7_pmu = {
 	.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
 };
 
+static int ppc_hv_24x7_cpu_online(unsigned int cpu)
+{
+	/* Make this CPU the designated target for counter collection */
+	if (cpumask_empty(&hv_24x7_cpumask))
+		cpumask_set_cpu(cpu, &hv_24x7_cpumask);
+
+	return 0;
+}
+
+static int ppc_hv_24x7_cpu_offline(unsigned int cpu)
+{
+	int target = -1;
+
+	/* Check if exiting cpu is used for collecting 24x7 events */
+	if (!cpumask_test_and_clear_cpu(cpu, &hv_24x7_cpumask))
+		return 0;
+
+	/* Find a new cpu to collect 24x7 events */
+	target = cpumask_last(cpu_active_mask);
+
+	if (target < 0 || target >= nr_cpu_ids)
+		return -1;
+
+	/* Migrate 24x7 events to the new target */
+	cpumask_set_cpu(target, &hv_24x7_cpumask);
+	perf_pmu_migrate_context(&h_24x7_pmu, cpu, target);
+
+	return 0;
+}
+
+static int hv_24x7_cpu_hotplug_init(void)
+{
+	return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
+			  "perf/powerpc/hv_24x7:online",
+			  ppc_hv_24x7_cpu_online,
+			  ppc_hv_24x7_cpu_offline);
+}
+
 static int hv_24x7_init(void)
 {
 	int r;
@@ -1685,6 +1725,11 @@ static int hv_24x7_init(void)
 	if (r)
 		return r;
 
+	/* init cpuhotplug */
+	r = hv_24x7_cpu_hotplug_init();
+	if (r)
+		pr_err("hv_24x7: CPU hotplug init failed\n");
+
 	r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
 	if (r)
 		return r;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 191772d4a4d7..a2710e654b64 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -181,6 +181,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
 	CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE,
+	CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE,
 	CPUHP_AP_WATCHDOG_ONLINE,
 	CPUHP_AP_WORKQUEUE_ONLINE,
 	CPUHP_AP_RCUTREE_ONLINE,
-- 
2.18.2


^ permalink raw reply related

* [PATCH v3 2/2] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask
From: Kajol Jain @ 2020-06-26 10:28 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: nathanl, ego, maddy, kjain, suka, anju
In-Reply-To: <20200626102824.270923-1-kjain@linux.ibm.com>

Patch here adds a cpumask attr to hv_24x7 pmu along with ABI documentation.

Primary use to expose the cpumask is for the perf tool which has the
capability to parse the driver sysfs folder and understand the
cpumask file. Having cpumask file will reduce the number of perf command
line parameters (will avoid "-C" option in the perf tool
command line). It can also notify the user which is
the current cpu used to retrieve the counter data.

command:# cat /sys/devices/hv_24x7/cpumask
0

Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
---
 .../sysfs-bus-event_source-devices-hv_24x7    |  7 ++++
 arch/powerpc/perf/hv-24x7.c                   | 36 +++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
index e8698afcd952..f9dd3755b049 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -43,6 +43,13 @@ Description:	read only
 		This sysfs interface exposes the number of cores per chip
 		present in the system.
 
+What:		/sys/devices/hv_24x7/cpumask
+Date:		June 2020
+Contact:	Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
+Description:	read only
+		This sysfs file exposes the cpumask which is designated to make
+		HCALLs to retrieve hv-24x7 pmu event counter data.
+
 What:		/sys/bus/event_source/devices/hv_24x7/event_descs/<event-name>
 Date:		February 2014
 Contact:	Linux on PowerPC Developer List <linuxppc-dev@lists.ozlabs.org>
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index ce4739e2b407..3c699612d29f 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -448,6 +448,12 @@ static ssize_t device_show_string(struct device *dev,
 	return sprintf(buf, "%s\n", (char *)d->var);
 }
 
+static ssize_t cpumask_get_attr(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask);
+}
+
 static ssize_t sockets_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
@@ -1116,6 +1122,17 @@ static DEVICE_ATTR_RO(sockets);
 static DEVICE_ATTR_RO(chipspersocket);
 static DEVICE_ATTR_RO(coresperchip);
 
+static DEVICE_ATTR(cpumask, S_IRUGO, cpumask_get_attr, NULL);
+
+static struct attribute *cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group cpumask_attr_group = {
+	.attrs = cpumask_attrs,
+};
+
 static struct bin_attribute *if_bin_attrs[] = {
 	&bin_attr_catalog,
 	NULL,
@@ -1143,6 +1160,11 @@ static const struct attribute_group *attr_groups[] = {
 	&event_desc_group,
 	&event_long_desc_group,
 	&if_group,
+	/*
+	 * This NULL is a placeholder for the cpumask attr which will update
+	 * onlyif cpuhotplug registration is successful
+	 */
+	NULL,
 	NULL,
 };
 
@@ -1683,7 +1705,7 @@ static int hv_24x7_cpu_hotplug_init(void)
 
 static int hv_24x7_init(void)
 {
-	int r;
+	int r, i = -1;
 	unsigned long hret;
 	struct hv_perf_caps caps;
 
@@ -1727,8 +1749,18 @@ static int hv_24x7_init(void)
 
 	/* init cpuhotplug */
 	r = hv_24x7_cpu_hotplug_init();
-	if (r)
+	if (r) {
 		pr_err("hv_24x7: CPU hotplug init failed\n");
+	} else {
+		/*
+		 * Cpu hotplug init is successful, add the
+		 * cpumask file as part of pmu attr group and
+		 * assign it to very first NULL location.
+		 */
+		while (attr_groups[++i])
+			/* nothing */;
+		attr_groups[i] = &cpumask_attr_group;
+	}
 
 	r = perf_pmu_register(&h_24x7_pmu, h_24x7_pmu.name, -1);
 	if (r)
-- 
2.18.2


^ permalink raw reply related

* [PATCH v3 0/2] Add cpu hotplug support for powerpc/perf/hv-24x7
From: Kajol Jain @ 2020-06-26 10:28 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: nathanl, ego, maddy, kjain, suka, anju

This patchset add cpu hotplug support for hv_24x7 driver by adding
online/offline cpu hotplug function. It also add sysfs file
"cpumask" to expose current online cpu that can be used for
hv_24x7 event count.

Changelog:
v2 -> v3
- Corrected some of the typo mistakes and update commit message
  as suggested by Gautham R Shenoy.
- Added Reviewed-by tag for the first patch in the patchset.

v1 -> v2
- Changed function to pick active cpu incase of offline
  from "cpumask_any_but" to "cpumask_last", as
  cpumask_any_but function pick very next online cpu and incase where
  we are sequentially off-lining multiple cpus, "pmu_migrate_context"
  can add extra latency.
  - Suggested by: Gautham R Shenoy.

- Change documentation for cpumask and rather then hardcode the
  initialization for cpumask_attr_group, add loop to get very first
  NULL as suggested by Gautham R Shenoy.

Kajol Jain (2):
  powerpc/perf/hv-24x7: Add cpu hotplug support
  powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask

 .../sysfs-bus-event_source-devices-hv_24x7    |  7 ++
 arch/powerpc/perf/hv-24x7.c                   | 79 ++++++++++++++++++-
 include/linux/cpuhotplug.h                    |  1 +
 3 files changed, 86 insertions(+), 1 deletion(-)

-- 
2.18.2


^ permalink raw reply

* Re: [PATCH 09/13] x86: Remove dev->archdata.iommu pointer
From: Borislav Petkov @ 2020-06-26 11:46 UTC (permalink / raw)
  To: Joerg Roedel
  Cc: linux-ia64, Heiko Stuebner, David Airlie, Joonas Lahtinen,
	Thierry Reding, Paul Mackerras, Will Deacon, Marek Szyprowski,
	x86, Russell King, Catalin Marinas, Fenghua Yu, Joerg Roedel,
	intel-gfx, Jani Nikula, Rodrigo Vivi, Matthias Brugger,
	linux-arm-kernel, Tony Luck, linuxppc-dev, linux-kernel, iommu,
	Daniel Vetter, David Woodhouse, Lu Baolu
In-Reply-To: <20200625130836.1916-10-joro@8bytes.org>

On Thu, Jun 25, 2020 at 03:08:32PM +0200, Joerg Roedel wrote:
> From: Joerg Roedel <jroedel@suse.de>
> 
> There are no users left, all drivers have been converted to use the
> per-device private pointer offered by IOMMU core.
> 
> Signed-off-by: Joerg Roedel <jroedel@suse.de>
> ---
>  arch/x86/include/asm/device.h | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
> index 49bd6cf3eec9..7c0a52ca2f4d 100644
> --- a/arch/x86/include/asm/device.h
> +++ b/arch/x86/include/asm/device.h
> @@ -3,9 +3,6 @@
>  #define _ASM_X86_DEVICE_H
>  
>  struct dev_archdata {
> -#ifdef CONFIG_IOMMU_API
> -	void *iommu; /* hook for IOMMU specific extension */
> -#endif
>  };
>  
>  struct pdev_archdata {
> -- 

Acked-by: Borislav Petkov <bp@suse.de>

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply

* Re: [PATCH] powerpc/pseries: Use doorbells even if XIVE is available
From: Cédric Le Goater @ 2020-06-26  7:17 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, linuxppc-dev
  Cc: Anton Blanchard, kvm-ppc, David Gibson
In-Reply-To: <87r1u4aqzm.fsf@mpe.ellerman.id.au>

Adding David, 

On 6/25/20 3:11 AM, Michael Ellerman wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> KVM supports msgsndp in guests by trapping and emulating the
>> instruction, so it was decided to always use XIVE for IPIs if it is
>> available. However on PowerVM systems, msgsndp can be used and gives
>> better performance. On large systems, high XIVE interrupt rates can
>> have sub-linear scaling, and using msgsndp can reduce the load on
>> the interrupt controller.
>>
>> So switch to using core local doorbells even if XIVE is available.
>> This reduces performance for KVM guests with an SMT topology by
>> about 50% for ping-pong context switching between SMT vCPUs.
> 
> You have to take explicit steps to configure KVM in that way with qemu.
> eg. "qemu .. -smp 8" will give you 8 SMT1 CPUs by default.
> 
>> An option vector (or dt-cpu-ftrs) could be defined to disable msgsndp
>> to get KVM performance back.

An option vector would require a PAPR change. Unless the architecture 
reserves some bits for the implementation, but I don't think so. Same
for CAS.

> Qemu/KVM populates /proc/device-tree/hypervisor, so we *could* look at
> that. Though adding PowerVM/KVM specific hacks is obviously a very
> slippery slope.

QEMU could advertise a property "emulated-msgsndp", or something similar, 
which would be interpreted by Linux as a CPU feature and taken into account 
when doing the IPIs.

The CPU setup for XIVE needs a cleanup also. There is no need to allocate
interrupts for IPIs anymore in that case.

Thanks,

C. 


> 
>> diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
>> index 6891710833be..a737a2f87c67 100644
>> --- a/arch/powerpc/platforms/pseries/smp.c
>> +++ b/arch/powerpc/platforms/pseries/smp.c
>> @@ -188,13 +188,14 @@ static int pseries_smp_prepare_cpu(int cpu)
>>  	return 0;
>>  }
>>  
>> +static void  (*cause_ipi_offcore)(int cpu) __ro_after_init;
>> +
>>  static void smp_pseries_cause_ipi(int cpu)
> 
> This is static so the name could be more descriptive, it doesn't need
> the "smp_pseries" prefix.
> 
>>  {
>> -	/* POWER9 should not use this handler */
>>  	if (doorbell_try_core_ipi(cpu))
>>  		return;
> 
> Seems like it would be worth making that static inline so we can avoid
> the function call overhead.
> 
>> -	icp_ops->cause_ipi(cpu);
>> +	cause_ipi_offcore(cpu);
>>  }
>>  
>>  static int pseries_cause_nmi_ipi(int cpu)
>> @@ -222,10 +223,7 @@ static __init void pSeries_smp_probe_xics(void)
>>  {
>>  	xics_smp_probe();
>>  
>> -	if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest())
>> -		smp_ops->cause_ipi = smp_pseries_cause_ipi;
>> -	else
>> -		smp_ops->cause_ipi = icp_ops->cause_ipi;
>> +	smp_ops->cause_ipi = icp_ops->cause_ipi;
>>  }
>>  
>>  static __init void pSeries_smp_probe(void)
>> @@ -238,6 +236,18 @@ static __init void pSeries_smp_probe(void)
> 
> The comment just above here says:
> 
> 		/*
> 		 * Don't use P9 doorbells when XIVE is enabled. IPIs
> 		 * using MMIOs should be faster
> 		 */
>>  		xive_smp_probe();
> 
> Which is no longer true.
> 
>>  	else
>>  		pSeries_smp_probe_xics();
> 
> I think you should just fold this in, it would make the logic slightly
> easier to follow.
> 
>> +	/*
>> +	 * KVM emulates doorbells by reading the instruction, which
>> +	 * can't be done if the guest is secure. If a secure guest
>> +	 * runs under PowerVM, it could use msgsndp but would need a
>> +	 * way to distinguish.
>> +	 */
> 
> It's not clear what it needs to distinguish: That it's running under
> PowerVM and therefore *can* use msgsndp even though it's secure.
> 
> Also the comment just talks about the is_secure_guest() test, which is
> not obvious on first reading.
> 
>> +	if (cpu_has_feature(CPU_FTR_DBELL) &&
>> +	    cpu_has_feature(CPU_FTR_SMT) && !is_secure_guest()) {
>> +		cause_ipi_offcore = smp_ops->cause_ipi;
>> +		smp_ops->cause_ipi = smp_pseries_cause_ipi;
>> +	}
> 
> Because we're at the tail of the function I think this would be clearer
> if it used early returns, eg:
> 
> 	// If the CPU doesn't have doorbells then we must use xics/xive
> 	if (!cpu_has_feature(CPU_FTR_DBELL))
>         	return;
> 
> 	// If the CPU doesn't have SMT then doorbells don't help us
> 	if (!cpu_has_feature(CPU_FTR_SMT))
>         	return;
> 
> 	// Secure guests can't use doorbells because ...
> 	if (!is_secure_guest()
>         	return;
> 
> 	/*
>          * Otherwise we want to use doorbells for sibling threads and
>          * xics/xive for IPIs off the core, because it performs better
>          * on large systems ...
>          */
>         cause_ipi_offcore = smp_ops->cause_ipi;
> 	smp_ops->cause_ipi = smp_pseries_cause_ipi;
> }
> 
> 
> cheers
> 


^ permalink raw reply

* [PATCH v2 0/3] Off-load TLB invalidations to host for !GTSE
From: Bharata B Rao @ 2020-06-26 13:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin

Hypervisor may choose not to enable Guest Translation Shootdown Enable
(GTSE) option for the guest. When GTSE isn't ON, the guest OS isn't
permitted to use instructions like tblie and tlbsync directly, but is
expected to make hypervisor calls to get the TLB flushed.

This series enables the TLB flush routines in the radix code to
off-load TLB flushing to hypervisor via the newly proposed hcall
H_RPT_INVALIDATE. 

To easily check the availability of GTSE, it is made an MMU feature.
The OV5 handling and H_REGISTER_PROC_TBL hcall are changed to
handle GTSE as an optionally available feature and to not assume GTSE
when radix support is available.

The actual hcall implementation for KVM isn't included in this
patchset and will be posted separately.

Changes in v2
=============
- Dropped the patch that added H_RPT_INVALIDATE calls for the nested
  case. This patch will be posted separately along with KVM hcall
  implementation.
- Merged first two patches
- A few cleanups
- Rebased to powerpc/next

v1: https://lore.kernel.org/linuxppc-dev/20200618160930.26324-1-bharata@linux.ibm.com/

H_RPT_INVALIDATE
================
Syntax:
int64   /* H_Success: Return code on successful completion */
        /* H_Busy - repeat the call with the same */
        /* H_Parameter, H_P2, H_P3, H_P4, H_P5 : Invalid parameters */
        hcall(const uint64 H_RPT_INVALIDATE, /* Invalidate RPT translation lookaside information */
              uint64 pid,       /* PID/LPID to invalidate */
              uint64 target,    /* Invalidation target */
              uint64 type,      /* Type of lookaside information */
              uint64 pageSizes,     /* Page sizes */
              uint64 start,     /* Start of Effective Address (EA) range (inclusive) */
              uint64 end)       /* End of EA range (exclusive) */

Invalidation targets (target)
-----------------------------
Core MMU        0x01 /* All virtual processors in the partition */
Core local MMU  0x02 /* Current virtual processor */
Nest MMU        0x04 /* All nest/accelerator agents in use by the partition */

A combination of the above can be specified, except core and core local.

Type of translation to invalidate (type)
---------------------------------------
NESTED       0x0001  /* Invalidate nested guest partition-scope */
TLB          0x0002  /* Invalidate TLB */
PWC          0x0004  /* Invalidate Page Walk Cache */
PRT          0x0008  /* Invalidate Process Table Entries if NESTED is clear */
PAT          0x0008  /* Invalidate Partition Table Entries if NESTED is set */

A combination of the above can be specified.

Page size mask (pageSizes)
--------------------------
4K              0x01
64K             0x02
2M              0x04
1G              0x08
All sizes       (-1UL)

A combination of the above can be specified.
All page sizes can be selected with -1.

Semantics: Invalidate radix tree lookaside information
           matching the parameters given.
* Return H_P2, H_P3 or H_P4 if target, type, or pageSizes parameters are
  different from the defined values.
* Return H_PARAMETER if NESTED is set and pid is not a valid nested
  LPID allocated to this partition
* Return H_P5 if (start, end) doesn't form a valid range. Start and end
  should be a valid Quadrant address and  end > start.
* Return H_NotSupported if the partition is not in running in radix
  translation mode.
* May invalidate more translation information than requested.
* If start = 0 and end = -1, set the range to cover all valid addresses.
  Else start and end should be aligned to 4kB (lower 11 bits clear).
* If NESTED is clear, then invalidate process scoped lookaside information.
  Else pid specifies a nested LPID, and the invalidation is performed
  on nested guest partition table and nested guest partition scope real
  addresses.
* If pid = 0 and NESTED is clear, then valid addresses are quadrant 3 and
  quadrant 0 spaces, Else valid addresses are quadrant 0.
* Pages which are fully covered by the range are to be invalidated.
  Those which are partially covered are considered outside invalidation
  range, which allows a caller to optimally invalidate ranges that may
  contain mixed page sizes.
* Return H_SUCCESS on success.

Bharata B Rao (2):
  powerpc/mm: Enable radix GTSE only if supported.
  powerpc/pseries: H_REGISTER_PROC_TBL should ask for GTSE only if
    enabled

Nicholas Piggin (1):
  powerpc/mm/book3s64/radix: Off-load TLB invalidations to host when
    !GTSE

 .../include/asm/book3s/64/tlbflush-radix.h    | 15 ++++
 arch/powerpc/include/asm/hvcall.h             | 34 +++++++-
 arch/powerpc/include/asm/mmu.h                |  4 +
 arch/powerpc/include/asm/plpar_wrappers.h     | 50 +++++++++++
 arch/powerpc/kernel/dt_cpu_ftrs.c             |  1 +
 arch/powerpc/kernel/prom_init.c               | 13 +--
 arch/powerpc/mm/book3s64/radix_tlb.c          | 82 +++++++++++++++++--
 arch/powerpc/mm/init_64.c                     |  5 +-
 arch/powerpc/platforms/pseries/lpar.c         |  8 +-
 9 files changed, 195 insertions(+), 17 deletions(-)

-- 
2.21.3

^ permalink raw reply

* [PATCH v2 2/3] powerpc/pseries: H_REGISTER_PROC_TBL should ask for GTSE only if enabled
From: Bharata B Rao @ 2020-06-26 13:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200626131000.5207-1-bharata@linux.ibm.com>

H_REGISTER_PROC_TBL asks for GTSE by default. GTSE flag bit should
be set only when GTSE is supported.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/lpar.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fd26f3d21d7b..f82569a505f1 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -1680,9 +1680,11 @@ static int pseries_lpar_register_process_table(unsigned long base,
 
 	if (table_size)
 		flags |= PROC_TABLE_NEW;
-	if (radix_enabled())
-		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
-	else
+	if (radix_enabled()) {
+		flags |= PROC_TABLE_RADIX;
+		if (mmu_has_feature(MMU_FTR_GTSE))
+			flags |= PROC_TABLE_GTSE;
+	} else
 		flags |= PROC_TABLE_HPT_SLB;
 	for (;;) {
 		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
-- 
2.21.3


^ permalink raw reply related

* [PATCH v2 3/3] powerpc/mm/book3s64/radix: Off-load TLB invalidations to host when !GTSE
From: Bharata B Rao @ 2020-06-26 13:10 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200626131000.5207-1-bharata@linux.ibm.com>

From: Nicholas Piggin <npiggin@gmail.com>

When platform doesn't support GTSE, let TLB invalidation requests
for radix guests be off-loaded to the host using H_RPT_INVALIDATE
hcall.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
	[hcall wrapper, error path handling and renames]
---
 .../include/asm/book3s/64/tlbflush-radix.h    | 15 ++++
 arch/powerpc/include/asm/hvcall.h             | 34 +++++++-
 arch/powerpc/include/asm/plpar_wrappers.h     | 50 +++++++++++
 arch/powerpc/mm/book3s64/radix_tlb.c          | 82 +++++++++++++++++--
 4 files changed, 173 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index ca8db193ae38..e7cf50358411 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -2,10 +2,25 @@
 #ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H
 #define _ASM_POWERPC_TLBFLUSH_RADIX_H
 
+#include <asm/hvcall.h>
+
 struct vm_area_struct;
 struct mm_struct;
 struct mmu_gather;
 
+static inline u64 psize_to_h_rpti(unsigned long psize)
+{
+	if (psize == MMU_PAGE_4K)
+		return H_RPTI_PAGE_4K;
+	if (psize == MMU_PAGE_64K)
+		return H_RPTI_PAGE_64K;
+	if (psize == MMU_PAGE_2M)
+		return H_RPTI_PAGE_2M;
+	if (psize == MMU_PAGE_1G)
+		return H_RPTI_PAGE_1G;
+	return H_RPTI_PAGE_ALL;
+}
+
 static inline int mmu_get_ap(int psize)
 {
 	return mmu_psize_defs[psize].ap;
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e90c073e437e..43486e773bd6 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -305,7 +305,8 @@
 #define H_SCM_UNBIND_ALL        0x3FC
 #define H_SCM_HEALTH            0x400
 #define H_SCM_PERFORMANCE_STATS 0x418
-#define MAX_HCALL_OPCODE	H_SCM_PERFORMANCE_STATS
+#define H_RPT_INVALIDATE	0x448
+#define MAX_HCALL_OPCODE	H_RPT_INVALIDATE
 
 /* Scope args for H_SCM_UNBIND_ALL */
 #define H_UNBIND_SCOPE_ALL (0x1)
@@ -389,6 +390,37 @@
 #define PROC_TABLE_RADIX	0x04
 #define PROC_TABLE_GTSE		0x01
 
+/*
+ * Defines for
+ * H_RPT_INVALIDATE - Invalidate RPT translation lookaside information.
+ */
+
+/* Type of translation to invalidate (type) */
+#define H_RPTI_TYPE_NESTED	0x0001	/* Invalidate nested guest partition-scope */
+#define H_RPTI_TYPE_TLB		0x0002	/* Invalidate TLB */
+#define H_RPTI_TYPE_PWC		0x0004	/* Invalidate Page Walk Cache */
+/* Invalidate Process Table Entries if H_RPTI_TYPE_NESTED is clear */
+#define H_RPTI_TYPE_PRT		0x0008
+/* Invalidate Partition Table Entries if H_RPTI_TYPE_NESTED is set */
+#define H_RPTI_TYPE_PAT		0x0008
+#define H_RPTI_TYPE_ALL		(H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
+				 H_RPTI_TYPE_PRT)
+#define H_RPTI_TYPE_NESTED_ALL	(H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC | \
+				 H_RPTI_TYPE_PAT)
+
+/* Invalidation targets (target) */
+#define H_RPTI_TARGET_CMMU		0x01 /* All virtual processors in the partition */
+#define H_RPTI_TARGET_CMMU_LOCAL	0x02 /* Current virtual processor */
+/* All nest/accelerator agents in use by the partition */
+#define H_RPTI_TARGET_NMMU		0x04
+
+/* Page size mask (page sizes) */
+#define H_RPTI_PAGE_4K	0x01
+#define H_RPTI_PAGE_64K	0x02
+#define H_RPTI_PAGE_2M	0x04
+#define H_RPTI_PAGE_1G	0x08
+#define H_RPTI_PAGE_ALL (-1UL)
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 4497c8afb573..a184923abd07 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -334,6 +334,49 @@ static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
 	return rc;
 }
 
+/*
+ * Wrapper to H_RPT_INVALIDATE hcall that handles return values appropriately
+ *
+ * - Returns H_SUCCESS on success
+ * - For H_BUSY return value, we retry the hcall.
+ * - For any other hcall failures, attempt a full flush once before
+ *   resorting to BUG().
+ *
+ * Note: This hcall is expected to fail only very rarely. The correct
+ * error recovery of killing the process/guest will be eventually
+ * needed.
+ */
+static inline long pseries_rpt_invalidate(u32 pid, u64 target, u64 type,
+					  u64 page_sizes, u64 start, u64 end)
+{
+	long rc;
+	unsigned long all;
+
+	while (true) {
+		rc = plpar_hcall_norets(H_RPT_INVALIDATE, pid, target, type,
+					page_sizes, start, end);
+		if (rc == H_BUSY) {
+			cpu_relax();
+			continue;
+		} else if (rc == H_SUCCESS)
+			return rc;
+
+		/* Flush request failed, try with a full flush once */
+		all = (type & H_RPTI_TYPE_NESTED) ? H_RPTI_TYPE_NESTED_ALL :
+		      H_RPTI_TYPE_ALL;
+retry:
+		rc = plpar_hcall_norets(H_RPT_INVALIDATE, pid, target,
+					all, page_sizes, 0, -1UL);
+		if (rc == H_BUSY) {
+			cpu_relax();
+			goto retry;
+		} else if (rc == H_SUCCESS)
+			return rc;
+
+		BUG();
+	}
+}
+
 #else /* !CONFIG_PPC_PSERIES */
 
 static inline long plpar_set_ciabr(unsigned long ciabr)
@@ -346,6 +389,13 @@ static inline long plpar_pte_read_4(unsigned long flags, unsigned long ptex,
 {
 	return 0;
 }
+
+static inline long pseries_rpt_invalidate(u32 pid, u64 target, u64 type,
+					  u64 page_sizes, u64 start, u64 end)
+{
+	return 0;
+}
+
 #endif /* CONFIG_PPC_PSERIES */
 
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b5cc9b23cf02..180d8ddcf6e3 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -16,6 +16,7 @@
 #include <asm/tlbflush.h>
 #include <asm/trace.h>
 #include <asm/cputhreads.h>
+#include <asm/plpar_wrappers.h>
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
@@ -694,7 +695,14 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 			goto local;
 		}
 
-		if (cputlb_use_tlbie()) {
+		if (!mmu_has_feature(MMU_FTR_GTSE)) {
+			unsigned long tgt = H_RPTI_TARGET_CMMU;
+
+			if (atomic_read(&mm->context.copros) > 0)
+				tgt |= H_RPTI_TARGET_NMMU;
+			pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+					       H_RPTI_PAGE_ALL, 0, -1UL);
+		} else if (cputlb_use_tlbie()) {
 			if (mm_needs_flush_escalation(mm))
 				_tlbie_pid(pid, RIC_FLUSH_ALL);
 			else
@@ -727,7 +735,16 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 				goto local;
 			}
 		}
-		if (cputlb_use_tlbie())
+		if (!mmu_has_feature(MMU_FTR_GTSE)) {
+			unsigned long tgt = H_RPTI_TARGET_CMMU;
+			unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+					     H_RPTI_TYPE_PRT;
+
+			if (atomic_read(&mm->context.copros) > 0)
+				tgt |= H_RPTI_TARGET_NMMU;
+			pseries_rpt_invalidate(pid, tgt, type,
+					       H_RPTI_PAGE_ALL, 0, -1UL);
+		} else if (cputlb_use_tlbie())
 			_tlbie_pid(pid, RIC_FLUSH_ALL);
 		else
 			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
@@ -760,7 +777,19 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 			exit_flush_lazy_tlbs(mm);
 			goto local;
 		}
-		if (cputlb_use_tlbie())
+		if (!mmu_has_feature(MMU_FTR_GTSE)) {
+			unsigned long tgt, page_sizes, size;
+
+			tgt = H_RPTI_TARGET_CMMU;
+			page_sizes = psize_to_h_rpti(psize);
+			size = 1UL << mmu_psize_to_shift(psize);
+
+			if (atomic_read(&mm->context.copros) > 0)
+				tgt |= H_RPTI_TARGET_NMMU;
+			pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB,
+					       page_sizes, vmaddr,
+					       vmaddr + size);
+		} else if (cputlb_use_tlbie())
 			_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 		else
 			_tlbiel_va_multicast(mm, vmaddr, pid, psize, RIC_FLUSH_TLB);
@@ -810,7 +839,14 @@ static inline void _tlbiel_kernel_broadcast(void)
  */
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	if (cputlb_use_tlbie())
+	if (!mmu_has_feature(MMU_FTR_GTSE)) {
+		unsigned long tgt = H_RPTI_TARGET_CMMU | H_RPTI_TARGET_NMMU;
+		unsigned long type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+				     H_RPTI_TYPE_PRT;
+
+		pseries_rpt_invalidate(0, tgt, type, H_RPTI_PAGE_ALL,
+				       start, end);
+	} else if (cputlb_use_tlbie())
 		_tlbie_pid(0, RIC_FLUSH_ALL);
 	else
 		_tlbiel_kernel_broadcast();
@@ -864,7 +900,17 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 				nr_pages > tlb_local_single_page_flush_ceiling);
 	}
 
-	if (full) {
+	if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+		unsigned long tgt = H_RPTI_TARGET_CMMU;
+		unsigned long page_sizes = psize_to_h_rpti(mmu_virtual_psize);
+
+		if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+			page_sizes |= psize_to_h_rpti(MMU_PAGE_2M);
+		if (atomic_read(&mm->context.copros) > 0)
+			tgt |= H_RPTI_TARGET_NMMU;
+		pseries_rpt_invalidate(pid, tgt, H_RPTI_TYPE_TLB, page_sizes,
+				       start, end);
+	} else if (full) {
 		if (local) {
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		} else {
@@ -1046,7 +1092,17 @@ static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 				nr_pages > tlb_local_single_page_flush_ceiling);
 	}
 
-	if (full) {
+	if (!mmu_has_feature(MMU_FTR_GTSE) && !local) {
+		unsigned long tgt = H_RPTI_TARGET_CMMU;
+		unsigned long type = H_RPTI_TYPE_TLB;
+		unsigned long page_sizes = psize_to_h_rpti(psize);
+
+		if (also_pwc)
+			type |= H_RPTI_TYPE_PWC;
+		if (atomic_read(&mm->context.copros) > 0)
+			tgt |= H_RPTI_TARGET_NMMU;
+		pseries_rpt_invalidate(pid, tgt, type, page_sizes, start, end);
+	} else if (full) {
 		if (local) {
 			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		} else {
@@ -1111,7 +1167,19 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 			exit_flush_lazy_tlbs(mm);
 			goto local;
 		}
-		if (cputlb_use_tlbie())
+		if (!mmu_has_feature(MMU_FTR_GTSE)) {
+			unsigned long tgt, type, page_sizes;
+
+			tgt = H_RPTI_TARGET_CMMU;
+			type = H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
+			       H_RPTI_TYPE_PRT;
+			page_sizes = psize_to_h_rpti(mmu_virtual_psize);
+
+			if (atomic_read(&mm->context.copros) > 0)
+				tgt |= H_RPTI_TARGET_NMMU;
+			pseries_rpt_invalidate(pid, tgt, type, page_sizes,
+					       addr, end);
+		} else if (cputlb_use_tlbie())
 			_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 		else
 			_tlbiel_va_range_multicast(mm,
-- 
2.21.3


^ permalink raw reply related

* [PATCH v2 1/3] powerpc/mm: Enable radix GTSE only if supported.
From: Bharata B Rao @ 2020-06-26 13:09 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: aneesh.kumar, Bharata B Rao, npiggin
In-Reply-To: <20200626131000.5207-1-bharata@linux.ibm.com>

Make GTSE an MMU feature and enable it by default for radix.
However for guest, conditionally enable it if hypervisor supports
it via OV5 vector. Let prom_init ask for radix GTSE only if the
support exists.

Having GTSE as an MMU feature will make it easy to enable radix
without GTSE. Currently radix assumes GTSE is enabled by default.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
---
 arch/powerpc/include/asm/mmu.h    |  4 ++++
 arch/powerpc/kernel/dt_cpu_ftrs.c |  1 +
 arch/powerpc/kernel/prom_init.c   | 13 ++++++++-----
 arch/powerpc/mm/init_64.c         |  5 ++++-
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index f4ac25d4df05..884d51995934 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -28,6 +28,9 @@
  * Individual features below.
  */
 
+/* Guest Translation Shootdown Enable */
+#define MMU_FTR_GTSE			ASM_CONST(0x00001000)
+
 /*
  * Support for 68 bit VA space. We added that from ISA 2.05
  */
@@ -173,6 +176,7 @@ enum {
 #endif
 #ifdef CONFIG_PPC_RADIX_MMU
 		MMU_FTR_TYPE_RADIX |
+		MMU_FTR_GTSE |
 #ifdef CONFIG_PPC_KUAP
 		MMU_FTR_RADIX_KUAP |
 #endif /* CONFIG_PPC_KUAP */
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index a0edeb391e3e..ac650c233cd9 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -336,6 +336,7 @@ static int __init feat_enable_mmu_radix(struct dt_cpu_feature *f)
 #ifdef CONFIG_PPC_RADIX_MMU
 	cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
 	cur_cpu_spec->mmu_features |= MMU_FTRS_HASH_BASE;
+	cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
 	cur_cpu_spec->cpu_user_features |= PPC_FEATURE_HAS_MMU;
 
 	return 1;
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 90c604d00b7d..cbc605cfdec0 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -1336,12 +1336,15 @@ static void __init prom_check_platform_support(void)
 		}
 	}
 
-	if (supported.radix_mmu && supported.radix_gtse &&
-	    IS_ENABLED(CONFIG_PPC_RADIX_MMU)) {
-		/* Radix preferred - but we require GTSE for now */
-		prom_debug("Asking for radix with GTSE\n");
+	if (supported.radix_mmu && IS_ENABLED(CONFIG_PPC_RADIX_MMU)) {
+		/* Radix preferred - Check if GTSE is also supported */
+		prom_debug("Asking for radix\n");
 		ibm_architecture_vec.vec5.mmu = OV5_FEAT(OV5_MMU_RADIX);
-		ibm_architecture_vec.vec5.radix_ext = OV5_FEAT(OV5_RADIX_GTSE);
+		if (supported.radix_gtse)
+			ibm_architecture_vec.vec5.radix_ext =
+					OV5_FEAT(OV5_RADIX_GTSE);
+		else
+			prom_debug("Radix GTSE isn't supported\n");
 	} else if (supported.hash_mmu) {
 		/* Default to hash mmu (if we can) */
 		prom_debug("Asking for hash\n");
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index bc73abf0bc25..152aa0200cef 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -407,12 +407,15 @@ static void __init early_check_vec5(void)
 		if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
 						OV5_FEAT(OV5_RADIX_GTSE))) {
 			pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
-		}
+			cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
+		} else
+			cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
 		/* Do radix anyway - the hypervisor said we had to */
 		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
 	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
 		/* Hypervisor only supports hash - disable radix */
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
+		cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
 	}
 }
 
-- 
2.21.3


^ permalink raw reply related

* Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-06-26 15:23 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200624062411.367796-6-leobras.c@gmail.com>

On Wed, 2020-06-24 at 03:24 -0300, Leonardo Bras wrote:
> As of today, if a DDW is created and can't map the whole partition, it's
> removed and the default DMA window "ibm,dma-window" is used instead.
> 
> Usually this DDW is bigger than the default DMA window, so it would be
> better to make use of it instead.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---

I tested this change with a 256GB DDW which did not map the whole
partition, with a MT27700 Family [ConnectX-4 Virtual Function].

I noticed the performance improvement is about the same as using DDW
with IOMMU bypass.

64 thread write throughput: +203.0%
64 thread read throughput: +17.5%
1 thread write throughput: +20.5%
1 thread read throughput: +3.43%
Averag
e write latency: -23.0%
Average read latency:  -2.26%



^ permalink raw reply

* Re: [PATCH v2 00/15] Documentation fixes
From: Jonathan Corbet @ 2020-06-26 16:13 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: linux-ia64, Linux Doc Mailing List, Peter Zijlstra (Intel),
	linux-pci, Ram Pai, James E.J. Bottomley, linux-mm, Eric Dumazet,
	netdev, Paul Mackerras, Sandipan Das, linux-kselftest,
	H. Peter Anvin, Jan Kara, Sukadev Bhattiprolu, Shuah Khan,
	Christoph Hellwig, Marek Szyprowski, Stephen Rothwell,
	Florian Fainelli, Will Deacon, Helge Deller, x86, Haren Myneni,
	Russell King, kasan-dev, Ingo Molnar, Gerald Schaefer,
	Jakub Kicinski, Alexey Dobriyan, linux-media, Fenghua Yu,
	Marco Elver, Kees Cook, Robin Murphy, Borislav Petkov,
	Alexander Viro, Bjorn Helgaas, Thomas Gleixner, Dmitry Vyukov,
	Tony Luck, linux-parisc, Dave Hansen, Alexey Gladkov,
	Akira Shimahara, Jeff Layton, linux-kernel, iommu,
	Eric W. Biederman, Greg Kroah-Hartman, linux-fsdevel,
	Andrew Morton, linuxppc-dev, David S. Miller,
	Thiago Jung Bauermann, Mike Kravetz
In-Reply-To: <cover.1592895969.git.mchehab+huawei@kernel.org>

On Tue, 23 Jun 2020 09:08:56 +0200
Mauro Carvalho Chehab <mchehab+huawei@kernel.org> wrote:

> As requested, this is a rebase of a previous series posted on Jan, 15.
> 
> Since then, several patches got merged via other trees or became
> obsolete. There were also 2 patches before that fits better at the
> ReST conversion patchset. So, I'll be sending it on another patch
> series together with the remaining ReST conversions.
> 
> I also added reviews/acks received.
> 
> So, the series reduced from 29 to 15 patches.
> 
> Let's hope b4 would be able to properly handle this one.

Nope.  I don't know what it is about your patch series, but b4 is never
able to put them together.

I've applied the series except for #1, which already went through the -mm
tree.

Thanks,

jon

^ permalink raw reply

* [PATCH] selftests/powerpc: Purge extra count_pmc() calls of ebb selftests
From: Desnes A. Nunes do Rosario @ 2020-06-26 16:47 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: desnesn, shuah

An extra count on ebb_state.stats.pmc_count[PMC_INDEX(pmc)] is being per-
formed when count_pmc() is used to reset PMCs on a few selftests. This
extra pmc_count can occasionally invalidate results, such as the ones from
cycles_test shown hereafter. The ebb_check_count() failed with an above
the upper limit error due to the extra value on ebb_state.stats.pmc_count.

Furthermore, this extra count is also indicated by extra PMC1 trace_log on
the output of the cycle test (as well as on pmc56_overflow_test):

==========
   ...
   [21]: counter = 8
   [22]: register SPRN_MMCR0 = 0x0000000080000080
   [23]: register SPRN_PMC1  = 0x0000000080000004
   [24]: counter = 9
   [25]: register SPRN_MMCR0 = 0x0000000080000080
   [26]: register SPRN_PMC1  = 0x0000000080000004
   [27]: counter = 10
   [28]: register SPRN_MMCR0 = 0x0000000080000080
   [29]: register SPRN_PMC1  = 0x0000000080000004
>> [30]: register SPRN_PMC1  = 0x000000004000051e
PMC1 count (0x280000546) above upper limit 0x2800003e8 (+0x15e)
[FAIL] Test FAILED on line 52
failure: cycles
==========

Signed-off-by: Desnes A. Nunes do Rosario <desnesn@linux.ibm.com>
---
 .../selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c     | 2 --
 tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c      | 2 --
 .../selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c    | 2 --
 .../selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c     | 2 --
 tools/testing/selftests/powerpc/pmu/ebb/ebb.c              | 2 --
 .../selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c  | 2 --
 .../selftests/powerpc/pmu/ebb/lost_exception_test.c        | 1 -
 .../testing/selftests/powerpc/pmu/ebb/multi_counter_test.c | 7 -------
 .../selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c       | 2 --
 .../testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c | 2 --
 .../selftests/powerpc/pmu/ebb/pmc56_overflow_test.c        | 2 --
 11 files changed, 26 deletions(-)

diff --git a/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c b/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c
index a2d7b0e3dca9..a26ac122c759 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c
@@ -91,8 +91,6 @@ int back_to_back_ebbs(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	event_close(&event);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c
index bc893813483e..bb9f587fa76e 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c
@@ -42,8 +42,6 @@ int cycles(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	event_close(&event);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c
index dcd351d20328..9ae795ce314e 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c
@@ -99,8 +99,6 @@ int cycles_with_freeze(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	printf("EBBs while frozen %d\n", ebbs_while_frozen);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c
index 94c99c12c0f2..4b45a2e70f62 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c
@@ -71,8 +71,6 @@ int cycles_with_mmcr2(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	event_close(&event);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
index dfbc5c3ad52d..21537d6eb6b7 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
@@ -396,8 +396,6 @@ int ebb_child(union pipe read_pipe, union pipe write_pipe)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	event_close(&event);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c
index ca2f7d729155..b208bf6ad58d 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c
@@ -38,8 +38,6 @@ static int victim_child(union pipe read_pipe, union pipe write_pipe)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	FAIL_IF(ebb_state.stats.ebb_count == 0);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c b/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c
index ac3e6e182614..ba2681a12cc7 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c
@@ -75,7 +75,6 @@ static int test_body(void)
 	ebb_freeze_pmcs();
 	ebb_global_disable();
 
-	count_pmc(4, sample_period);
 	mtspr(SPRN_PMC4, 0xdead);
 
 	dump_summary_ebb_state();
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c b/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c
index b8242e9d97d2..791d37ba327b 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c
@@ -70,13 +70,6 @@ int multi_counter(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-	count_pmc(2, sample_period);
-	count_pmc(3, sample_period);
-	count_pmc(4, sample_period);
-	count_pmc(5, sample_period);
-	count_pmc(6, sample_period);
-
 	dump_ebb_state();
 
 	for (i = 0; i < 6; i++)
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c b/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c
index a05c0e18ded6..9b0f70d59702 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c
@@ -61,8 +61,6 @@ static int cycles_child(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_summary_ebb_state();
 
 	event_close(&event);
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c b/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c
index 153ebc92234f..2904c741e04e 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c
@@ -82,8 +82,6 @@ static int test_body(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(1, sample_period);
-
 	dump_ebb_state();
 
 	if (mmcr0_mismatch)
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c b/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c
index eadad75ed7e6..b29f8ba22d1e 100644
--- a/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c
+++ b/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c
@@ -76,8 +76,6 @@ int pmc56_overflow(void)
 	ebb_global_disable();
 	ebb_freeze_pmcs();
 
-	count_pmc(2, sample_period);
-
 	dump_ebb_state();
 
 	printf("PMC5/6 overflow %d\n", pmc56_overflowed);
-- 
2.21.3


^ permalink raw reply related

* Re: [PATCH v2 6/6] powerpc/pseries/iommu: Avoid errors when DDW starts at 0x00
From: Leonardo Bras @ 2020-06-26 17:46 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200624062411.367796-7-leobras.c@gmail.com>

On Wed, 2020-06-24 at 03:24 -0300, Leonardo Bras wrote:
> As of today, enable_ddw() will return a non-null DMA address if the
> created DDW maps the whole partition. If the address is valid,
> iommu_bypass_supported_pSeriesLP() will consider iommu bypass enabled.
> 
> This can cause some trouble if the DDW happens to start at 0x00.
> 
> Instead if checking if the address is non-null, check directly if
> the DDW maps the whole partition, so it can bypass iommu.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>

This patch has a bug in it. I will rework it soon.
Please keep reviewing patches 1-5.

Best regards,
Leonardo


^ permalink raw reply

* Re: [PATCH v2 5/6] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition
From: Leonardo Bras @ 2020-06-26 17:55 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Alexey Kardashevskiy, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <14da928d1cae8779a93228455c4873c127eeffa6.camel@gmail.com>

On Fri, 2020-06-26 at 12:23 -0300, Leonardo Bras wrote:
> On Wed, 2020-06-24 at 03:24 -0300, Leonardo Bras wrote:
> > As of today, if a DDW is created and can't map the whole partition, it's
> > removed and the default DMA window "ibm,dma-window" is used instead.
> > 
> > Usually this DDW is bigger than the default DMA window, so it would be
> > better to make use of it instead.
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> 
> I tested this change with a 256GB DDW which did not map the whole
> partition, with a MT27700 Family [ConnectX-4 Virtual Function].
> 
> I noticed the performance improvement is about the same as using DDW
> with IOMMU bypass.
> 
> 64 thread write throughput: +203.0%
> 64 thread read throughput: +17.5%
> 1 thread write throughput: +20.5%
> 1 thread read throughput: +3.43%
> Average write latency: -23.0%
> Average read latency:  -2.26%

The above improvements are based on the default DMA window, which is
currently used if DDW can't map the whole partition.

Those values are an average of 20 tests for each environment, 30
seconds each test.

I also did some intense testing, for 5 hour each:
64 thread write throughput 
64 thread read throughput

The throughput values are stable in the whole test, and I noticed no
error on dmesg / journalctl.


^ permalink raw reply

* Re: [PATCH 0/8 v2] PCI: Align return values of PCIe capability and PCI accessors
From: Bjorn Helgaas @ 2020-06-26 18:52 UTC (permalink / raw)
  To: refactormyself
  Cc: Don Brace, Sam Bobroff, Mike Marciniszyn, linux-scsi,
	Martin K. Petersen, Jason Gunthorpe, linux-rdma, linux-pci,
	Dennis Dalessandro, esc.storagedev, Doug Ledford, linux-kernel,
	dmaengine, Vinod Koul, Oliver O'Halloran, skhan, bjorn,
	linuxppc-dev, James E.J. Bottomley, linux-kernel-mentees
In-Reply-To: <20200615073225.24061-1-refactormyself@gmail.com>

On Mon, Jun 15, 2020 at 09:32:17AM +0200, refactormyself@gmail.com wrote:
> From: Bolarinwa Olayemi Saheed <refactormyself@gmail.com>
> 
> 
> PATCH 1/8 to 7/8:
> PCIBIOS_ error codes have positive values and they are passed down the
> call heirarchy from accessors. For functions which are meant to return
> only a negative value on failure, passing on this value is a bug.
> To mitigate this, call pcibios_err_to_errno() before passing on return
> value from PCIe capability accessors call heirarchy. This function
> converts any positive PCIBIOS_ error codes to negative generic error
> values.
> 
> PATCH 8/8:
> The PCIe capability accessors can return 0, -EINVAL, or any PCIBIOS_ error
> code. The pci accessor on the other hand can only return 0 or any PCIBIOS_
> error code.This inconsistency among these accessor makes it harder for
> callers to check for errors.
> Return PCIBIOS_BAD_REGISTER_NUMBER instead of -EINVAL in all PCIe
> capability accessors.
> 
> MERGING:
> These may all be merged via the PCI tree, since it is a collection of
> similar fixes. This way they all get merged at once.
> 
> Version 2:
> * cc to maintainers and mailing lists
> * Edit the Subject to conform with previous style
> * reorder "Signed by" and "Suggested by"
> * made spelling corrections
> * fixed redundant initialisation in PATCH 3/8
> * include missing call to pcibios_err_to_errno() in PATCH 6/8 and 7/8
> 
> 
> Bolarinwa Olayemi Saheed (8):
>   dmaengine: ioatdma: Convert PCIBIOS_* errors to generic -E* errors
>   IB/hfi1: Convert PCIBIOS_* errors to generic -E* errors
>   IB/hfi1: Convert PCIBIOS_* errors to generic -E* errors
>   PCI: Convert PCIBIOS_* errors to generic -E* errors
>   scsi: smartpqi: Convert PCIBIOS_* errors to generic -E* errors
>   PCI/AER: Convert PCIBIOS_* errors to generic -E* errors
>   PCI/AER: Convert PCIBIOS_* errors to generic -E* errors
>   PCI: Align return values of PCIe capability and PCI accessorss
> 
>  drivers/dma/ioat/init.c               |  4 ++--
>  drivers/infiniband/hw/hfi1/pcie.c     | 18 +++++++++++++-----
>  drivers/pci/access.c                  |  8 ++++----
>  drivers/pci/pci.c                     | 10 ++++++++--
>  drivers/pci/pcie/aer.c                | 12 ++++++++++--
>  drivers/scsi/smartpqi/smartpqi_init.c |  6 +++++-
>  6 files changed, 42 insertions(+), 16 deletions(-)

Since these are really fixing a single PCI API problem, not individual
driver-related problems, I squashed the pcibios_err_to_errno() patches
together (except IB/hfi1, since Jason will take those separately) and
applied them to pci/misc, thanks!

The squashed patch as applied is:

commit d20df83b66cc ("PCI: Convert PCIe capability PCIBIOS errors to errno")
Author: Bolarinwa Olayemi Saheed <refactormyself@gmail.com>
Date:   Mon Jun 15 09:32:18 2020 +0200

    PCI: Convert PCIe capability PCIBIOS errors to errno
    
    The PCI config accessors (pci_read_config_word(), et al) return
    PCIBIOS_SUCCESSFUL (zero) or positive error values like
    PCIBIOS_FUNC_NOT_SUPPORTED.
    
    The PCIe capability accessors (pcie_capability_read_word(), et al)
    similarly return PCIBIOS errors, but some callers assume they return
    generic errno values like -EINVAL.
    
    For example, the Myri-10G probe function returns a positive PCIBIOS error
    if the pcie_capability_clear_and_set_word() in pcie_set_readrq() fails:
    
      myri10ge_probe
        status = pcie_set_readrq
          return pcie_capability_clear_and_set_word
        if (status)
          return status
    
    A positive return from a PCI driver probe function would cause a "Driver
    probe function unexpectedly returned" warning from local_pci_probe()
    instead of the desired probe failure.
    
    Convert PCIBIOS errors to generic errno for all callers of:
    
      pcie_capability_read_word
      pcie_capability_read_dword
      pcie_capability_write_word
      pcie_capability_write_dword
      pcie_capability_set_word
      pcie_capability_set_dword
      pcie_capability_clear_word
      pcie_capability_clear_dword
      pcie_capability_clear_and_set_word
      pcie_capability_clear_and_set_dword
    
    that check the return code for anything other than zero.
    
    [bhelgaas: commit log, squash together]
    Suggested-by: Bjorn Helgaas <bjorn@helgaas.com>
    Link: https://lore.kernel.org/r/20200615073225.24061-1-refactormyself@gmail.com
    Signed-off-by: Bolarinwa Olayemi Saheed <refactormyself@gmail.com>
    Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

diff --git a/drivers/dma/ioat/init.c b/drivers/dma/ioat/init.c
index 58d13564f88b..9a6a9ec3cf48 100644
--- a/drivers/dma/ioat/init.c
+++ b/drivers/dma/ioat/init.c
@@ -1195,13 +1195,13 @@ static int ioat3_dma_probe(struct ioatdma_device *ioat_dma, int dca)
 	/* disable relaxed ordering */
 	err = pcie_capability_read_word(pdev, IOAT_DEVCTRL_OFFSET, &val16);
 	if (err)
-		return err;
+		return pcibios_err_to_errno(err);
 
 	/* clear relaxed ordering enable */
 	val16 &= ~IOAT_DEVCTRL_ROE;
 	err = pcie_capability_write_word(pdev, IOAT_DEVCTRL_OFFSET, val16);
 	if (err)
-		return err;
+		return pcibios_err_to_errno(err);
 
 	if (ioat_dma->cap & IOAT_CAP_DPS)
 		writeb(ioat_pending_level + 1,
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index ce096272f52b..45c51aff9c03 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5688,6 +5688,7 @@ EXPORT_SYMBOL(pcie_get_readrq);
 int pcie_set_readrq(struct pci_dev *dev, int rq)
 {
 	u16 v;
+	int ret;
 
 	if (rq < 128 || rq > 4096 || !is_power_of_2(rq))
 		return -EINVAL;
@@ -5706,8 +5707,10 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
 
 	v = (ffs(rq) - 8) << 12;
 
-	return pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
+	ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
 						  PCI_EXP_DEVCTL_READRQ, v);
+
+	return pcibios_err_to_errno(ret);
 }
 EXPORT_SYMBOL(pcie_set_readrq);
 
@@ -5738,6 +5741,7 @@ EXPORT_SYMBOL(pcie_get_mps);
 int pcie_set_mps(struct pci_dev *dev, int mps)
 {
 	u16 v;
+	int ret;
 
 	if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
 		return -EINVAL;
@@ -5747,8 +5751,10 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
 		return -EINVAL;
 	v <<= 5;
 
-	return pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
+	ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
 						  PCI_EXP_DEVCTL_PAYLOAD, v);
+
+	return pcibios_err_to_errno(ret);
 }
 EXPORT_SYMBOL(pcie_set_mps);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3acf56683915..2dbc1fd2910b 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -224,20 +224,25 @@ int pcie_aer_is_native(struct pci_dev *dev)
 
 int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
+	int rc;
+
 	if (!pcie_aer_is_native(dev))
 		return -EIO;
 
-	return pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);
+	rc = pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);
+	return pcibios_err_to_errno(rc);
 }
 EXPORT_SYMBOL_GPL(pci_enable_pcie_error_reporting);
 
 int pci_disable_pcie_error_reporting(struct pci_dev *dev)
 {
+	int rc;
+
 	if (!pcie_aer_is_native(dev))
 		return -EIO;
 
-	return pcie_capability_clear_word(dev, PCI_EXP_DEVCTL,
-					  PCI_EXP_AER_FLAGS);
+	rc = pcie_capability_clear_word(dev, PCI_EXP_DEVCTL, PCI_EXP_AER_FLAGS);
+	return pcibios_err_to_errno(rc);
 }
 EXPORT_SYMBOL_GPL(pci_disable_pcie_error_reporting);
 
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index cd157f11eb22..bd38c8cea56e 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -7423,8 +7423,12 @@ static int pqi_ctrl_init_resume(struct pqi_ctrl_info *ctrl_info)
 static inline int pqi_set_pcie_completion_timeout(struct pci_dev *pci_dev,
 	u16 timeout)
 {
-	return pcie_capability_clear_and_set_word(pci_dev, PCI_EXP_DEVCTL2,
+	int rc;
+
+	rc = pcie_capability_clear_and_set_word(pci_dev, PCI_EXP_DEVCTL2,
 		PCI_EXP_DEVCTL2_COMP_TIMEOUT, timeout);
+
+	return pcibios_err_to_errno(rc);
 }
 
 static int pqi_pci_init(struct pqi_ctrl_info *ctrl_info)



^ permalink raw reply related

* [PATCH 00/11] ppc64: enable kdump support for kexec_file_load syscall
From: Hari Bathini @ 2020-06-26 19:04 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Petr Tesarik, Mahesh J Salgaonkar,
	Sourabh Jain, lkml, linuxppc-dev, Mimi Zohar, Vivek Goyal,
	Dave Young, Thiago Jung Bauermann, Eric Biederman

This patch series enables kdump support for kexec_file_load system
call (kexec -s -p) on PPC64. The changes are inspired from kexec-tools
code but heavily modified for kernel consumption. There is scope to
expand purgatory to verify sha digest but tried to keep purgatory
changes minimal in the interest of this series.

The first patch adds a weak arch_kexec_add_buffer function to override
locate memory hole logic suiting arch needs. There are some special
regions in ppc64 which should be avoided while loading buffer & there
are multiple callers to kexec_add_buffer making it too complicated to
maintain range sanity and using generic lookup at the same time.

The second patch marks ppc64 specific code within arch/powerpc/kexec
and arch/powerpc/purgatory to make the subsequent code changes easy
to understand.

The next patch adds helper function to setup different memory ranges
needed for loading kdump kernel, booting into it and exporting the
crashing kernel's elfcore.

The fourth patch overrides arch_kexec_add_buffer to locate memory
hole for kdump segments by accounting for the special memory regions,
referred to as excluded memory ranges, and calls __kexec_add_buffer
with kbuf->mem set to skip the generic locate memory hole lookup.

The fifth patch moves walk_drmem_lmbs() out of .init section with
a few changes to reuse it for setting up kdump kernel's usable memory
ranges. The next patch uses walk_drmem_lmbs() to look up the LMBs
and set linux,drconf-usable-memory & linux,usable-memory properties
in order to restrict kdump kernel's memory usage.

The seventh patch adds relocation support for the purgatory. Patch 8
helps setup the stack for the purgatory. The next patch setups up
backup region as a segment while loading kdump kernel and teaches
purgatory to copy it from source to destination.

Patch 10 builds the elfcore header for the running kernel & passes
the info to kdump kernel via "elfcorehdr=" parameter to export as
/proc/vmcore file. The last patch sets up the memory reserve map
for the kexec kernel and also claims kdump support as all the
necessary changes are added.

Tested the changes successfully on P8, P9 lpars & an OpenPOWER box.

---

Hari Bathini (11):
      kexec_file: allow archs to handle special regions while locating memory hole
      powerpc/kexec_file: mark PPC64 specific code
      powerpc/kexec_file: add helper functions for getting memory ranges
      ppc64/kexec_file: avoid stomping memory used by special regions
      powerpc/drmem: make lmb walk a bit more flexible
      ppc64/kexec_file: restrict memory usage of kdump kernel
      ppc64/kexec_file: add support to relocate purgatory
      ppc64/kexec_file: setup the stack for purgatory
      ppc64/kexec_file: setup backup region for kdump kernel
      ppc64/kexec_file: prepare elfcore header for crashing kernel
      ppc64/kexec_file: add appropriate regions for memory reserve map

 arch/powerpc/include/asm/crashdump-ppc64.h |   15 
 arch/powerpc/include/asm/drmem.h           |    9 
 arch/powerpc/include/asm/kexec.h           |   35 +
 arch/powerpc/include/asm/kexec_ranges.h    |   18 
 arch/powerpc/kernel/prom.c                 |   13 
 arch/powerpc/kexec/Makefile                |    2 
 arch/powerpc/kexec/elf_64.c                |   35 +
 arch/powerpc/kexec/file_load.c             |   78 +
 arch/powerpc/kexec/file_load_64.c          | 1461 ++++++++++++++++++++++++++++
 arch/powerpc/kexec/ranges.c                |  387 +++++++
 arch/powerpc/mm/drmem.c                    |   87 +-
 arch/powerpc/mm/numa.c                     |   13 
 arch/powerpc/purgatory/Makefile            |   28 -
 arch/powerpc/purgatory/purgatory_64.c      |   35 +
 arch/powerpc/purgatory/trampoline.S        |  117 --
 arch/powerpc/purgatory/trampoline_64.S     |  153 +++
 include/linux/kexec.h                      |    5 
 kernel/kexec_file.c                        |   37 +
 18 files changed, 2327 insertions(+), 201 deletions(-)
 create mode 100644 arch/powerpc/include/asm/crashdump-ppc64.h
 create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
 create mode 100644 arch/powerpc/kexec/file_load_64.c
 create mode 100644 arch/powerpc/kexec/ranges.c
 create mode 100644 arch/powerpc/purgatory/purgatory_64.c
 delete mode 100644 arch/powerpc/purgatory/trampoline.S
 create mode 100644 arch/powerpc/purgatory/trampoline_64.S

^ permalink raw reply

* [PATCH 01/11] kexec_file: allow archs to handle special regions while locating memory hole
From: Hari Bathini @ 2020-06-26 19:04 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Petr Tesarik, Mahesh J Salgaonkar,
	Sourabh Jain, lkml, linuxppc-dev, Mimi Zohar, Vivek Goyal,
	Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159319825403.16351.7253978047621755765.stgit@hbathini.in.ibm.com>

Some archs can have special memory regions, within the given memory
range, which can't be used for the buffer in a kexec segment. As
kexec_add_buffer() function is being called from generic code as well,
add weak arch_kexec_add_buffer definition for archs to override & take
care of special regions before trying to locate a memory hole.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
 include/linux/kexec.h |    5 +++++
 kernel/kexec_file.c   |   37 +++++++++++++++++++++++++++++++++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 1776eb2..1237682 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -195,6 +195,11 @@ int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
 					const Elf_Shdr *relsec,
 					const Elf_Shdr *symtab);
 
+extern int arch_kexec_add_buffer(struct kexec_buf *kbuf);
+
+/* arch_kexec_add_buffer calls this when it is ready */
+extern int __kexec_add_buffer(struct kexec_buf *kbuf);
+
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd5..a0b4f7f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -669,10 +669,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
  */
 int kexec_add_buffer(struct kexec_buf *kbuf)
 {
-
-	struct kexec_segment *ksegment;
-	int ret;
-
 	/* Currently adding segment this way is allowed only in file mode */
 	if (!kbuf->image->file_mode)
 		return -EINVAL;
@@ -696,6 +692,25 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
 	kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
 
+	return arch_kexec_add_buffer(kbuf);
+}
+
+/**
+ * __kexec_add_buffer - arch_kexec_add_buffer would call this function after
+ *                      updating kbuf, to place a buffer in a kexec segment.
+ * @kbuf:               Buffer contents and memory parameters.
+ *
+ * This function assumes that kexec_mutex is held.
+ * On successful return, @kbuf->mem will have the physical address of
+ * the buffer in memory.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __kexec_add_buffer(struct kexec_buf *kbuf)
+{
+	struct kexec_segment *ksegment;
+	int ret;
+
 	/* Walk the RAM ranges and allocate a suitable range for the buffer */
 	ret = kexec_locate_mem_hole(kbuf);
 	if (ret)
@@ -711,6 +726,20 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
 	return 0;
 }
 
+/**
+ * arch_kexec_add_buffer - Some archs have memory regions within the given
+ *                         range that can't be used to place a kexec segment.
+ *                         Such archs can override this function to take care
+ *                         of them before trying to locate the memory hole.
+ * @kbuf:                  Buffer contents and memory parameters.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_add_buffer(struct kexec_buf *kbuf)
+{
+	return __kexec_add_buffer(kbuf);
+}
+
 /* Calculate and store the digest of segments */
 static int kexec_calculate_store_digests(struct kimage *image)
 {


^ permalink raw reply related

* [PATCH 02/11] powerpc/kexec_file: mark PPC64 specific code
From: Hari Bathini @ 2020-06-26 19:04 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Petr Tesarik, Mahesh J Salgaonkar,
	Sourabh Jain, lkml, linuxppc-dev, Mimi Zohar, Vivek Goyal,
	Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159319825403.16351.7253978047621755765.stgit@hbathini.in.ibm.com>

Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
same spirit.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
 arch/powerpc/include/asm/kexec.h       |   11 +++
 arch/powerpc/kexec/Makefile            |    2 -
 arch/powerpc/kexec/elf_64.c            |    7 +-
 arch/powerpc/kexec/file_load.c         |   37 ++--------
 arch/powerpc/kexec/file_load_64.c      |  108 ++++++++++++++++++++++++++++++
 arch/powerpc/purgatory/Makefile        |    4 +
 arch/powerpc/purgatory/trampoline.S    |  117 --------------------------------
 arch/powerpc/purgatory/trampoline_64.S |  117 ++++++++++++++++++++++++++++++++
 8 files changed, 248 insertions(+), 155 deletions(-)
 create mode 100644 arch/powerpc/kexec/file_load_64.c
 delete mode 100644 arch/powerpc/purgatory/trampoline.S
 create mode 100644 arch/powerpc/purgatory/trampoline_64.S

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index c684768..7008ea1 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -114,8 +114,17 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
 		    unsigned long fdt_load_addr);
 int setup_new_fdt(const struct kimage *image, void *fdt,
 		  unsigned long initrd_load_addr, unsigned long initrd_len,
-		  const char *cmdline);
+		  const char *cmdline, int *node);
 int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
+
+#ifdef CONFIG_PPC64
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+			  const void *fdt, unsigned long kernel_load_addr,
+			  unsigned long fdt_load_addr);
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+			unsigned long initrd_load_addr,
+			unsigned long initrd_len, const char *cmdline);
+#endif /* CONFIG_PPC64 */
 #endif /* CONFIG_KEXEC_FILE */
 
 #else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 86380c6..67c3553 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y				+= core.o crash.o core_$(BITS).o
 
 obj-$(CONFIG_PPC32)		+= relocate_32.o
 
-obj-$(CONFIG_KEXEC_FILE)	+= file_load.o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE)	+= file_load.o file_load_$(BITS).o elf_$(BITS).o
 
 ifdef CONFIG_HAVE_IMA_KEXEC
 ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 3072fd6..23ad04c 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -88,7 +88,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 		goto out;
 	}
 
-	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+	ret = setup_new_fdt_ppc64(image, fdt, initrd_load_addr,
+				  initrd_len, cmdline);
 	if (ret)
 		goto out;
 
@@ -107,8 +108,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 	pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
 
 	slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
-	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
-			      fdt_load_addr);
+	ret = setup_purgatory_ppc64(image, slave_code, fdt, kernel_load_addr,
+				    fdt_load_addr);
 	if (ret)
 		pr_err("Error setting up the purgatory.\n");
 
diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index 143c917..99a2c4d 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * ppc64 code to implement the kexec_file_load syscall
+ * powerpc code to implement the kexec_file_load syscall
  *
  * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
  * Copyright (C) 2004  IBM Corp.
@@ -16,26 +16,10 @@
 
 #include <linux/slab.h>
 #include <linux/kexec.h>
-#include <linux/of_fdt.h>
 #include <linux/libfdt.h>
 #include <asm/ima.h>
 
-#define SLAVE_CODE_SIZE		256
-
-const struct kexec_file_ops * const kexec_file_loaders[] = {
-	&kexec_elf64_ops,
-	NULL
-};
-
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
-				  unsigned long buf_len)
-{
-	/* We don't support crash kernels yet. */
-	if (image->type == KEXEC_TYPE_CRASH)
-		return -EOPNOTSUPP;
-
-	return kexec_image_probe_default(image, buf, buf_len);
-}
+#define SLAVE_CODE_SIZE		256	/* First 0x100 bytes */
 
 /**
  * setup_purgatory - initialize the purgatory's global variables
@@ -127,24 +111,17 @@ int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
  * @initrd_len:		Size of the next initrd, or 0 if there will be none.
  * @cmdline:		Command line for the next kernel, or NULL if there will
  *			be none.
+ * @chosen_node:        Set this output parameter to chosen_node.
  *
  * Return: 0 on success, or negative errno on error.
  */
 int setup_new_fdt(const struct kimage *image, void *fdt,
 		  unsigned long initrd_load_addr, unsigned long initrd_len,
-		  const char *cmdline)
+		  const char *cmdline, int *node)
 {
 	int ret, chosen_node;
 	const void *prop;
 
-	/* Remove memory reservation for the current device tree. */
-	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
-				 fdt_totalsize(initial_boot_params));
-	if (ret == 0)
-		pr_debug("Removed old device tree reservation.\n");
-	else if (ret != -ENOENT)
-		return ret;
-
 	chosen_node = fdt_path_offset(fdt, "/chosen");
 	if (chosen_node == -FDT_ERR_NOTFOUND) {
 		chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
@@ -157,6 +134,8 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
 		pr_err("Malformed device tree: error reading /chosen.\n");
 		return -EINVAL;
 	}
+	if (node)
+		*node = chosen_node;
 
 	/* Did we boot using an initrd? */
 	prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
@@ -242,10 +221,6 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
 		return ret;
 	}
 
-	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
-	if (ret)
-		goto err;
-
 	return 0;
 
 err:
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
new file mode 100644
index 0000000..e6bff960
--- /dev/null
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ppc64 code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, kexec-elf-rel-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini <hbathini@linux.ibm.com>.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+	&kexec_elf64_ops,
+	NULL
+};
+
+/**
+ * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
+ *                         variables and call setup_purgatory() to initialize
+ *                         common global variable.
+ * @image:                 kexec image.
+ * @slave_code:            Slave code for the purgatory.
+ * @fdt:                   Flattened device tree for the next kernel.
+ * @kernel_load_addr:      Address where the kernel is loaded.
+ * @fdt_load_addr:         Address where the flattened device tree is loaded.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+			  const void *fdt, unsigned long kernel_load_addr,
+			  unsigned long fdt_load_addr)
+{
+	int ret;
+
+	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
+			      fdt_load_addr);
+	if (ret)
+		pr_err("Failed to setup purgatory symbols");
+	return ret;
+}
+
+/**
+ * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
+ *                       being loaded.
+ * @image:               kexec image being loaded.
+ * @fdt:                 Flattened device tree for the next kernel.
+ * @initrd_load_addr:    Address where the next initrd will be loaded.
+ * @initrd_len:          Size of the next initrd, or 0 if there will be none.
+ * @cmdline:             Command line for the next kernel, or NULL if there will
+ *                       be none.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+			unsigned long initrd_load_addr,
+			unsigned long initrd_len, const char *cmdline)
+{
+	int chosen_node, ret;
+
+	/* Remove memory reservation for the current device tree. */
+	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
+				 fdt_totalsize(initial_boot_params));
+	if (ret == 0)
+		pr_debug("Removed old device tree reservation.\n");
+	else if (ret != -ENOENT) {
+		pr_err("Failed to remove old device-tree reservation.\n");
+		return ret;
+	}
+
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
+			    cmdline, &chosen_node);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+	if (ret)
+		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
+
+	return ret;
+}
+
+/**
+ * arch_kexec_kernel_image_probe - Does additional handling needed to setup
+ *                                 kexec segments.
+ * @image:                         kexec image being loaded.
+ * @buf:                           Buffer pointing to elf data.
+ * @buf_len:                       Length of the buffer.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+				  unsigned long buf_len)
+{
+	/* We don't support crash kernels yet. */
+	if (image->type == KEXEC_TYPE_CRASH)
+		return -EOPNOTSUPP;
+
+	return kexec_image_probe_default(image, buf, buf_len);
+}
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index 7c6d8b1..348f5958 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -2,11 +2,11 @@
 
 KASAN_SANITIZE := n
 
-targets += trampoline.o purgatory.ro kexec-purgatory.c
+targets += trampoline_$(BITS).o purgatory.ro kexec-purgatory.c
 
 LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
 
-$(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
+$(obj)/purgatory.ro: $(obj)/trampoline_$(BITS).o FORCE
 		$(call if_changed,ld)
 
 quiet_cmd_bin2c = BIN2C   $@
diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
deleted file mode 100644
index a5a83c3..0000000
--- a/arch/powerpc/purgatory/trampoline.S
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * kexec trampoline
- *
- * Based on code taken from kexec-tools and kexec-lite.
- *
- * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
- * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
- * Copyright (C) 2013, Anton Blanchard, IBM Corporation
- */
-
-#include <asm/asm-compat.h>
-
-	.machine ppc64
-	.balign 256
-	.globl purgatory_start
-purgatory_start:
-	b	master
-
-	/* ABI: possible run_at_load flag at 0x5c */
-	.org purgatory_start + 0x5c
-	.globl run_at_load
-run_at_load:
-	.long 0
-	.size run_at_load, . - run_at_load
-
-	/* ABI: slaves start at 60 with r3=phys */
-	.org purgatory_start + 0x60
-slave:
-	b .
-	/* ABI: end of copied region */
-	.org purgatory_start + 0x100
-	.size purgatory_start, . - purgatory_start
-
-/*
- * The above 0x100 bytes at purgatory_start are replaced with the
- * code from the kernel (or next stage) by setup_purgatory().
- */
-
-master:
-	or	%r1,%r1,%r1	/* low priority to let other threads catchup */
-	isync
-	mr	%r17,%r3	/* save cpu id to r17 */
-	mr	%r15,%r4	/* save physical address in reg15 */
-
-	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
-	lis	%r6,0x1
-	mtctr	%r6		/* delay a bit for slaves to catch up */
-	bdnz	.		/* before we overwrite 0-100 again */
-
-	bl	0f		/* Work out where we're running */
-0:	mflr	%r18
-
-	/* load device-tree address */
-	ld	%r3, (dt_offset - 0b)(%r18)
-	mr	%r16,%r3	/* save dt address in reg16 */
-	li	%r4,20
-	LWZX_BE	%r6,%r3,%r4	/* fetch __be32 version number at byte 20 */
-	cmpwi	%cr0,%r6,2	/* v2 or later? */
-	blt	1f
-	li	%r4,28
-	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
-1:
-	/* load the kernel address */
-	ld	%r4,(kernel - 0b)(%r18)
-
-	/* load the run_at_load flag */
-	/* possibly patched by kexec */
-	ld	%r6,(run_at_load - 0b)(%r18)
-	/* and patch it into the kernel */
-	stw	%r6,(0x5c)(%r4)
-
-	mr	%r3,%r16	/* restore dt address */
-
-	li	%r5,0		/* r5 will be 0 for kernel */
-
-	mfmsr	%r11
-	andi.	%r10,%r11,1	/* test MSR_LE */
-	bne	.Little_endian
-
-	mtctr	%r4		/* prepare branch to */
-	bctr			/* start kernel */
-
-.Little_endian:
-	mtsrr0	%r4		/* prepare branch to */
-
-	clrrdi	%r11,%r11,1	/* clear MSR_LE */
-	mtsrr1	%r11
-
-	rfid			/* update MSR and start kernel */
-
-
-	.balign 8
-	.globl kernel
-kernel:
-	.8byte  0x0
-	.size kernel, . - kernel
-
-	.balign 8
-	.globl dt_offset
-dt_offset:
-	.8byte  0x0
-	.size dt_offset, . - dt_offset
-
-
-	.data
-	.balign 8
-.globl purgatory_sha256_digest
-purgatory_sha256_digest:
-	.skip	32
-	.size purgatory_sha256_digest, . - purgatory_sha256_digest
-
-	.balign 8
-.globl purgatory_sha_regions
-purgatory_sha_regions:
-	.skip	8 * 2 * 16
-	.size purgatory_sha_regions, . - purgatory_sha_regions
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
new file mode 100644
index 0000000..a5a83c3
--- /dev/null
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * kexec trampoline
+ *
+ * Based on code taken from kexec-tools and kexec-lite.
+ *
+ * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
+ * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
+ * Copyright (C) 2013, Anton Blanchard, IBM Corporation
+ */
+
+#include <asm/asm-compat.h>
+
+	.machine ppc64
+	.balign 256
+	.globl purgatory_start
+purgatory_start:
+	b	master
+
+	/* ABI: possible run_at_load flag at 0x5c */
+	.org purgatory_start + 0x5c
+	.globl run_at_load
+run_at_load:
+	.long 0
+	.size run_at_load, . - run_at_load
+
+	/* ABI: slaves start at 60 with r3=phys */
+	.org purgatory_start + 0x60
+slave:
+	b .
+	/* ABI: end of copied region */
+	.org purgatory_start + 0x100
+	.size purgatory_start, . - purgatory_start
+
+/*
+ * The above 0x100 bytes at purgatory_start are replaced with the
+ * code from the kernel (or next stage) by setup_purgatory().
+ */
+
+master:
+	or	%r1,%r1,%r1	/* low priority to let other threads catchup */
+	isync
+	mr	%r17,%r3	/* save cpu id to r17 */
+	mr	%r15,%r4	/* save physical address in reg15 */
+
+	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
+	lis	%r6,0x1
+	mtctr	%r6		/* delay a bit for slaves to catch up */
+	bdnz	.		/* before we overwrite 0-100 again */
+
+	bl	0f		/* Work out where we're running */
+0:	mflr	%r18
+
+	/* load device-tree address */
+	ld	%r3, (dt_offset - 0b)(%r18)
+	mr	%r16,%r3	/* save dt address in reg16 */
+	li	%r4,20
+	LWZX_BE	%r6,%r3,%r4	/* fetch __be32 version number at byte 20 */
+	cmpwi	%cr0,%r6,2	/* v2 or later? */
+	blt	1f
+	li	%r4,28
+	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
+1:
+	/* load the kernel address */
+	ld	%r4,(kernel - 0b)(%r18)
+
+	/* load the run_at_load flag */
+	/* possibly patched by kexec */
+	ld	%r6,(run_at_load - 0b)(%r18)
+	/* and patch it into the kernel */
+	stw	%r6,(0x5c)(%r4)
+
+	mr	%r3,%r16	/* restore dt address */
+
+	li	%r5,0		/* r5 will be 0 for kernel */
+
+	mfmsr	%r11
+	andi.	%r10,%r11,1	/* test MSR_LE */
+	bne	.Little_endian
+
+	mtctr	%r4		/* prepare branch to */
+	bctr			/* start kernel */
+
+.Little_endian:
+	mtsrr0	%r4		/* prepare branch to */
+
+	clrrdi	%r11,%r11,1	/* clear MSR_LE */
+	mtsrr1	%r11
+
+	rfid			/* update MSR and start kernel */
+
+
+	.balign 8
+	.globl kernel
+kernel:
+	.8byte  0x0
+	.size kernel, . - kernel
+
+	.balign 8
+	.globl dt_offset
+dt_offset:
+	.8byte  0x0
+	.size dt_offset, . - dt_offset
+
+
+	.data
+	.balign 8
+.globl purgatory_sha256_digest
+purgatory_sha256_digest:
+	.skip	32
+	.size purgatory_sha256_digest, . - purgatory_sha256_digest
+
+	.balign 8
+.globl purgatory_sha_regions
+purgatory_sha_regions:
+	.skip	8 * 2 * 16
+	.size purgatory_sha_regions, . - purgatory_sha_regions


^ permalink raw reply related

* [PATCH 03/11] powerpc/kexec_file: add helper functions for getting memory ranges
From: Hari Bathini @ 2020-06-26 19:05 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Petr Tesarik, Mahesh J Salgaonkar,
	Sourabh Jain, lkml, linuxppc-dev, Mimi Zohar, Vivek Goyal,
	Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159319825403.16351.7253978047621755765.stgit@hbathini.in.ibm.com>

In kexec case, the kernel to be loaded uses the same memory layout as
the running kernel. So, passing on the DT of the running kernel would
be good enough.

But in case of kdump, different memory ranges are needed to manage
loading the kdump kernel, booting into it and exporting the elfcore
of the crashing kernel. The ranges are exlude memory ranges, usable
memory ranges, reserved memory ranges and crash memory ranges.

Exclude memory ranges specify the list of memory ranges to avoid while
loading kdump segments. Usable memory ranges list the memory ranges
that could be used for booting kdump kernel. Reserved memory ranges
list the memory regions for the loading kernel's reserve map. Crash
memory ranges list the memory ranges to be exported as the crashing
kernel's elfcore.

Add helper functions for setting up the above mentioned memory ranges.
This helpers facilitate in understanding the subsequent changes better
and make it easy to setup the different memory ranges listed above, as
and when appropriate.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
 arch/powerpc/include/asm/kexec_ranges.h |   18 +
 arch/powerpc/kexec/Makefile             |    2 
 arch/powerpc/kexec/ranges.c             |  387 +++++++++++++++++++++++++++++++
 3 files changed, 406 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
 create mode 100644 arch/powerpc/kexec/ranges.c

diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h
new file mode 100644
index 0000000..29eaa65
--- /dev/null
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _ARCH_POWERPC_KEXEC_RANGES_H
+#define _ARCH_POWERPC_KEXEC_RANGES_H
+
+#define MEM_RANGE_CHUNK_SZ		2048	/* Memory ranges size chunk */
+
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int add_tce_mem_ranges(struct crash_mem **mem_ranges);
+int add_initrd_mem_range(struct crash_mem **mem_ranges);
+int add_htab_mem_range(struct crash_mem **mem_ranges);
+int add_kernel_mem_range(struct crash_mem **mem_ranges);
+int add_rtas_mem_range(struct crash_mem **mem_ranges, bool within_crashkernel);
+int add_opal_mem_range(struct crash_mem **mem_ranges, bool within_crashkernel);
+int add_reserved_ranges(struct crash_mem **mem_ranges);
+void sort_memory_ranges(struct crash_mem *mrngs);
+
+#endif /* _ARCH_POWERPC_KEXEC_RANGES_H */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 67c3553..4aff684 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y				+= core.o crash.o core_$(BITS).o
 
 obj-$(CONFIG_PPC32)		+= relocate_32.o
 
-obj-$(CONFIG_KEXEC_FILE)	+= file_load.o file_load_$(BITS).o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE)	+= file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o
 
 ifdef CONFIG_HAVE_IMA_KEXEC
 ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
new file mode 100644
index 0000000..f21ddea
--- /dev/null
+++ b/arch/powerpc/kexec/ranges.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * powerpc code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini <hbathini@linux.ibm.com>.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+#include <asm/kexec_ranges.h>
+
+/**
+ * get_max_nr_ranges - Get the max no. of ranges crash_mem structure
+ *                     could hold, given the size allocated for it.
+ * @size:              Allocation size of crash_mem structure.
+ *
+ * Returns the maximum no. of ranges.
+ */
+static inline unsigned int get_max_nr_ranges(size_t size)
+{
+	return ((size - sizeof(struct crash_mem)) /
+		sizeof(struct crash_mem_range));
+}
+
+/**
+ * get_mem_rngs_size - Get the allocated size of mrngs based on
+ *                     max_nr_ranges and chunk size.
+ * @mrngs:             Memory ranges.
+ *
+ * Returns the maximum no. of ranges.
+ */
+static inline size_t get_mem_rngs_size(struct crash_mem *mrngs)
+{
+	size_t size;
+
+	if (!mrngs)
+		return 0;
+
+	size = (sizeof(struct crash_mem) +
+		(mrngs->max_nr_ranges * sizeof(struct crash_mem_range)));
+
+	/*
+	 * Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
+	 * So, align to get the actual length.
+	 */
+	return ALIGN(size, MEM_RANGE_CHUNK_SZ);
+}
+
+/**
+ * __add_mem_range - add a memory range to memory ranges list.
+ * @mem_ranges:      Range list to add the memory range to.
+ * @base:            Base address of the range to add.
+ * @size:            Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+	struct crash_mem *mrngs = *mem_ranges;
+
+	if ((mrngs == NULL) || (mrngs->nr_ranges == mrngs->max_nr_ranges)) {
+		mrngs = realloc_mem_ranges(mem_ranges);
+		if (!mrngs)
+			return -ENOMEM;
+	}
+
+	mrngs->ranges[mrngs->nr_ranges].start = base;
+	mrngs->ranges[mrngs->nr_ranges].end = base + size - 1;
+	mrngs->nr_ranges++;
+	return 0;
+}
+
+/**
+ * realloc_mem_ranges - reallocate mem_ranges with size incremented
+ *                      by MEM_RANGE_CHUNK_SZ. Frees up the old memory,
+ *                      if memory allocation fails.
+ * @mem_ranges:         Memory ranges to reallocate.
+ *
+ * Returns pointer to reallocated memory on success, NULL otherwise.
+ */
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges)
+{
+	struct crash_mem *mrngs = *mem_ranges;
+	unsigned int nr_ranges;
+	size_t size;
+
+	size = get_mem_rngs_size(mrngs);
+	nr_ranges = mrngs ? mrngs->nr_ranges : 0;
+
+	size += MEM_RANGE_CHUNK_SZ;
+	mrngs = krealloc(*mem_ranges, size, GFP_KERNEL);
+	if (!mrngs) {
+		kfree(*mem_ranges);
+		*mem_ranges = NULL;
+		return NULL;
+	}
+
+	mrngs->nr_ranges = nr_ranges;
+	mrngs->max_nr_ranges = get_max_nr_ranges(size);
+	*mem_ranges = mrngs;
+
+	return mrngs;
+}
+
+/**
+ * add_mem_range - Updates existing memory range, if there is an overlap.
+ *                 Else, adds a new memory range.
+ * @mem_ranges:    Range list to add the memory range to.
+ * @base:          Base address of the range to add.
+ * @size:          Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+	struct crash_mem *mrngs = *mem_ranges;
+	u64 mstart, mend, end;
+	unsigned int i;
+
+	if (!size)
+		return 0;
+
+	end = base + size - 1;
+
+	if ((mrngs == NULL) || (mrngs->nr_ranges == 0))
+		return __add_mem_range(mem_ranges, base, size);
+
+	for (i = 0; i < mrngs->nr_ranges; i++) {
+		mstart = mrngs->ranges[i].start;
+		mend = mrngs->ranges[i].end;
+		if (base < mend && end > mstart) {
+			if (base < mstart)
+				mrngs->ranges[i].start = base;
+			if (end > mend)
+				mrngs->ranges[i].end = end;
+			return 0;
+		}
+	}
+
+	return __add_mem_range(mem_ranges, base, size);
+}
+
+/**
+ * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
+ * @mem_ranges:         Range list to add the memory range(s) to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_tce_mem_ranges(struct crash_mem **mem_ranges)
+{
+	struct device_node *dn;
+	int ret;
+
+	for_each_node_by_type(dn, "pci") {
+		u64 base;
+		u32 size;
+
+		ret = of_property_read_u64(dn, "linux,tce-base", &base);
+		ret |= of_property_read_u32(dn, "linux,tce-size", &size);
+		if (!ret)
+			continue;
+
+		ret = add_mem_range(mem_ranges, base, size);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/**
+ * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
+ *                        if the initrd was retained.
+ * @mem_ranges:           Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_initrd_mem_range(struct crash_mem **mem_ranges)
+{
+	u64 base, end;
+	int ret = 0;
+	char *str;
+
+	/* This range means something only if initrd was retained */
+	str = strstr(saved_command_line, "retain_initrd");
+	if (!str)
+		return 0;
+
+	ret = of_property_read_u64(of_chosen, "linux,initrd-start", &base);
+	ret |= of_property_read_u64(of_chosen, "linux,initrd-end", &end);
+	if (!ret)
+		ret = add_mem_range(mem_ranges, base, end - base + 1);
+	return ret;
+}
+
+/**
+ * add_htab_mem_range - Adds htab range to the given memory ranges list,
+ *                      if it exists
+ * @mem_ranges:         Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_htab_mem_range(struct crash_mem **mem_ranges)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	int ret;
+
+	if (!htab_address)
+		return 0;
+
+	ret = add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+/**
+ * add_kernel_mem_range - Adds kernel text region to the given
+ *                        memory ranges list.
+ * @mem_ranges:           Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_kernel_mem_range(struct crash_mem **mem_ranges)
+{
+	int ret;
+
+	ret = add_mem_range(mem_ranges, 0, __pa(_end));
+	return ret;
+}
+
+/**
+ * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
+ * @mem_ranges:         Range list to add the memory range to.
+ * @within_crashkernel: If true, adds only the part of the region within
+ *                      crashkernel range. Otherwise, all the region is added.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_rtas_mem_range(struct crash_mem **mem_ranges, bool within_crashkernel)
+{
+	struct device_node *dn;
+	int ret = 0;
+
+	dn = of_find_node_by_path("/rtas");
+	if (dn) {
+		u32 base, size;
+
+		ret = of_property_read_u32(dn, "linux,rtas-base", &base);
+		ret |= of_property_read_u32(dn, "rtas-size", &size);
+
+		if (ret)
+			return ret;
+
+		if (within_crashkernel) {
+			if (overlaps_crashkernel(base, size)) {
+				if (base < crashk_res.start)
+					base = crashk_res.start;
+				if (crashk_res.end < (base + size - 1))
+					size = crashk_res.end - base + 1;
+
+				ret = add_mem_range(mem_ranges, base, size);
+			}
+		} else
+			ret = add_mem_range(mem_ranges, base, size);
+	}
+	return ret;
+}
+
+/**
+ * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
+ * @mem_ranges:         Range list to add the memory range to.
+ * @within_crashkernel: If true, adds only the part of the region within
+ *                      crashkernel range. Otherwise, all the region is added.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_opal_mem_range(struct crash_mem **mem_ranges, bool within_crashkernel)
+{
+	struct device_node *dn;
+	int ret = 0;
+
+	dn = of_find_node_by_path("/ibm,opal");
+	if (dn) {
+		u64 base, size;
+
+		ret = of_property_read_u64(dn, "opal-base-address", &base);
+		ret |= of_property_read_u64(dn, "opal-runtime-size", &size);
+
+		if (ret)
+			return ret;
+
+		if (within_crashkernel) {
+			if (overlaps_crashkernel(base, size)) {
+				if (base < crashk_res.start)
+					base = crashk_res.start;
+				if (crashk_res.end < (base + size - 1))
+					size = crashk_res.end - base + 1;
+
+				ret = add_mem_range(mem_ranges, base, size);
+			}
+		} else
+			ret = add_mem_range(mem_ranges, base, size);
+	}
+	return ret;
+}
+
+/**
+ * add_reserved_ranges - Adds "/reserved-ranges" regions exported by f/w
+ *                       to the given memory ranges list.
+ * @mem_ranges:          Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_reserved_ranges(struct crash_mem **mem_ranges)
+{
+	int i, len, ret = 0;
+	const __be32 *prop;
+
+	prop = of_get_property(of_root, "reserved-ranges", &len);
+	if (!prop)
+		return 0;
+
+	/*
+	 * Each reserved range is an (address,size) pair, 2 cells each,
+	 * totalling 4 cells per range.
+	 */
+	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+		u64 base, size;
+
+		base = of_read_number(prop + (i * 4) + 0, 2);
+		size = of_read_number(prop + (i * 4) + 2, 2);
+
+		ret = add_mem_range(mem_ranges, base, size);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/**
+ * sort_memory_ranges - Sorts the given memory ranges list.
+ * @mem_ranges:         Range list to sort.
+ *
+ * Returns nothing.
+ */
+void sort_memory_ranges(struct crash_mem *mrngs)
+{
+	struct crash_mem_range *rngs;
+	struct crash_mem_range rng;
+	int i, j, idx;
+
+	if (!mrngs)
+		return;
+
+	/* Sort the ranges in-place */
+	rngs = &mrngs->ranges[0];
+	for (i = 0; i < mrngs->nr_ranges; i++) {
+		idx = i;
+		for (j = (i + 1); j < mrngs->nr_ranges; j++) {
+			if (rngs[idx].start > rngs[j].start)
+				idx = j;
+		}
+		if (idx != i) {
+			rng = rngs[idx];
+			rngs[idx] = rngs[i];
+			rngs[i] = rng;
+		}
+	}
+}


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox