LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/6] powerpc test_emulate_step: update nip with patched instruction address
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

pt_regs are initialized to zero in the test infrastructure, R bit
in prefixed instruction form is used to specify whether the effective
address of the storage operand is computed relative to the address
of the instruction.

If R = 1 and RA = R0|0, the sum of the address of the instruction
and the value SI is placed into register RT. So to assert the emulated
instruction with executed instruction, update nip of emulated pt_regs.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 33a72b7d2764..d5902b7b4e5c 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1204,13 +1204,24 @@ static struct compute_test compute_tests[] = {
 static int __init emulate_compute_instr(struct pt_regs *regs,
 					struct ppc_inst instr)
 {
+	int prefix_r, ra;
 	extern s32 patch__exec_instr;
 	struct instruction_op op;
 
 	if (!regs || !ppc_inst_val(instr))
 		return -EINVAL;
 
-	regs->nip = patch_site_addr(&patch__exec_instr);
+	/*
+	 * If R=1 and RA=0 in Prefixed instruction form, calculate the address
+	 * of the instruction and update nip to assert with executed
+	 * instruction
+	 */
+	if (ppc_inst_prefixed(instr)) {
+		prefix_r = ppc_inst_val(instr) & (1UL << 20);
+		ra = (ppc_inst_suffix(instr) >> 16) & 0x1f;
+		if (prefix_r && !ra)
+			regs->nip = patch_site_addr(&patch__exec_instr);
+	}
 
 	if (analyse_instr(&op, regs, instr) != 1 ||
 	    GETTYPE(op.type) != COMPUTE) {
-- 
2.24.1


^ permalink raw reply related

* [PATCH 2/6] powerpc test_emulate_step: fix pr_info() to print 8-byte for prefixed instruction
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

On test failure, `pr_log()` prints 4 bytes instruction
irrespective of word/prefix instruction, fix it by printing
them appropriately.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index d5902b7b4e5c..e3b1797adfae 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1225,7 +1225,14 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 
 	if (analyse_instr(&op, regs, instr) != 1 ||
 	    GETTYPE(op.type) != COMPUTE) {
-		pr_info("emulation failed, instruction = 0x%08x\n", ppc_inst_val(instr));
+		if (!ppc_inst_prefixed(instr)) {
+			pr_info("emulation failed, instruction = 0x%08x\n",
+				ppc_inst_val(instr));
+		} else {
+			pr_info("emulation failed, instruction = 0x%08x 0x%08x\n",
+				ppc_inst_val(instr),
+				ppc_inst_suffix(instr));
+		}
 		return -EFAULT;
 	}
 
-- 
2.24.1


^ permalink raw reply related

* [PATCH 3/6] powerpc test_emulate_step: enhancement to test negative scenarios
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

add provision to declare test is a negative scenario, verify
whether emulation fails and avoid executing it.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 46 ++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index e3b1797adfae..79acc899a618 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -703,6 +703,7 @@ struct compute_test {
 		unsigned long flags;
 		struct ppc_inst instr;
 		struct pt_regs regs;
+		bool negative;
 	} subtests[MAX_SUBTESTS + 1];
 };
 
@@ -1202,9 +1203,10 @@ static struct compute_test compute_tests[] = {
 };
 
 static int __init emulate_compute_instr(struct pt_regs *regs,
-					struct ppc_inst instr)
+					struct ppc_inst instr,
+					bool negative)
 {
-	int prefix_r, ra;
+	int prefix_r, ra, analysed;
 	extern s32 patch__exec_instr;
 	struct instruction_op op;
 
@@ -1223,8 +1225,10 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 			regs->nip = patch_site_addr(&patch__exec_instr);
 	}
 
-	if (analyse_instr(&op, regs, instr) != 1 ||
-	    GETTYPE(op.type) != COMPUTE) {
+	analysed = analyse_instr(&op, regs, instr);
+	if (analysed != 1 || GETTYPE(op.type) != COMPUTE) {
+		if (negative)
+			return -EFAULT;
 		if (!ppc_inst_prefixed(instr)) {
 			pr_info("emulation failed, instruction = 0x%08x\n",
 				ppc_inst_val(instr));
@@ -1235,8 +1239,18 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 		}
 		return -EFAULT;
 	}
-
-	emulate_update_regs(regs, &op);
+	if (analysed == 1 && negative) {
+		if (!ppc_inst_prefixed(instr)) {
+			pr_info("negative test failed, instruction = 0x%08x\n",
+				ppc_inst_val(instr));
+		} else {
+			pr_info("negative test  failed, instruction = 0x%08x 0x%08x\n",
+				ppc_inst_val(instr),
+				ppc_inst_suffix(instr));
+		}
+	}
+	if (!negative)
+		emulate_update_regs(regs, &op);
 	return 0;
 }
 
@@ -1252,7 +1266,14 @@ static int __init execute_compute_instr(struct pt_regs *regs,
 	/* Patch the NOP with the actual instruction */
 	patch_instruction_site(&patch__exec_instr, instr);
 	if (exec_instr(regs)) {
-		pr_info("execution failed, instruction = 0x%08x\n", ppc_inst_val(instr));
+		if (!ppc_inst_prefixed(instr)) {
+			pr_info("execution failed, instruction = 0x%08x\n",
+				ppc_inst_val(instr));
+		} else {
+			pr_info("execution failed, instruction = 0x%08x 0x%08x\n",
+				ppc_inst_val(instr),
+				ppc_inst_suffix(instr));
+		}
 		return -EFAULT;
 	}
 
@@ -1274,7 +1295,7 @@ static void __init run_tests_compute(void)
 	struct pt_regs *regs, exp, got;
 	unsigned int i, j, k;
 	struct ppc_inst instr;
-	bool ignore_gpr, ignore_xer, ignore_ccr, passed;
+	bool ignore_gpr, ignore_xer, ignore_ccr, passed, rc, negative;
 
 	for (i = 0; i < ARRAY_SIZE(compute_tests); i++) {
 		test = &compute_tests[i];
@@ -1288,6 +1309,7 @@ static void __init run_tests_compute(void)
 			instr = test->subtests[j].instr;
 			flags = test->subtests[j].flags;
 			regs = &test->subtests[j].regs;
+			negative = test->subtests[j].negative;
 			ignore_xer = flags & IGNORE_XER;
 			ignore_ccr = flags & IGNORE_CCR;
 			passed = true;
@@ -1302,8 +1324,12 @@ static void __init run_tests_compute(void)
 			exp.msr = MSR_KERNEL;
 			got.msr = MSR_KERNEL;
 
-			if (emulate_compute_instr(&got, instr) ||
-			    execute_compute_instr(&exp, instr)) {
+			rc = emulate_compute_instr(&got, instr, negative) != 0;
+			if (negative) {
+				/* skip executing instruction */
+				passed = rc;
+				goto print;
+			} else if (rc || execute_compute_instr(&exp, instr)) {
 				passed = false;
 				goto print;
 			}
-- 
2.24.1


^ permalink raw reply related

* [PATCH 4/6] powerpc test_emulate_step: add negative tests for prefixed addi
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

testcases for `paddi` instruction to cover the negative case,
if R is equal to 1 and RA is not equal to 0, the instruction
form is invalid.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index 79acc899a618..f9825c275c31 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1197,6 +1197,16 @@ static struct compute_test compute_tests[] = {
 				.regs = {
 					.gpr[21] = 0,
 				}
+			},
+			/* Invalid instruction form with R = 1 and RA != 0 */
+			{
+				.descr = "RA = R22(0), SI = 0, R = 1",
+				.instr = TEST_PADDI(21, 22, 0, 1),
+				.negative = true,
+				.regs = {
+					.gpr[21] = 0,
+					.gpr[22] = 0,
+				}
 			}
 		}
 	}
-- 
2.24.1


^ permalink raw reply related

* [PATCH 5/6] powerpc sstep: introduce macros to retrieve Prefix instruction operands
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

retrieve prefix instruction operands RA and pc relative bit R values
using macros and adopt it in sstep.c and test_emulate_step.c.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/include/asm/sstep.h     |  4 ++++
 arch/powerpc/lib/sstep.c             | 12 ++++++------
 arch/powerpc/lib/test_emulate_step.c |  4 ++--
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 3b01c69a44aa..325975b4ef30 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -104,6 +104,10 @@ enum instruction_type {
 
 #define MKOP(t, f, s)	((t) | (f) | SIZE(s))
 
+/* Prefix instruction operands */
+#define GET_PREFIX_RA(i)	(((i) >> 16) & 0x1f)
+#define GET_PREFIX_R(i)		((i) & (1ul << 20))
+
 struct instruction_op {
 	int type;
 	int reg;
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5abe98216dc2..fb4c5767663d 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -200,8 +200,8 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
 	unsigned int  dd;
 	unsigned long ea, d0, d1, d;
 
-	prefix_r = instr & (1ul << 20);
-	ra = (suffix >> 16) & 0x1f;
+	prefix_r = GET_PREFIX_R(instr);
+	ra = GET_PREFIX_RA(suffix);
 
 	d0 = instr & 0x3ffff;
 	d1 = suffix & 0xffff;
@@ -1339,8 +1339,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 	switch (opcode) {
 #ifdef __powerpc64__
 	case 1:
-		prefix_r = word & (1ul << 20);
-		ra = (suffix >> 16) & 0x1f;
+		prefix_r = GET_PREFIX_R(word);
+		ra = GET_PREFIX_RA(suffix);
 		rd = (suffix >> 21) & 0x1f;
 		op->reg = rd;
 		op->val = regs->gpr[rd];
@@ -2715,8 +2715,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		}
 		break;
 	case 1: /* Prefixed instructions */
-		prefix_r = word & (1ul << 20);
-		ra = (suffix >> 16) & 0x1f;
+		prefix_r = GET_PREFIX_R(word);
+		ra = GET_PREFIX_RA(suffix);
 		op->update_reg = ra;
 		rd = (suffix >> 21) & 0x1f;
 		op->reg = rd;
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index f9825c275c31..f1a447026b6e 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1229,8 +1229,8 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 	 * instruction
 	 */
 	if (ppc_inst_prefixed(instr)) {
-		prefix_r = ppc_inst_val(instr) & (1UL << 20);
-		ra = (ppc_inst_suffix(instr) >> 16) & 0x1f;
+		prefix_r = GET_PREFIX_R(ppc_inst_val(instr));
+		ra = GET_PREFIX_RA(ppc_inst_suffix(instr));
 		if (prefix_r && !ra)
 			regs->nip = patch_site_addr(&patch__exec_instr);
 	}
-- 
2.24.1


^ permalink raw reply related

* [PATCH 6/6] powerpc test_emulate_step: move extern declaration to sstep.h
From: Balamuruhan S @ 2020-06-22  7:09 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-1-bala24@linux.ibm.com>

fix checkpatch.pl warnings by moving extern declaration from source
file to headerfile.

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/include/asm/sstep.h     | 2 ++
 arch/powerpc/lib/test_emulate_step.c | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 325975b4ef30..c8e37ef060c1 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -108,6 +108,8 @@ enum instruction_type {
 #define GET_PREFIX_RA(i)	(((i) >> 16) & 0x1f)
 #define GET_PREFIX_R(i)		((i) & (1ul << 20))
 
+extern s32 patch__exec_instr;
+
 struct instruction_op {
 	int type;
 	int reg;
diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index f1a447026b6e..386245607568 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1217,7 +1217,6 @@ static int __init emulate_compute_instr(struct pt_regs *regs,
 					bool negative)
 {
 	int prefix_r, ra, analysed;
-	extern s32 patch__exec_instr;
 	struct instruction_op op;
 
 	if (!regs || !ppc_inst_val(instr))
@@ -1268,7 +1267,6 @@ static int __init execute_compute_instr(struct pt_regs *regs,
 					struct ppc_inst instr)
 {
 	extern int exec_instr(struct pt_regs *regs);
-	extern s32 patch__exec_instr;
 
 	if (!regs || !ppc_inst_val(instr))
 		return -EINVAL;
-- 
2.24.1


^ permalink raw reply related

* [PATCH] ASoC: fsl_mqs: Fix unchecked return value for clk_prepare_enable
From: Shengjiu Wang @ 2020-06-22  8:48 UTC (permalink / raw)
  To: timur, nicoleotsuka, Xiubo.Lee, festevam, broonie, perex, tiwai,
	alsa-devel
  Cc: linuxppc-dev, linux-kernel

Fix unchecked return value for clk_prepare_enable.

And because clk_prepare_enable and clk_disable_unprepare should
check input clock parameter is NULL or not, then we don't need
to check it before calling the function.

Fixes: 9e28f6532c61 ("ASoC: fsl_mqs: Add MQS component driver")
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 sound/soc/fsl/fsl_mqs.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/sound/soc/fsl/fsl_mqs.c b/sound/soc/fsl/fsl_mqs.c
index 0c813a45bba7..69aeb0e71844 100644
--- a/sound/soc/fsl/fsl_mqs.c
+++ b/sound/soc/fsl/fsl_mqs.c
@@ -265,12 +265,20 @@ static int fsl_mqs_remove(struct platform_device *pdev)
 static int fsl_mqs_runtime_resume(struct device *dev)
 {
 	struct fsl_mqs *mqs_priv = dev_get_drvdata(dev);
+	int ret;
 
-	if (mqs_priv->ipg)
-		clk_prepare_enable(mqs_priv->ipg);
+	ret = clk_prepare_enable(mqs_priv->ipg);
+	if (ret) {
+		dev_err(dev, "failed to enable ipg clock\n");
+		return ret;
+	}
 
-	if (mqs_priv->mclk)
-		clk_prepare_enable(mqs_priv->mclk);
+	ret = clk_prepare_enable(mqs_priv->mclk);
+	if (ret) {
+		dev_err(dev, "failed to enable mclk clock\n");
+		clk_disable_unprepare(mqs_priv->ipg);
+		return ret;
+	}
 
 	if (mqs_priv->use_gpr)
 		regmap_write(mqs_priv->regmap, IOMUXC_GPR2,
@@ -292,11 +300,8 @@ static int fsl_mqs_runtime_suspend(struct device *dev)
 		regmap_read(mqs_priv->regmap, REG_MQS_CTRL,
 			    &mqs_priv->reg_mqs_ctrl);
 
-	if (mqs_priv->mclk)
-		clk_disable_unprepare(mqs_priv->mclk);
-
-	if (mqs_priv->ipg)
-		clk_disable_unprepare(mqs_priv->ipg);
+	clk_disable_unprepare(mqs_priv->mclk);
+	clk_disable_unprepare(mqs_priv->ipg);
 
 	return 0;
 }
-- 
2.21.0


^ permalink raw reply related

* [PATCH] ASoC: fsl_easrc: Fix uninitialized scalar variable in fsl_easrc_set_ctx_format
From: Shengjiu Wang @ 2020-06-22  9:03 UTC (permalink / raw)
  To: timur, nicoleotsuka, Xiubo.Lee, festevam, broonie, alsa-devel,
	lgirdwood, perex, tiwai
  Cc: linuxppc-dev, linux-kernel

The "ret" in fsl_easrc_set_ctx_format is not initialized, then
the unknown value maybe returned by this function.

Fixes: 955ac624058f ("ASoC: fsl_easrc: Add EASRC ASoC CPU DAI drivers")
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
---
 sound/soc/fsl/fsl_easrc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/fsl/fsl_easrc.c b/sound/soc/fsl/fsl_easrc.c
index 2f6b3d8bfcfc..03b3aef41d34 100644
--- a/sound/soc/fsl/fsl_easrc.c
+++ b/sound/soc/fsl/fsl_easrc.c
@@ -1132,7 +1132,7 @@ static int fsl_easrc_set_ctx_format(struct fsl_asrc_pair *ctx,
 	struct fsl_easrc_ctx_priv *ctx_priv = ctx->private;
 	struct fsl_easrc_data_fmt *in_fmt = &ctx_priv->in_params.fmt;
 	struct fsl_easrc_data_fmt *out_fmt = &ctx_priv->out_params.fmt;
-	int ret;
+	int ret = 0;
 
 	/* Get the bitfield values for input data format */
 	if (in_raw_format && out_raw_format) {
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH 3/6] powerpc test_emulate_step: enhancement to test negative scenarios
From: Sandipan Das @ 2020-06-22  9:34 UTC (permalink / raw)
  To: Balamuruhan S; +Cc: ravi.bangoria, paulus, jniethe5, naveen.n.rao, linuxppc-dev
In-Reply-To: <20200622070941.759307-4-bala24@linux.ibm.com>

Hi Bala,

On 22/06/20 12:39 pm, Balamuruhan S wrote:
> add provision to declare test is a negative scenario, verify
> whether emulation fails and avoid executing it.
> 
> Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
> ---
>  arch/powerpc/lib/test_emulate_step.c | 46 ++++++++++++++++++++++------
>  1 file changed, 36 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
> index e3b1797adfae..79acc899a618 100644
> --- a/arch/powerpc/lib/test_emulate_step.c
> +++ b/arch/powerpc/lib/test_emulate_step.c
> @@ -703,6 +703,7 @@ struct compute_test {
>  		unsigned long flags;
>  		struct ppc_inst instr;
>  		struct pt_regs regs;
> +		bool negative;
>  	} subtests[MAX_SUBTESTS + 1];
>  };
>  

Bits of 'flags' are currently used to specify if parts of the resulting pt_regs
are to be ignored. Instead of adding a new member to the struct, can we not do
this using a bit in 'flags'?


- Sandipan

^ permalink raw reply

* Re: [PATCH 1/4] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
From: Alexey Kardashevskiy @ 2020-06-22 10:02 UTC (permalink / raw)
  To: Leonardo Bras
  Cc: Ram Pai, linux-kernel, Paul Mackerras, linuxppc-dev,
	Thiago Jung Bauermann
In-Reply-To: <20200619050619.266888-2-leobras.c@gmail.com>



On 19/06/2020 15:06, Leonardo Bras wrote:
> From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
> outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
> 
> This change of output size is meant to expand the address size of
> largest_available_block PE TCE from 32-bit to 64-bit, which ends up
> shifting page_size and migration_capable.
> 
> This ends up requiring the update of
> ddw_query_response->largest_available_block from u32 to u64, and manually
> assigning the values from the buffer into this struct, according to
> output size.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 57 +++++++++++++++++++++-----
>  1 file changed, 46 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 6d47b4a3ce39..e5a617738c8b 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -334,7 +334,7 @@ struct direct_window {
>  /* Dynamic DMA Window support */
>  struct ddw_query_response {
>  	u32 windows_available;
> -	u32 largest_available_block;
> +	u64 largest_available_block;
>  	u32 page_size;
>  	u32 migration_capable;
>  };
> @@ -869,14 +869,32 @@ static int find_existing_ddw_windows(void)
>  }
>  machine_arch_initcall(pseries, find_existing_ddw_windows);
>  
> +/*
> + * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many output
> + * parameters ibm,query-pe-dma-windows will have, ranging from 5 to 6.
> + */
> +
> +static int query_ddw_out_sz(struct device_node *par_dn)

Can easily be folded into query_ddw().

> +{
> +	int ret;
> +	u32 ddw_ext[3];
> +
> +	ret = of_property_read_u32_array(par_dn, "ibm,ddw-extensions",
> +					 &ddw_ext[0], 3);
> +	if (ret || ddw_ext[0] < 2 || ddw_ext[2] != 1)


Oh that PAPR thing again :-/

===
The “ibm,ddw-extensions” property value is a list of integers the first
integer indicates the number of extensions implemented and subsequent
integers, one per extension, provide a value associated with that
extension.
===

So ddw_ext[0] is length.
Listindex==2 is for "reset" says PAPR and
Listindex==3 is for this new 64bit "largest_available_block".

So I'd expect ddw_ext[2] to have the "reset" token and ddw_ext[3] to
have "1" for this new feature but indexes are smaller. I am confused.
Either way these "2" and "3" needs to be defined in macros, "0" probably
too.

Please post 'lsprop "ibm,ddw-extensions"' here. Thanks,



> +		return 5;
> +	return 6;
> +}
> +
>  static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
> -			struct ddw_query_response *query)
> +		     struct ddw_query_response *query,
> +		     struct device_node *par_dn)
>  {
>  	struct device_node *dn;
>  	struct pci_dn *pdn;
> -	u32 cfg_addr;
> +	u32 cfg_addr, query_out[5];
>  	u64 buid;
> -	int ret;
> +	int ret, out_sz;
>  
>  	/*
>  	 * Get the config address and phb buid of the PE window.
> @@ -888,12 +906,29 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
>  	pdn = PCI_DN(dn);
>  	buid = pdn->phb->buid;
>  	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
> +	out_sz = query_ddw_out_sz(par_dn);
> +
> +	ret = rtas_call(ddw_avail[0], 3, out_sz, query_out,
> +			cfg_addr, BUID_HI(buid), BUID_LO(buid));
> +	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
> +		 ddw_avail[0], cfg_addr, BUID_HI(buid), BUID_LO(buid), ret);
> +
> +	switch (out_sz) {
> +	case 5:
> +		query->windows_available = query_out[0];
> +		query->largest_available_block = query_out[1];
> +		query->page_size = query_out[2];
> +		query->migration_capable = query_out[3];
> +		break;
> +	case 6:
> +		query->windows_available = query_out[0];
> +		query->largest_available_block = ((u64)query_out[1] << 32) |
> +						 query_out[2];
> +		query->page_size = query_out[3];
> +		query->migration_capable = query_out[4];
> +		break;
> +	}
>  
> -	ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
> -		  cfg_addr, BUID_HI(buid), BUID_LO(buid));
> -	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
> -		" returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
> -		BUID_LO(buid), ret);
>  	return ret;
>  }
>  
> @@ -1040,7 +1075,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	 * of page sizes: supported and supported for migrate-dma.
>  	 */
>  	dn = pci_device_to_OF_node(dev);
> -	ret = query_ddw(dev, ddw_avail, &query);
> +	ret = query_ddw(dev, ddw_avail, &query, pdn);
>  	if (ret != 0)
>  		goto out_failed;
>  
> @@ -1068,7 +1103,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	/* check largest block * page size > max memory hotplug addr */
>  	max_addr = ddw_memory_hotplug_max();
>  	if (query.largest_available_block < (max_addr >> page_shift)) {
> -		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
> +		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
>  			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
>  			  1ULL << page_shift);
>  		goto out_failed;
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 2/4] powerpc/pseries/iommu: Implement ibm,reset-pe-dma-windows rtas call
From: Alexey Kardashevskiy @ 2020-06-22 10:02 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200619050619.266888-3-leobras.c@gmail.com>



On 19/06/2020 15:06, Leonardo Bras wrote:
> Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> ibm,ddw-extensions. The first extension available (index 2) carries the
> token for ibm,reset-pe-dma-windows rtas call, which is used to restore
> the default DMA window for a device, if it has been deleted.
> 
> It does so by resetting the TCE table allocation for the PE to it's
> boot time value, available in "ibm,dma-window" device tree node.
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 33 ++++++++++++++++++++++++++
>  1 file changed, 33 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index e5a617738c8b..5e1fbc176a37 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1012,6 +1012,39 @@ static phys_addr_t ddw_memory_hotplug_max(void)
>  	return max_addr;
>  }
>  
> +/*
> + * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> + * ibm,ddw-extensions, which carries the rtas token for
> + * ibm,reset-pe-dma-windows.
> + * That rtas-call can be used to restore the default DMA window for the device.
> + */
> +static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> +{
> +	int ret;
> +	u32 cfg_addr, ddw_ext[3];
> +	u64 buid;
> +	struct device_node *dn;
> +	struct pci_dn *pdn;
> +
> +	ret = of_property_read_u32_array(par_dn, "ibm,ddw-extensions",
> +					 &ddw_ext[0], 3);

s/3/2/ as for the reset extension you do not need the "64bit largest
block" extension.


> +	if (ret)
> +		return;
> +
> +	dn = pci_device_to_OF_node(dev);
> +	pdn = PCI_DN(dn);
> +	buid = pdn->phb->buid;
> +	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
> +
> +	ret = rtas_call(ddw_ext[1], 3, 1, NULL, cfg_addr,


Here the "reset" extention is in ddw_ext[1]. Hm. 1/4 has a bug then.

And I am pretty sure it won't compile as reset_dma_window() is not used
and it is static so fold it into one the next patches. Thanks,


> +			BUID_HI(buid), BUID_LO(buid));
> +	if (ret)
> +		dev_info(&dev->dev,
> +			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
> +			 ddw_ext[1], cfg_addr, BUID_HI(buid), BUID_LO(buid),
> +			 ret);
> +}
> +
>  /*
>   * If the PE supports dynamic dma windows, and there is space for a table
>   * that can map all pages in a linear offset, then setup such a table,
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 3/4] powerpc/pseries/iommu: Move window-removing part of remove_ddw into remove_dma_window
From: Alexey Kardashevskiy @ 2020-06-22 10:02 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200619050619.266888-4-leobras.c@gmail.com>



On 19/06/2020 15:06, Leonardo Bras wrote:
> Move the window-removing part of remove_ddw into a new function
> (remove_dma_window), so it can be used to remove other DMA windows.
> 
> It's useful for removing DMA windows that don't create DIRECT64_PROPNAME
> property, like the default DMA window from the device, which uses
> "ibm,dma-window".
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 53 +++++++++++++++-----------
>  1 file changed, 31 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 5e1fbc176a37..de633f6ae093 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -767,25 +767,14 @@ static int __init disable_ddw_setup(char *str)
>  
>  early_param("disable_ddw", disable_ddw_setup);
>  
> -static void remove_ddw(struct device_node *np, bool remove_prop)
> +static void remove_dma_window(struct device_node *pdn, u32 *ddw_avail,

You do not need the entire ddw_avail here, pass just the token you need.

Also, despite this particular file, the "pdn" name is usually used for
struct pci_dn (not device_node), let's keep it that way.


> +			      struct property *win)
>  {
>  	struct dynamic_dma_window_prop *dwp;
> -	struct property *win64;
> -	u32 ddw_avail[3];
>  	u64 liobn;
> -	int ret = 0;
> -
> -	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
> -					 &ddw_avail[0], 3);
> -
> -	win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
> -	if (!win64)
> -		return;
> -
> -	if (ret || win64->length < sizeof(*dwp))
> -		goto delprop;
> +	int ret;
>  
> -	dwp = win64->value;
> +	dwp = win->value;
>  	liobn = (u64)be32_to_cpu(dwp->liobn);
>  
>  	/* clear the whole window, note the arg is in kernel pages */
> @@ -793,24 +782,44 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
>  		1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp);
>  	if (ret)
>  		pr_warn("%pOF failed to clear tces in window.\n",
> -			np);
> +			pdn);
>  	else
>  		pr_debug("%pOF successfully cleared tces in window.\n",
> -			 np);
> +			 pdn);
>  
>  	ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn);
>  	if (ret)
>  		pr_warn("%pOF: failed to remove direct window: rtas returned "
>  			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
> -			np, ret, ddw_avail[2], liobn);
> +			pdn, ret, ddw_avail[2], liobn);
>  	else
>  		pr_debug("%pOF: successfully removed direct window: rtas returned "
>  			"%d to ibm,remove-pe-dma-window(%x) %llx\n",
> -			np, ret, ddw_avail[2], liobn);
> +			pdn, ret, ddw_avail[2], liobn);
> +}
> +
> +static void remove_ddw(struct device_node *np, bool remove_prop)
> +{
> +	struct property *win;
> +	u32 ddw_avail[3];
> +	int ret = 0;
> +
> +	ret = of_property_read_u32_array(np, "ibm,ddw-applicable",
> +					 &ddw_avail[0], 3);
> +	if (ret)
> +		return;
> +
> +	win = of_find_property(np, DIRECT64_PROPNAME, NULL);
> +	if (!win)
> +		return;
> +
> +	if (win->length >= sizeof(struct dynamic_dma_window_prop))


Any good reason not to make it "=="? Is there something optional or we
expect extension (which may not grow from the end but may add cells in
between). Thanks,


> +		remove_dma_window(np, ddw_avail, win);
> +
> +	if (!remove_prop)
> +		return;
>  
> -delprop:
> -	if (remove_prop)
> -		ret = of_remove_property(np, win64);
> +	ret = of_remove_property(np, win);
>  	if (ret)
>  		pr_warn("%pOF: failed to remove direct window property: %d\n",
>  			np, ret);
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 4/4] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Alexey Kardashevskiy @ 2020-06-22 10:02 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <20200619050619.266888-5-leobras.c@gmail.com>



On 19/06/2020 15:06, Leonardo Bras wrote:
> On LoPAR "DMA Window Manipulation Calls", it's recommended to remove the
> default DMA window for the device, before attempting to configure a DDW,
> in order to make the maximum resources available for the next DDW to be
> created.
> 
> This is a requirement for some devices to use DDW, given they only
> allow one DMA window.
> 
> If setting up a new DDW fails anywhere after the removal of this
> default DMA window, restore it using reset_dma_window.

Nah... If we do it like this, then under pHyp we lose 32bit DMA for good
as pHyp can only create a single window and it has to map at
0x800.0000.0000.0000. They probably do not care though.

Under KVM, this will fail as VFIO allows creating  2 windows and it
starts from 0 but the existing iommu_bypass_supported_pSeriesLP() treats
the window address == 0 as a failure. And we want to keep both DMA
windows for PCI adapters with both 64bit and 32bit PCI functions (I
heard AMD GPU video + audio are like this) or someone could hotplug
32bit DMA device on a vphb with already present 64bit DMA window so we
do not remove the default window.

The last discussed thing I remember was that there was supposed to be a
new bit in "ibm,architecture-vec-5" (forgot the details), we could use
that to decide whether to keep the default window or not, like this.

> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 20 +++++++++++++++++---
>  1 file changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index de633f6ae093..68d1ea957ac7 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1074,8 +1074,9 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	u64 dma_addr, max_addr;
>  	struct device_node *dn;
>  	u32 ddw_avail[3];
> +
>  	struct direct_window *window;
> -	struct property *win64;
> +	struct property *win64, *dfl_win;

Make it "default_win" or "def_win", "dfl" hurts to read :)

>  	struct dynamic_dma_window_prop *ddwprop;
>  	struct failed_ddw_pdn *fpdn;
>  
> @@ -1110,8 +1111,19 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	if (ret)
>  		goto out_failed;
>  
> -       /*
> -	 * Query if there is a second window of size to map the
> +	/*
> +	 * First step of setting up DDW is removing the default DMA window,
> +	 * if it's present. It will make all the resources available to the
> +	 * new DDW window.
> +	 * If anything fails after this, we need to restore it.
> +	 */
> +
> +	dfl_win = of_find_property(pdn, "ibm,dma-window", NULL);
> +	if (dfl_win)
> +		remove_dma_window(pdn, ddw_avail, dfl_win);

Before doing so, you want to make sure that the "reset" is actually
supported. Thanks,


> +
> +	/*
> +	 * Query if there is a window of size to map the
>  	 * whole partition.  Query returns number of windows, largest
>  	 * block assigned to PE (partition endpoint), and two bitmasks
>  	 * of page sizes: supported and supported for migrate-dma.
> @@ -1219,6 +1231,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  	kfree(win64);
>  
>  out_failed:
> +	if (dfl_win)
> +		reset_dma_window(dev, pdn);
>  
>  	fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
>  	if (!fpdn)
> 

-- 
Alexey

^ permalink raw reply

* [next-20200621] LTP tests af_alg02/05 failure on POWER9 PowerVM LPAR
From: Sachin Sant @ 2020-06-22 12:25 UTC (permalink / raw)
  To: herbert, linux-crypto; +Cc: Linux Next Mailing List, linuxppc-dev

With recent next(next-20200621) af_alg02/05 tests fail while running on POWER9
PowerVM LPAR.

Results from  5.8.0-rc1-next-20200622
# ./af_alg02
tst_test.c:1096: INFO: Timeout per run is 0h 00m 20s
af_alg02.c:52: BROK: Timed out while reading from request socket.
#

5.8.0-rc1-next-20200618 was good. The test case ran fine.

Root cause analysis point to following commit:

commit f3c802a1f30013f8f723b62d7fa49eb9e991da23
    crypto: algif_aead - Only wake up when ctx->more is zero

Reverting this commit allows the test to PASS.

Results after reverting the mentioned commit:

# uname -r
5.8.0-rc1-next-20200622-dirty
# ./af_alg02 
tst_test.c:1096: INFO: Timeout per run is 0h 00m 20s
af_alg02.c:33: PASS: Successfully "encrypted" an empty message
#
# ./af_alg05
tst_test.c:1096: INFO: Timeout per run is 0h 05m 00s
af_alg05.c:34: PASS: read() expectedly failed with EINVAL
# 

Thanks
-Sachin

^ permalink raw reply

* Re: [RFC PATCH v0 2/5] powerpc/mm/radix: Create separate mappings for hot-plugged memory
From: Aneesh Kumar K.V @ 2020-06-22 12:46 UTC (permalink / raw)
  To: Bharata B Rao, linuxppc-dev
  Cc: leonardo, aneesh.kumar, npiggin, Bharata B Rao
In-Reply-To: <20200406034925.22586-3-bharata@linux.ibm.com>

Bharata B Rao <bharata@linux.ibm.com> writes:

> Memory that gets hot-plugged _during_ boot (and not the memory
> that gets plugged in after boot), is mapped with 1G mappings
> and will undergo splitting when it is unplugged. The splitting
> code has a few issues:
>
> 1. Recursive locking
> --------------------
> Memory unplug path takes cpu_hotplug_lock and calls stop_machine()
> for splitting the mappings. However stop_machine() takes
> cpu_hotplug_lock again causing deadlock.
>
> 2. BUG: sleeping function called from in_atomic() context
> ---------------------------------------------------------
> Memory unplug path (remove_pagetable) takes init_mm.page_table_lock
> spinlock and later calls stop_machine() which does wait_for_completion()
>
> 3. Bad unlock unbalance
> -----------------------
> Memory unplug path takes init_mm.page_table_lock spinlock and calls
> stop_machine(). The stop_machine thread function runs in a different
> thread context (migration thread) which tries to release and reaquire
> ptl. Releasing ptl from a different thread than which acquired it
> causes bad unlock unbalance.
>
> These problems can be avoided if we avoid mapping hot-plugged memory
> with 1G mapping, thereby removing the need for splitting them during
> unplug. During radix init, identify(*) the hot-plugged memory region
> and create separate mappings for each LMB so that they don't get mapped
> with 1G mappings.
>
> To create separate mappings for every LMB in the hot-plugged
> region, we need lmb-size. I am currently using memory_block_size_bytes()
> API to get the lmb-size. Since this is early init time code, the
> machine type isn't probed yet and hence memory_block_size_bytes()
> would return the default LMB size as 16MB. Hence we end up creating
> separate mappings at much lower granularity than what we can ideally
> do for pseries machine.
>
> (*) Identifying and differentiating hot-plugged memory from the
> boot time memory is now possible with PAPR extension to LMB flags.
> (Ref: https://lore.kernel.org/linuxppc-dev/f55a7b65a43cc9dc7b22385cf9960f8b11d5ce2e.camel@linux.ibm.com/T/#t)
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index dd1bea45325c..4a4fb30f6c3d 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -16,6 +16,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/string_helpers.h>
>  #include <linux/stop_machine.h>
> +#include <linux/memory.h>
>  
>  #include <asm/pgtable.h>
>  #include <asm/pgalloc.h>
> @@ -313,6 +314,8 @@ static void __init radix_init_pgtable(void)
>  {
>  	unsigned long rts_field;
>  	struct memblock_region *reg;
> +	phys_addr_t addr;
> +	u64 lmb_size = memory_block_size_bytes();
>  
>  	/* We don't support slb for radix */
>  	mmu_slb_size = 0;
> @@ -331,9 +334,15 @@ static void __init radix_init_pgtable(void)
>  			continue;
>  		}
>  
> -		WARN_ON(create_physical_mapping(reg->base,
> -						reg->base + reg->size,
> -						-1));
> +		if (memblock_is_hotpluggable(reg)) {
> +			for (addr = reg->base; addr < (reg->base + reg->size);
> +				addr += lmb_size)
> +				WARN_ON(create_physical_mapping(addr,
> +				addr + lmb_size, -1));

Is that indentation correct? 

> +		} else
> +			WARN_ON(create_physical_mapping(reg->base,
> +							reg->base + reg->size,
> +							-1));
>  	}
>  
>  	/* Find out how many PID bits are supported */
> -- 
> 2.21.0

^ permalink raw reply

* Re: [RFC PATCH v0 3/5] powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings
From: Aneesh Kumar K.V @ 2020-06-22 12:53 UTC (permalink / raw)
  To: Bharata B Rao, linuxppc-dev
  Cc: leonardo, aneesh.kumar, npiggin, Bharata B Rao
In-Reply-To: <20200406034925.22586-4-bharata@linux.ibm.com>

Bharata B Rao <bharata@linux.ibm.com> writes:

> We can hit the following BUG_ON during memory unplug
>
> kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:344!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> NIP [c000000000097d48] pmd_fragment_free+0x48/0xd0
> LR [c0000000016aaefc] remove_pagetable+0x494/0x530
> Call Trace:
> _raw_spin_lock+0x54/0x80 (unreliable)
> remove_pagetable+0x2b0/0x530
> radix__remove_section_mapping+0x18/0x2c
> remove_section_mapping+0x38/0x5c
> arch_remove_memory+0x124/0x190
> try_remove_memory+0xd0/0x1c0
> __remove_memory+0x20/0x40
> dlpar_remove_lmb+0xbc/0x110
> dlpar_memory+0xa90/0xd40
> handle_dlpar_errorlog+0xa8/0x160
> pseries_hp_work_fn+0x2c/0x60
> process_one_work+0x47c/0x870
> worker_thread+0x364/0x5e0
> kthread+0x1b4/0x1c0
> ret_from_kernel_thread+0x5c/0x74
>
> This occurs when unplug is attempted for such memory which has
> been mapped using memblock pages as part of early kernel page
> table setup. We wouldn't have initialized the PMD or PTE fragment
> count for those PMD or PTE pages.
>
> Fixing this includes 3 parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment count to 1.
> - When freeing PUD, PMD and PTE page table pages, check explicitly
>   if they come from memblock and if so free then appropriately.
> - When we do early memblock based allocation of PMD and PUD pages,
>   allocate in PAGE_SIZE granularity so that we are sure the
>   complete page is used as pagetable page.
>
> Since we now do PAGE_SIZE allocations for both PUD table and
> PMD table (Note that PTE table allocation is already of PAGE_SIZE),
> we end up allocating more memory for the same amount of system RAM.
> Here is a comparision of how much more we need for a 64T and 2G
> system after this patch:
>
> 1. 64T system
> -------------
> 64T RAM would need 64G for vmemmap with struct page size being 64B.
>
> 128 PUD tables for 64T memory (1G mappings)
> 1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
> With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K
>
> 2. 2G system
> ------------
> 2G RAM would need 2M for vmemmap with struct page size being 64B.
>
> 1 PUD table for 2G memory (1G mapping)
> 1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
> With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/pgalloc.h | 11 ++-
>  arch/powerpc/include/asm/book3s/64/radix.h   |  1 +
>  arch/powerpc/include/asm/sparsemem.h         |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c           | 31 ++++++++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c     | 72 ++++++++++++++++++--
>  arch/powerpc/mm/mem.c                        |  5 ++
>  arch/powerpc/mm/pgtable-frag.c               |  9 ++-
>  7 files changed, 121 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index a41e91bd0580..e96572fb2871 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -109,7 +109,16 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>  
>  static inline void pud_free(struct mm_struct *mm, pud_t *pud)
>  {
> -	kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> +	struct page *page = virt_to_page(pud);
> +
> +	/*
> +	 * Early pud pages allocated via memblock allocator
> +	 * can't be directly freed to slab
> +	 */
> +	if (PageReserved(page))
> +		free_reserved_page(page);
> +	else
> +		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
>  }
>  
>  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index d97db3ad9aae..0aff8750181a 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -291,6 +291,7 @@ static inline unsigned long radix__get_tree_size(void)
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end, int nid);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index 3192d454a733..e662f9232d35 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -15,6 +15,7 @@
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  extern int create_section_mapping(unsigned long start, unsigned long end, int nid);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>  
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 2bf7e1b4fd82..be7aa8786747 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -186,6 +186,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>  
>  	return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +	if (radix_enabled())
> +		radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  
>  void __init mmu_partition_table_init(void)
> @@ -343,13 +350,23 @@ void pmd_fragment_free(unsigned long *pmd)
>  
>  	BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> -		pgtable_pmd_page_dtor(page);
> -		__free_page(page);
> +		/*
> +		 * Early pmd pages allocated via memblock
> +		 * allocator wouldn't have called _ctor
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else {
> +			pgtable_pmd_page_dtor(page);
> +			__free_page(page);
> +		}
>  	}
>  }
>  
>  static inline void pgtable_free(void *table, int index)
>  {
> +	struct page *page;
> +
>  	switch (index) {
>  	case PTE_INDEX:
>  		pte_fragment_free(table, 0);
> @@ -358,7 +375,15 @@ static inline void pgtable_free(void *table, int index)
>  		pmd_fragment_free(table);
>  		break;
>  	case PUD_INDEX:
> -		kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> +		page = virt_to_page(table);
> +		/*
> +		 * Early pud pages allocated via memblock
> +		 * allocator need to be freed differently
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
>  		break;
>  #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
>  		/* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 4a4fb30f6c3d..e675c0bbf9a4 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -36,6 +36,70 @@
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
>  
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +		pte_t *pte;
> +		struct page *page;
> +
> +		if (pmd_none(*pmd))
> +			continue;
> +		if (pmd_is_leaf(*pmd))
> +			continue;
> +
> +		pte = pte_offset_kernel(pmd, 0);
> +		page = virt_to_page(pte);
> +		atomic_inc(&page->pt_frag_refcount);
> +	}
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +		pmd_t *pmd;
> +		struct page *page;
> +
> +		if (pud_none(*pud))
> +			continue;
> +		if (pud_is_leaf(*pud))
> +			continue;
> +
> +		pmd = pmd_offset(pud, 0);
> +		page = virt_to_page(pmd);
> +		atomic_inc(&page->pt_frag_refcount);
> +		fixup_pte_fragments(pmd);
> +	}
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> +	int i;
> +	pgd_t *pgd = pgd_offset_k(0UL);
> +
> +	spin_lock(&init_mm.page_table_lock);
> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +		pud_t *pud;
> +
> +		if (pgd_none(*pgd))
> +			continue;
> +		if (pgd_is_leaf(*pgd))
> +			continue;
> +
> +		pud = pud_offset(pgd, 0);
> +		fixup_pmd_fragments(pud);
> +	}
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
>  static __ref void *early_alloc_pgtable(unsigned long size, int nid,
>  			unsigned long region_start, unsigned long region_end)
>  {
> @@ -71,8 +135,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  
>  	pgdp = pgd_offset_k(ea);
>  	if (pgd_none(*pgdp)) {
> -		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> -						region_start, region_end);
> +		pudp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pgd_populate(&init_mm, pgdp, pudp);
>  	}
>  	pudp = pud_offset(pgdp, ea);
> @@ -81,8 +145,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  		goto set_the_pte;
>  	}
>  	if (pud_none(*pudp)) {
> -		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> -						region_start, region_end);
> +		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pud_populate(&init_mm, pudp, pmdp);
>  	}
>  	pmdp = pmd_offset(pudp, ea);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 1c07d5a3f543..d43ad701f693 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -53,6 +53,10 @@
>  
>  #include <mm/mmu_decl.h>
>  
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
>  #ifndef CPU_FTR_COHERENT_ICACHE
>  #define CPU_FTR_COHERENT_ICACHE	0	/* XXX for now */
>  #define CPU_FTR_NOEXECUTE	0
> @@ -307,6 +311,7 @@ void __init mem_init(void)
>  
>  	memblock_free_all();
>  
> +	fixup_pgtable_fragments();
>  #ifdef CONFIG_HIGHMEM
>  	{
>  		unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index ee4bd6d38602..16213c09896a 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
>  	if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>  		if (!kernel)
>  			pgtable_pte_page_dtor(page);
> -		__free_page(page);
> +		/*
> +		 * Early pte pages allocated via memblock
> +		 * allocator need to be freed differently
> +		 */
> +		if (PageReserved(page))
> +			free_reserved_page(page);
> +		else
> +			__free_page(page);
>  	}
>  }
> -- 
> 2.21.0

^ permalink raw reply

* Re: [RFC PATCH v0 4/5] powerpc/mm/radix: Free PUD table when freeing pagetable
From: Aneesh Kumar K.V @ 2020-06-22 13:07 UTC (permalink / raw)
  To: Bharata B Rao, linuxppc-dev
  Cc: leonardo, aneesh.kumar, npiggin, Bharata B Rao
In-Reply-To: <20200406034925.22586-5-bharata@linux.ibm.com>

Bharata B Rao <bharata@linux.ibm.com> writes:

> remove_pagetable() isn't freeing PUD table. This causes memory
> leak during memory unplug. Fix this.
>

We had changes w.r.t p4d (folded 5 level table). You may want to get
this updated to recent kernel.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>


> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
>
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index e675c0bbf9a4..0d9ef3277579 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -767,6 +767,21 @@ static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>  	pud_clear(pud);
>  }
>  
> +static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (!pud_none(*pud))
> +			return;
> +	}
> +
> +	pud_free(&init_mm, pud_start);
> +	pgd_clear(pgd);
> +}
> +
>  struct change_mapping_params {
>  	pte_t *pte;
>  	unsigned long start;
> @@ -937,6 +952,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
>  
>  		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
>  		remove_pud_table(pud_base, addr, next);
> +		free_pud_table(pud_base, pgd);
>  	}
>  
>  	spin_unlock(&init_mm.page_table_lock);
> -- 
> 2.21.0

^ permalink raw reply

* Re: [RFC PATCH v0 5/5] powerpc/mm/radix: Remove split_kernel_mapping()
From: Aneesh Kumar K.V @ 2020-06-22 13:07 UTC (permalink / raw)
  To: Bharata B Rao, linuxppc-dev
  Cc: leonardo, aneesh.kumar, npiggin, Bharata B Rao
In-Reply-To: <20200406034925.22586-6-bharata@linux.ibm.com>

Bharata B Rao <bharata@linux.ibm.com> writes:

> With hot-plugged memory getting mapped with 2M mappings always,
> there will be no need to split any mappings during unplug.
>
> Hence remove split_kernel_mapping() and associated code. This
> essentially is a revert of
> commit 4dd5f8a99e791 ("powerpc/mm/radix: Split linear mapping on hot-unplug")
>

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>

> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 93 +++++-------------------
>  1 file changed, 19 insertions(+), 74 deletions(-)
>
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 0d9ef3277579..56f2c698deac 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -15,7 +15,6 @@
>  #include <linux/mm.h>
>  #include <linux/hugetlb.h>
>  #include <linux/string_helpers.h>
> -#include <linux/stop_machine.h>
>  #include <linux/memory.h>
>  
>  #include <asm/pgtable.h>
> @@ -782,30 +781,6 @@ static void free_pud_table(pud_t *pud_start, pgd_t *pgd)
>  	pgd_clear(pgd);
>  }
>  
> -struct change_mapping_params {
> -	pte_t *pte;
> -	unsigned long start;
> -	unsigned long end;
> -	unsigned long aligned_start;
> -	unsigned long aligned_end;
> -};
> -
> -static int __meminit stop_machine_change_mapping(void *data)
> -{
> -	struct change_mapping_params *params =
> -			(struct change_mapping_params *)data;
> -
> -	if (!data)
> -		return -1;
> -
> -	spin_unlock(&init_mm.page_table_lock);
> -	pte_clear(&init_mm, params->aligned_start, params->pte);
> -	create_physical_mapping(__pa(params->aligned_start), __pa(params->start), -1);
> -	create_physical_mapping(__pa(params->end), __pa(params->aligned_end), -1);
> -	spin_lock(&init_mm.page_table_lock);
> -	return 0;
> -}
> -
>  static void remove_pte_table(pte_t *pte_start, unsigned long addr,
>  			     unsigned long end)
>  {
> @@ -834,52 +809,6 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
>  	}
>  }
>  
> -/*
> - * clear the pte and potentially split the mapping helper
> - */
> -static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
> -				unsigned long size, pte_t *pte)
> -{
> -	unsigned long mask = ~(size - 1);
> -	unsigned long aligned_start = addr & mask;
> -	unsigned long aligned_end = addr + size;
> -	struct change_mapping_params params;
> -	bool split_region = false;
> -
> -	if ((end - addr) < size) {
> -		/*
> -		 * We're going to clear the PTE, but not flushed
> -		 * the mapping, time to remap and flush. The
> -		 * effects if visible outside the processor or
> -		 * if we are running in code close to the
> -		 * mapping we cleared, we are in trouble.
> -		 */
> -		if (overlaps_kernel_text(aligned_start, addr) ||
> -			overlaps_kernel_text(end, aligned_end)) {
> -			/*
> -			 * Hack, just return, don't pte_clear
> -			 */
> -			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
> -				  "text, not splitting\n", addr, end);
> -			return;
> -		}
> -		split_region = true;
> -	}
> -
> -	if (split_region) {
> -		params.pte = pte;
> -		params.start = addr;
> -		params.end = end;
> -		params.aligned_start = addr & ~(size - 1);
> -		params.aligned_end = min_t(unsigned long, aligned_end,
> -				(unsigned long)__va(memblock_end_of_DRAM()));
> -		stop_machine(stop_machine_change_mapping, &params, NULL);
> -		return;
> -	}
> -
> -	pte_clear(&init_mm, addr, pte);
> -}
> -
>  static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>  			     unsigned long end)
>  {
> @@ -895,7 +824,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>  			continue;
>  
>  		if (pmd_is_leaf(*pmd)) {
> -			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
> +			if (!IS_ALIGNED(addr, PMD_SIZE) ||
> +			    !IS_ALIGNED(next, PMD_SIZE)) {
> +				WARN_ONCE(1, "%s: unaligned range\n", __func__);
> +				continue;
> +			}
> +			pte_clear(&init_mm, addr, (pte_t *)pmd);
>  			continue;
>  		}
>  
> @@ -920,7 +854,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
>  			continue;
>  
>  		if (pud_is_leaf(*pud)) {
> -			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
> +			if (!IS_ALIGNED(addr, PUD_SIZE) ||
> +			    !IS_ALIGNED(next, PUD_SIZE)) {
> +				WARN_ONCE(1, "%s: unaligned range\n", __func__);
> +				continue;
> +			}
> +			pte_clear(&init_mm, addr, (pte_t *)pud);
>  			continue;
>  		}
>  
> @@ -946,7 +885,13 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
>  			continue;
>  
>  		if (pgd_is_leaf(*pgd)) {
> -			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
> +			if (!IS_ALIGNED(addr, PGDIR_SIZE) ||
> +			    !IS_ALIGNED(next, PGDIR_SIZE)) {
> +				WARN_ONCE(1, "%s: unaligned range\n", __func__);
> +				continue;
> +			}
> +
> +			pte_clear(&init_mm, addr, (pte_t *)pgd);
>  			continue;
>  		}
>  
> -- 
> 2.21.0

^ permalink raw reply

* Re: [RFC PATCH v0 3/5] powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings
From: Aneesh Kumar K.V @ 2020-06-22 13:22 UTC (permalink / raw)
  To: Bharata B Rao, linuxppc-dev
  Cc: leonardo, aneesh.kumar, npiggin, Bharata B Rao
In-Reply-To: <20200406034925.22586-4-bharata@linux.ibm.com>


....

 @@ -71,8 +135,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  
>  	pgdp = pgd_offset_k(ea);
>  	if (pgd_none(*pgdp)) {
> -		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> -						region_start, region_end);
> +		pudp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pgd_populate(&init_mm, pgdp, pudp);


Add a comment here explaining why we are using PAGE_SIZE instead of the
required PUD_TABLE_SIZE.

>  	}
>  	pudp = pud_offset(pgdp, ea);
> @@ -81,8 +145,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>  		goto set_the_pte;
>  	}
>  	if (pud_none(*pudp)) {
> -		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> -						region_start, region_end);
> +		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +					   region_end);
>  		pud_populate(&init_mm, pudp, pmdp);
>  	}
>  	pmdp = pmd_offset(pudp, ea);


-aneesh

^ permalink raw reply

* Re: [next-20200621] LTP tests af_alg02/05 failure on POWER9 PowerVM LPAR
From: Herbert Xu @ 2020-06-22 13:33 UTC (permalink / raw)
  To: Sachin Sant; +Cc: Linux Next Mailing List, linuxppc-dev, linux-crypto
In-Reply-To: <DF21C7B5-3824-4A6E-B59C-78B67E247383@linux.vnet.ibm.com>

On Mon, Jun 22, 2020 at 05:55:29PM +0530, Sachin Sant wrote:
> With recent next(next-20200621) af_alg02/05 tests fail while running on POWER9
> PowerVM LPAR.
> 
> Results from  5.8.0-rc1-next-20200622
> # ./af_alg02
> tst_test.c:1096: INFO: Timeout per run is 0h 00m 20s
> af_alg02.c:52: BROK: Timed out while reading from request socket.
> #
> 
> 5.8.0-rc1-next-20200618 was good. The test case ran fine.
> 
> Root cause analysis point to following commit:
> 
> commit f3c802a1f30013f8f723b62d7fa49eb9e991da23
>     crypto: algif_aead - Only wake up when ctx->more is zero
> 
> Reverting this commit allows the test to PASS.

Yes that commit does change the behaviour if you don't terminate
your AEAD request by clearing the MSG_MORE flag.  AEAD does not
support chaining so each request must be sent in full before it
can be processed through recvmsg(2).  Setting the MSG_MORE flag
indicates that your request has not been sent in full.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH] hmi: Move hmi irq stat from percpu variable to paca.
From: Mahesh Salgaonkar @ 2020-06-22 15:16 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Aneesh Kumar K.V, Nicholas Piggin

With the patch series posted at
https://lore.kernel.org/linuxppc-dev/20200608070904.387440-1-aneesh.kumar@linux.ibm.com/
the percpu first chunk memory area can come from vmalloc ranges. This makes
kernel to crash whenever percpu variable is accessed in real mode.  This
patch fixes this issue by moving the hmi irq stat inside paca for safe
access in realmode.

Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
---
Machine check handling as well touches percpu variables in realmode. Will
address that in separate patchset.
---
 arch/powerpc/include/asm/hardirq.h |    1 -
 arch/powerpc/include/asm/paca.h    |    1 +
 arch/powerpc/kernel/irq.c          |    4 ++--
 arch/powerpc/kernel/mce.c          |    2 +-
 arch/powerpc/kvm/book3s_hv_ras.c   |    2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h
index f1e9067bd5ac..f133b5930ae1 100644
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -13,7 +13,6 @@ typedef struct {
 	unsigned int pmu_irqs;
 	unsigned int mce_exceptions;
 	unsigned int spurious_irqs;
-	unsigned int hmi_exceptions;
 	unsigned int sreset_irqs;
 #ifdef CONFIG_PPC_WATCHDOG
 	unsigned int soft_nmi_irqs;
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 45a839a7c6cf..cc07c399306e 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -225,6 +225,7 @@ struct paca_struct {
 	u16 in_mce;
 	u8 hmi_event_available;		/* HMI event is available */
 	u8 hmi_p9_special_emu;		/* HMI P9 special emulation */
+	u32 hmi_irqs;			/* HMI irq stat */
 #endif
 	u8 ftrace_enabled;		/* Hard disable ftrace */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 112d150354b2..d7df03a2958d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -625,7 +625,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%*s: ", prec, "HMI");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ",
-					per_cpu(irq_stat, j).hmi_exceptions);
+					paca_ptrs[j]->hmi_irqs);
 		seq_printf(p, "  Hypervisor Maintenance Interrupts\n");
 	}
 
@@ -665,7 +665,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += per_cpu(irq_stat, cpu).mce_exceptions;
 	sum += per_cpu(irq_stat, cpu).spurious_irqs;
 	sum += per_cpu(irq_stat, cpu).timer_irqs_others;
-	sum += per_cpu(irq_stat, cpu).hmi_exceptions;
+	sum += paca_ptrs[cpu]->hmi_irqs;
 	sum += per_cpu(irq_stat, cpu).sreset_irqs;
 #ifdef CONFIG_PPC_WATCHDOG
 	sum += per_cpu(irq_stat, cpu).soft_nmi_irqs;
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd90c0eda229..dc11fc16750f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -711,7 +711,7 @@ long hmi_exception_realmode(struct pt_regs *regs)
 {	
 	int ret;
 
-	__this_cpu_inc(irq_stat.hmi_exceptions);
+	local_paca->hmi_irqs++;
 
 	ret = hmi_handle_debugtrig(regs);
 	if (ret >= 0)
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 79f7d07ef674..6028628ea3ac 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -244,7 +244,7 @@ long kvmppc_realmode_hmi_handler(void)
 {
 	bool resync_req;
 
-	__this_cpu_inc(irq_stat.hmi_exceptions);
+	local_paca->hmi_irqs++;
 
 	if (hmi_handle_debugtrig(NULL) >= 0)
 		return 1;



^ permalink raw reply related

* Re: [PATCH] ASoC: fsl_mqs: Fix unchecked return value for clk_prepare_enable
From: Markus Elfring @ 2020-06-22 16:07 UTC (permalink / raw)
  To: Shengjiu Wang, alsa-devel, linuxppc-dev
  Cc: Timur Tabi, Xiubo Li, Takashi Iwai, kernel-janitors, linux-kernel,
	Jaroslav Kysela, Nicolin Chen, Mark Brown, Fabio Estevam

> Fix unchecked return value for clk_prepare_enable.
>
> And because clk_prepare_enable and clk_disable_unprepare should
> check input clock parameter is NULL or not, then we don't need
> to check it before calling the function.

I propose to split the adjustment of two function implementations
into separate update steps for a small patch series.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/process/submitting-patches.rst?id=625d3449788f85569096780592549d0340e9c0c7#n138

I suggest to improve the change descriptions accordingly.

Regards,
Markus

^ permalink raw reply

* Re: [PATCH 1/4] powerpc/pseries/iommu: Update call to ibm,query-pe-dma-windows
From: Leonardo Bras @ 2020-06-22 18:58 UTC (permalink / raw)
  To: Alexey Kardashevskiy
  Cc: Ram Pai, linux-kernel, Paul Mackerras, linuxppc-dev,
	Thiago Jung Bauermann
In-Reply-To: <cfbcacde-ca7f-5fc7-2fcf-267f698f3d49@ozlabs.ru>

Hello Alexey, thank you for the feedback!

On Mon, 2020-06-22 at 20:02 +1000, Alexey Kardashevskiy wrote:
> 
> On 19/06/2020 15:06, Leonardo Bras wrote:
> > From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can make the number of
> > outputs from "ibm,query-pe-dma-windows" go from 5 to 6.
> > 
> > This change of output size is meant to expand the address size of
> > largest_available_block PE TCE from 32-bit to 64-bit, which ends up
> > shifting page_size and migration_capable.
> > 
> > This ends up requiring the update of
> > ddw_query_response->largest_available_block from u32 to u64, and manually
> > assigning the values from the buffer into this struct, according to
> > output size.
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 57 +++++++++++++++++++++-----
> >  1 file changed, 46 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 6d47b4a3ce39..e5a617738c8b 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -334,7 +334,7 @@ struct direct_window {
> >  /* Dynamic DMA Window support */
> >  struct ddw_query_response {
> >  	u32 windows_available;
> > -	u32 largest_available_block;
> > +	u64 largest_available_block;
> >  	u32 page_size;
> >  	u32 migration_capable;
> >  };
> > @@ -869,14 +869,32 @@ static int find_existing_ddw_windows(void)
> >  }
> >  machine_arch_initcall(pseries, find_existing_ddw_windows);
> >  
> > +/*
> > + * From LoPAR level 2.8, "ibm,ddw-extensions" index 3 can rule how many output
> > + * parameters ibm,query-pe-dma-windows will have, ranging from 5 to 6.
> > + */
> > +
> > +static int query_ddw_out_sz(struct device_node *par_dn)
> 
> Can easily be folded into query_ddw().

Sure, but it will get inlined by the compiler, and I think it reads
better this way. 

I mean, I understand you have a reason to think it's better to fold it
in query_ddw(), and I would like to better understand that to improve
my code in the future.

> > +{
> > +	int ret;
> > +	u32 ddw_ext[3];
> > +
> > +	ret = of_property_read_u32_array(par_dn, "ibm,ddw-extensions",
> > +					 &ddw_ext[0], 3);
> > +	if (ret || ddw_ext[0] < 2 || ddw_ext[2] != 1)
> 
> Oh that PAPR thing again :-/
> 
> ===
> The “ibm,ddw-extensions” property value is a list of integers the first
> integer indicates the number of extensions implemented and subsequent
> integers, one per extension, provide a value associated with that
> extension.
> ===
> 
> So ddw_ext[0] is length.
> Listindex==2 is for "reset" says PAPR and
> Listindex==3 is for this new 64bit "largest_available_block".
> 
> So I'd expect ddw_ext[2] to have the "reset" token and ddw_ext[3] to
> have "1" for this new feature but indexes are smaller. I am confused.
> Either way these "2" and "3" needs to be defined in macros, "0" probably
> too.

Remember these indexes are not C-like 0-starting indexes, where the
size would be Listindex==1.
Basically, in C-like array it's :
a[0] == size, 
a[1] == reset_token, 
a[2] == new 64bit "largest_available_block"

> Please post 'lsprop "ibm,ddw-extensions"' here. Thanks,

Sure:
[root@host pci@800000029004005]# lsprop "ibm,ddw-extensions"
ibm,dd
w-extensions
                 00000002 00000056 00000000


> 
> > +		return 5;
> > +	return 6;
> > +}
> > +
> >  static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
> > -			struct ddw_query_response *query)
> > +		     struct ddw_query_response *query,
> > +		     struct device_node *par_dn)
> >  {
> >  	struct device_node *dn;
> >  	struct pci_dn *pdn;
> > -	u32 cfg_addr;
> > +	u32 cfg_addr, query_out[5];
> >  	u64 buid;
> > -	int ret;
> > +	int ret, out_sz;
> >  
> >  	/*
> >  	 * Get the config address and phb buid of the PE window.
> > @@ -888,12 +906,29 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
> >  	pdn = PCI_DN(dn);
> >  	buid = pdn->phb->buid;
> >  	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
> > +	out_sz = query_ddw_out_sz(par_dn);
> > +
> > +	ret = rtas_call(ddw_avail[0], 3, out_sz, query_out,
> > +			cfg_addr, BUID_HI(buid), BUID_LO(buid));
> > +	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x returned %d\n",
> > +		 ddw_avail[0], cfg_addr, BUID_HI(buid), BUID_LO(buid), ret);
> > +
> > +	switch (out_sz) {
> > +	case 5:
> > +		query->windows_available = query_out[0];
> > +		query->largest_available_block = query_out[1];
> > +		query->page_size = query_out[2];
> > +		query->migration_capable = query_out[3];
> > +		break;
> > +	case 6:
> > +		query->windows_available = query_out[0];
> > +		query->largest_available_block = ((u64)query_out[1] << 32) |
> > +						 query_out[2];
> > +		query->page_size = query_out[3];
> > +		query->migration_capable = query_out[4];
> > +		break;
> > +	}
> >  
> > -	ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
> > -		  cfg_addr, BUID_HI(buid), BUID_LO(buid));
> > -	dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x"
> > -		" returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid),
> > -		BUID_LO(buid), ret);
> >  	return ret;
> >  }
> >  
> > @@ -1040,7 +1075,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> >  	 * of page sizes: supported and supported for migrate-dma.
> >  	 */
> >  	dn = pci_device_to_OF_node(dev);
> > -	ret = query_ddw(dev, ddw_avail, &query);
> > +	ret = query_ddw(dev, ddw_avail, &query, pdn);
> >  	if (ret != 0)
> >  		goto out_failed;
> >  
> > @@ -1068,7 +1103,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> >  	/* check largest block * page size > max memory hotplug addr */
> >  	max_addr = ddw_memory_hotplug_max();
> >  	if (query.largest_available_block < (max_addr >> page_shift)) {
> > -		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
> > +		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
> >  			  "%llu-sized pages\n", max_addr,  query.largest_available_block,
> >  			  1ULL << page_shift);
> >  		goto out_failed;
> > 

Best regards,
Leonardo


^ permalink raw reply

* Re: [PATCH] powerpc/pseries: new lparcfg key/value pair: partition_affinity_score
From: Tyrel Datwyler @ 2020-06-22 18:59 UTC (permalink / raw)
  To: Nathan Lynch, Scott Cheloha, linuxppc-dev
In-Reply-To: <87d05ufugs.fsf@linux.ibm.com>

On 6/19/20 5:32 PM, Nathan Lynch wrote:
> Hi Tyrel,
> 
> Tyrel Datwyler <tyreld@linux.ibm.com> writes:
>> On 6/19/20 8:34 AM, Scott Cheloha wrote:
>>> The H_GetPerformanceCounterInfo PHYP hypercall has a subcall,
>>> Affinity_Domain_Info_By_Partition, which returns, among other things,
>>> a "partition affinity score" for a given LPAR.  This score, a value on
>>> [0-100], represents the processor-memory affinity for the LPAR in
>>> question.  A score of 0 indicates the worst possible affinity while a
>>> score of 100 indicates perfect affinity.  The score can be used to
>>> reason about performance.
>>>
>>> This patch adds the score for the local LPAR to the lparcfg procfile
>>> under a new 'partition_affinity_score' key.
>>
>> I expect that you will probably get a NACK from Michael on this. The overall
>> desire is to move away from these dated /proc interfaces. While its true that I
>> did add a new value recently it was strictly to facilitate and correct the
>> calculation of a derived value that was already dependent on a couple other
>> existing values in lparcfg.
>>
>> With that said I would expect that you would likely be advised to expose this as
>> a sysfs attribute. The question is where? We probably should put some thought in
>> to this as I would like to port each lparcfg value over to sysfs so that we can
>> move to deprecating lparcfg. Putting everything under something like
>> /sys/kernel/lparcfg/* maybe. Michael may have a better suggestion.
> 
> I think this score fits pretty naturally in lparcfg: it's a simple
> metric that is specific to the pseries/papr platform, like everything
> else in there.
> 
> A few dozen key=value pairs contained in a single file is simple and
> efficient, unlike sysfs with its rather inconsistently applied
> one-value-per-file convention. Surely it's OK if lparcfg gains a line
> every few years?
> 

So, two things:

1.) I wanted to give fore warning that Michael is generally reluctant to add new
things to lparcfg and I wanted to prepare you for that possibility.

2.) As a simple metric who is consuming said metric? I've not seen any patches
to powerpc-utils in prep so should I be expecting this to be exposed via
lparstat, or are we expecting new tooling to also start parsing lparcfg? If we
are planning for users to consume it via existing powerpc-utils tooling than its
probably easier to make the argument that it fits into lparcfg. If we are
creating new tooling, or expecting users to extract that value on their own I
think the sysfs route is going to be favored.

-Tyrel

> 
>>> The H_GetPerformanceCounterInfo hypercall is already used elsewhere in
>>> the kernel, in powerpc/perf/hv-gpci.c.  Refactoring that code and this
>>> code into a more general API might be worthwhile if additional modules
>>> require the hypercall in the future.
>>
>> If you are duplicating code its likely you should already be doing this. See the
>> rest of my comments about below.
>>
>>>
>>> Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
>>> ---
>>>  arch/powerpc/platforms/pseries/lparcfg.c | 53 ++++++++++++++++++++++++
>>>  1 file changed, 53 insertions(+)
>>>
>>> diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
>>> index b8d28ab88178..b75151eee0f0 100644
>>> --- a/arch/powerpc/platforms/pseries/lparcfg.c
>>> +++ b/arch/powerpc/platforms/pseries/lparcfg.c
>>> @@ -136,6 +136,57 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data)
>>>  	return rc;
>>>  }
>>>  
>>> +/*
>>> + * Based on H_GetPerformanceCounterInfo v1.10.
>>> + */
>>> +static void show_gpci_data(struct seq_file *m)
>>> +{
>>> +	struct perf_counter_info_params {
>>> +		__be32 counter_request;
>>> +		__be32 starting_index;
>>> +		__be16 secondary_index;
>>> +		__be16 returned_values;
>>> +		__be32 detail_rc;
>>> +		__be16 counter_value_element_size;
>>> +		u8     counter_info_version_in;
>>> +		u8     counter_info_version_out;
>>> +		u8     reserved[0xC];
>>> +	} __packed;
>>
>> This looks to duplicate the hv_get_perf_counter_info_params struct from
>> arch/powerpc/perf/hv-gpci.h. Maybe this include file needs to move to
>> arch/powerpc/asm/inlcude so you don't have to redefine this struct.
>>
>>> +	struct hv_gpci_request_buffer {
>>> +		struct perf_counter_info_params params;
>>> +		u8 output[4096 - sizeof(struct perf_counter_info_params)];
>>> +	} __packed;
>>
>> This struct is code duplication of the one defined in
>> arch/powerpc/perf/hv-gpci.c and could be moved into hv-gpci.h along with
>> HGPCI_MAX_DATA_BYTES so that you can use those versions here.
> 
> I tend to agree with these comments.
> 
> 
>>> +	struct hv_gpci_request_buffer *buf;
>>> +	long ret;
>>> +	unsigned int affinity_score;
>>> +
>>> +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
>>> +	if (buf == NULL)
>>> +		return;
>>> +
>>> +	/*
>>> +	 * Show the local LPAR's affinity score.
>>> +	 *
>>> +	 * 0xB1 selects the Affinity_Domain_Info_By_Partition subcall.
>>> +	 * The score is at byte 0xB in the output buffer.
>>> +	 */
>>> +	memset(&buf->params, 0, sizeof(buf->params));
>>> +	buf->params.counter_request = cpu_to_be32(0xB1);
>>> +	buf->params.starting_index = cpu_to_be32(-1);	/* local LPAR */
>>> +	buf->params.counter_info_version_in = 0x5;	/* v5+ for score */
>>> +	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO, virt_to_phys(buf),
>>> +				 sizeof(*buf));
>>> +	if (ret != H_SUCCESS) {
>>> +		pr_debug("hcall failed: H_GET_PERF_COUNTER_INFO: %ld, %x\n",
>>> +			 ret, be32_to_cpu(buf->params.detail_rc));
>>> +		goto out;
>>> +	}
>>> +	affinity_score = buf->output[0xB];
>>> +	seq_printf(m, "partition_affinity_score=%u\n", affinity_score);
>>> +out:
>>> +	kfree(buf);
>>> +}
>>> +
>>
>> IIUC we should already be able to get this value from userspace using perf tool,
>> right? If thats the case can't we also programatically retrieve it via the
>> perf_event interface in userspace as well?
> 
> No... I had the same thought when I first looked at this, but perf is
> for sampling counters, and the partition affinity score is not a
> counter.

But, its obtained through the same H_GET_PERF_COUNTER_INFO interface as the rest
of the performance counters. Shouldn't it be obtainable via 'perf stat -e
hv-gpci/*' and therefore also via perf_event? I really am not that familiar with
perf so correct me on whatever I'm missing here.

-Tyrel

^ permalink raw reply

* Re: [PATCH 2/4] powerpc/pseries/iommu: Implement ibm,reset-pe-dma-windows rtas call
From: Leonardo Bras @ 2020-06-22 18:58 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <2f004ecc-4788-47b6-e9ae-0c08d4723008@ozlabs.ru>

Hello Alexey, thanks for the feedback!

On Mon, 2020-06-22 at 20:02 +1000, Alexey Kardashevskiy wrote:
> 
> On 19/06/2020 15:06, Leonardo Bras wrote:
> > Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> > ibm,ddw-extensions. The first extension available (index 2) carries the
> > token for ibm,reset-pe-dma-windows rtas call, which is used to restore
> > the default DMA window for a device, if it has been deleted.
> > 
> > It does so by resetting the TCE table allocation for the PE to it's
> > boot time value, available in "ibm,dma-window" device tree node.
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 33 ++++++++++++++++++++++++++
> >  1 file changed, 33 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index e5a617738c8b..5e1fbc176a37 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -1012,6 +1012,39 @@ static phys_addr_t ddw_memory_hotplug_max(void)
> >  	return max_addr;
> >  }
> >  
> > +/*
> > + * Platforms supporting the DDW option starting with LoPAR level 2.7 implement
> > + * ibm,ddw-extensions, which carries the rtas token for
> > + * ibm,reset-pe-dma-windows.
> > + * That rtas-call can be used to restore the default DMA window for the device.
> > + */
> > +static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> > +{
> > +	int ret;
> > +	u32 cfg_addr, ddw_ext[3];
> > +	u64 buid;
> > +	struct device_node *dn;
> > +	struct pci_dn *pdn;
> > +
> > +	ret = of_property_read_u32_array(par_dn, "ibm,ddw-extensions",
> > +					 &ddw_ext[0], 3);
> 
> s/3/2/ as for the reset extension you do not need the "64bit largest
> block" extension.

Sure, I will update this.

> 
> 
> > +	if (ret)
> > +		return;
> > +
> > +	dn = pci_device_to_OF_node(dev);
> > +	pdn = PCI_DN(dn);
> > +	buid = pdn->phb->buid;
> > +	cfg_addr = ((pdn->busno << 16) | (pdn->devfn << 8));
> > +
> > +	ret = rtas_call(ddw_ext[1], 3, 1, NULL, cfg_addr,
> 
> Here the "reset" extention is in ddw_ext[1]. Hm. 1/4 has a bug then.

Humm, in 1/4 I used dd_ext[0] (how many extensions) and ddw_ext[2] (64-
bit largest window count). I fail to see the bug here.

> And I am pretty sure it won't compile as reset_dma_window() is not used
> and it is static so fold it into one the next patches. Thanks,

Sure, I will do that. 
I was questioning myself about this and thought it would be better to
split for easier revision.

> 
> 
> > +			BUID_HI(buid), BUID_LO(buid));
> > +	if (ret)
> > +		dev_info(&dev->dev,
> > +			 "ibm,reset-pe-dma-windows(%x) %x %x %x returned %d ",
> > +			 ddw_ext[1], cfg_addr, BUID_HI(buid), BUID_LO(buid),
> > +			 ret);
> > +}
> > +
> >  /*
> >   * If the PE supports dynamic dma windows, and there is space for a table
> >   * that can map all pages in a linear offset, then setup such a table,
> > 

Best regards,
Leonardo


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox