LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] powerpc/vas: Fix IRQ name allocation
From: Cédric Le Goater @ 2020-12-12 14:27 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Sukadev Bhattiprolu, Haren Myneni, Cédric Le Goater

The VAS device allocates a generic interrupt to handle page faults but
the IRQ name doesn't show under /proc. This is because it's on
stack. Allocate the name.

Signed-off-by: Cédric Le Goater <clg@kaod.org>
---

 I didn't understand this part in init_vas_instance() :

	if (vinst->virq) {
		rc = vas_irq_fault_window_setup(vinst);
		/*
		 * Fault window is used only for user space send windows.
		 * So if vinst->virq is NULL, tx_win_open returns -ENODEV
		 * for user space.
		 */
		if (rc)
			vinst->virq = 0;
	}

 If the IRQ cannot be requested, the device probing should fail but
 it's not today. The use of 'vinst->virq' is suspicious.

 arch/powerpc/platforms/powernv/vas.h |  1 +
 arch/powerpc/platforms/powernv/vas.c | 11 ++++++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 70f793e8f6cc..c7db3190baca 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -340,6 +340,7 @@ struct vas_instance {
 	struct vas_window *rxwin[VAS_COP_TYPE_MAX];
 	struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
 
+	char *name;
 	char *dbgname;
 	struct dentry *dbgdir;
 };
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 598e4cd563fb..b65256a63e87 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -28,12 +28,10 @@ static DEFINE_PER_CPU(int, cpu_vas_id);
 
 static int vas_irq_fault_window_setup(struct vas_instance *vinst)
 {
-	char devname[64];
 	int rc = 0;
 
-	snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
 	rc = request_threaded_irq(vinst->virq, vas_fault_handler,
-				vas_fault_thread_fn, 0, devname, vinst);
+				vas_fault_thread_fn, 0, vinst->name, vinst);
 
 	if (rc) {
 		pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
@@ -80,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
 	if (!vinst)
 		return -ENOMEM;
 
+	vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+	if (!vinst->name) {
+		kfree(vinst);
+		return -ENOMEM;
+	}
+
 	INIT_LIST_HEAD(&vinst->node);
 	ida_init(&vinst->ida);
 	mutex_init(&vinst->mutex);
@@ -162,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
 	return 0;
 
 free_vinst:
+	kfree(vinst->name);
 	kfree(vinst);
 	return -ENODEV;
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 1/2] dmaengine: fsldma: Fix a resource leak in the remove function
From: Christophe JAILLET @ 2020-12-12 16:05 UTC (permalink / raw)
  To: leoyang.li, zw, vkoul, dan.j.williams, timur
  Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
	linux-kernel

A 'irq_dispose_mapping()' call is missing in the remove function.
Add it.

This is needed to undo the 'irq_of_parse_and_map() call from the probe
function and already part of the error handling path of the probe function.

It was added in the probe function only in commit d3f620b2c4fe ("fsldma:
simplify IRQ probing and handling")

Fixes: 77cd62e8082b ("fsldma: allow Freescale Elo DMA driver to be compiled as a module")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
 drivers/dma/fsldma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 0feb323bae1e..554f70a0c18c 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1314,6 +1314,7 @@ static int fsldma_of_remove(struct platform_device *op)
 		if (fdev->chan[i])
 			fsl_dma_chan_remove(fdev->chan[i]);
 	}
+	irq_dispose_mapping(fdev->irq);
 
 	iounmap(fdev->regs);
 	kfree(fdev);
-- 
2.27.0


^ permalink raw reply related

* [PATCH 2/2] dmaengine: fsldma: Fix a resource leak in an error handling path of the probe function
From: Christophe JAILLET @ 2020-12-12 16:06 UTC (permalink / raw)
  To: leoyang.li, zw, vkoul, dan.j.williams, iws
  Cc: dmaengine, kernel-janitors, Christophe JAILLET, linuxppc-dev,
	linux-kernel

In case of error, the previous 'fsl_dma_chan_probe()' calls must be undone
by some 'fsl_dma_chan_remove()', as already done in the remove function.

It was added in the remove function in commit 77cd62e8082b ("fsldma: allow
Freescale Elo DMA driver to be compiled as a module")

Fixes: d3f620b2c4fe ("fsldma: simplify IRQ probing and handling")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
---
Patch provided as-is.
I don't have the configuration to compile test this patch
---
 drivers/dma/fsldma.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index 554f70a0c18c..f8459cc5315d 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -1214,6 +1214,7 @@ static int fsldma_of_probe(struct platform_device *op)
 {
 	struct fsldma_device *fdev;
 	struct device_node *child;
+	unsigned int i;
 	int err;
 
 	fdev = kzalloc(sizeof(*fdev), GFP_KERNEL);
@@ -1292,6 +1293,10 @@ static int fsldma_of_probe(struct platform_device *op)
 	return 0;
 
 out_free_fdev:
+	for (i = 0; i < FSL_DMA_MAX_CHANS_PER_DEVICE; i++) {
+		if (fdev->chan[i])
+			fsl_dma_chan_remove(fdev->chan[i]);
+	}
 	irq_dispose_mapping(fdev->irq);
 	iounmap(fdev->regs);
 out_free:
-- 
2.27.0


^ permalink raw reply related

* [PATCH 2/3] kbuild: LD_VERSION redenomination
From: Masahiro Yamada @ 2020-12-12 16:54 UTC (permalink / raw)
  To: linux-kbuild
  Cc: Thomas Bogendoerfer, Dominique Martinet, linuxppc-dev,
	linux-kernel, Jiaxun Yang, linux-mips, Paul Mackerras,
	Catalin Marinas, Huacai Chen, Will Deacon, Masahiro Yamada,
	linux-arm-kernel
In-Reply-To: <20201212165431.150750-1-masahiroy@kernel.org>

Commit ccbef1674a15 ("Kbuild, lto: add ld-version and ld-ifversion
macros") introduced scripts/ld-version.sh for GCC LTO.

At that time, this script handled 5 version fields because GCC LTO
needed the downstream binutils. (https://lkml.org/lkml/2014/4/8/272)

The code snippet from the submitted patch was as follows:

    # We need HJ Lu's Linux binutils because mainline binutils does not
    # support mixing assembler and LTO code in the same ld -r object.
    # XXX check if the gcc plugin ld is the expected one too
    # XXX some Fedora binutils should also support it. How to check for that?
    ifeq ($(call ld-ifversion,-ge,22710001,y),y)
        ...

However, GCC LTO was not merged into the mainline after all.
(https://lkml.org/lkml/2014/4/8/272)

So, the 4th and 5th fields were never used, and finally removed by
commit 0d61ed17dd30 ("ld-version: Drop the 4th and 5th version
components").

Since then, the last 4-digits returned by this script is always zeros.

Remove the meaningless last 4-digits. This makes the version format
consistent with GCC_VERSION, CLANG_VERSION, LLD_VERSION.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---

 arch/arm64/Kconfig            | 2 +-
 arch/mips/loongson64/Platform | 2 +-
 arch/mips/vdso/Kconfig        | 2 +-
 arch/powerpc/Makefile         | 2 +-
 arch/powerpc/lib/Makefile     | 2 +-
 scripts/ld-version.sh         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a6b5b7ef40ae..69d56b21a6ec 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1499,7 +1499,7 @@ config ARM64_PTR_AUTH
 	depends on (CC_HAS_SIGN_RETURN_ADDRESS || CC_HAS_BRANCH_PROT_PAC_RET) && AS_HAS_PAC
 	# Modern compilers insert a .note.gnu.property section note for PAC
 	# which is only understood by binutils starting with version 2.33.1.
-	depends on LD_IS_LLD || LD_VERSION >= 233010000 || (CC_IS_GCC && GCC_VERSION < 90100)
+	depends on LD_IS_LLD || LD_VERSION >= 23301 || (CC_IS_GCC && GCC_VERSION < 90100)
 	depends on !CC_IS_CLANG || AS_HAS_CFI_NEGATE_RA_STATE
 	depends on (!FUNCTION_GRAPH_TRACER || DYNAMIC_FTRACE_WITH_REGS)
 	help
diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform
index ec42c5085905..cc0b9c87f9ad 100644
--- a/arch/mips/loongson64/Platform
+++ b/arch/mips/loongson64/Platform
@@ -35,7 +35,7 @@ cflags-$(CONFIG_CPU_LOONGSON64)	+= $(call as-option,-Wa$(comma)-mno-fix-loongson
 # can't easily be used safely within the kbuild framework.
 #
 ifeq ($(call cc-ifversion, -ge, 0409, y), y)
-  ifeq ($(call ld-ifversion, -ge, 225000000, y), y)
+  ifeq ($(call ld-ifversion, -ge, 22500, y), y)
     cflags-$(CONFIG_CPU_LOONGSON64)  += \
       $(call cc-option,-march=loongson3a -U_MIPS_ISA -D_MIPS_ISA=_MIPS_ISA_MIPS64)
   else
diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig
index 7aec721398d5..a665f6108cb5 100644
--- a/arch/mips/vdso/Kconfig
+++ b/arch/mips/vdso/Kconfig
@@ -12,7 +12,7 @@
 # the lack of relocations. As such, we disable the VDSO for microMIPS builds.
 
 config MIPS_LD_CAN_LINK_VDSO
-	def_bool LD_VERSION >= 225000000 || LD_IS_LLD
+	def_bool LD_VERSION >= 22500 || LD_IS_LLD
 
 config MIPS_DISABLE_VDSO
 	def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO)
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 5c8c06215dd4..6a9a852c3d56 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -65,7 +65,7 @@ UTS_MACHINE := $(subst $(space),,$(machine-y))
 ifdef CONFIG_PPC32
 KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
 else
-ifeq ($(call ld-ifversion, -ge, 225000000, y),y)
+ifeq ($(call ld-ifversion, -ge, 22500, y),y)
 # Have the linker provide sfpr if possible.
 # There is a corresponding test in arch/powerpc/lib/Makefile
 KBUILD_LDFLAGS_MODULE += --save-restore-funcs
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 69a91b571845..d4efc182662a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -31,7 +31,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
 # so it is only needed for modules, and only for older linkers which
 # do not support --save-restore-funcs
-ifeq ($(call ld-ifversion, -lt, 225000000, y),y)
+ifeq ($(call ld-ifversion, -lt, 22500, y),y)
 extra-$(CONFIG_PPC64)	+= crtsavres.o
 endif
 
diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh
index f2be0ff9a738..0f8a2c0f9502 100755
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -6,6 +6,6 @@
 	gsub(".*version ", "");
 	gsub("-.*", "");
 	split($1,a, ".");
-	print a[1]*100000000 + a[2]*1000000 + a[3]*10000;
+	print a[1]*10000 + a[2]*100 + a[3];
 	exit
 	}
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH AUTOSEL 5.9 27/39] sched/idle: Fix arch_cpu_idle() vs tracing
From: Sasha Levin @ 2020-12-13 14:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mark Rutland, uclinux-h8-devel, linux-ia64, linux-parisc,
	linux-s390, linux-hexagon, Heiko Carstens, linux-sh, linux-um,
	linux-kernel, stable, linux-mips, openrisc, linux-csky,
	Sven Schnelle, linux-alpha, sparclinux, linux-riscv, linuxppc-dev,
	linux-arm-kernel
In-Reply-To: <20201203171015.GN2414@hirez.programming.kicks-ass.net>

On Thu, Dec 03, 2020 at 06:10:15PM +0100, Peter Zijlstra wrote:
>On Thu, Dec 03, 2020 at 03:54:42PM +0100, Heiko Carstens wrote:
>> On Thu, Dec 03, 2020 at 08:28:21AM -0500, Sasha Levin wrote:
>> > From: Peter Zijlstra <peterz@infradead.org>
>> >
>> > [ Upstream commit 58c644ba512cfbc2e39b758dd979edd1d6d00e27 ]
>> >
>> > We call arch_cpu_idle() with RCU disabled, but then use
>> > local_irq_{en,dis}able(), which invokes tracing, which relies on RCU.
>> >
>> > Switch all arch_cpu_idle() implementations to use
>> > raw_local_irq_{en,dis}able() and carefully manage the
>> > lockdep,rcu,tracing state like we do in entry.
>> >
>> > (XXX: we really should change arch_cpu_idle() to not return with
>> > interrupts enabled)
>> >
>> > Reported-by: Sven Schnelle <svens@linux.ibm.com>
>> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
>> > Reviewed-by: Mark Rutland <mark.rutland@arm.com>
>> > Tested-by: Mark Rutland <mark.rutland@arm.com>
>> > Link: https://lkml.kernel.org/r/20201120114925.594122626@infradead.org
>> > Signed-off-by: Sasha Levin <sashal@kernel.org>
>>
>> This patch broke s390 irq state tracing. A patch to fix this is
>> scheduled to be merged upstream today (hopefully).
>> Therefore I think this patch should not yet go into 5.9 stable.
>
>Agreed.

I'll also grab b1cae1f84a0f ("s390: fix irq state tracing"). Thanks!

-- 
Thanks,
Sasha

^ permalink raw reply

* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geoff Levand @ 2020-12-13 19:38 UTC (permalink / raw)
  To: Vincent Stehlé, linuxppc-dev, linux-kernel; +Cc: Geert Uytterhoeven
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>

On 12/13/20 10:26 AM, Vincent Stehlé wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
> 
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
> Cc: Geoff Levand <geoff@infradead.org>
> Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
> ---
>  drivers/ps3/ps3stor_lib.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Looks good.  Thanks for submitting.

Acked by: Geoff Levand <geoff@infradead.org>

^ permalink raw reply

* Re: [PATCH] powerpc/ps3: use dma_mapping_error()
From: Geert Uytterhoeven @ 2020-12-13 19:39 UTC (permalink / raw)
  To: Vincent Stehlé
  Cc: Geoff Levand, Geert Uytterhoeven, linuxppc-dev,
	Linux Kernel Mailing List
In-Reply-To: <20201213182622.23047-1-vincent.stehle@laposte.net>

On Sun, Dec 13, 2020 at 8:06 PM Vincent Stehlé
<vincent.stehle@laposte.net> wrote:
> The DMA address returned by dma_map_single() should be checked with
> dma_mapping_error(). Fix the ps3stor_setup() function accordingly.
>
> Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
> Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>

Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* [PATCH] powerpc: fix alignment bug whithin the init sections
From: Ariel Marcovitch @ 2020-12-13 18:35 UTC (permalink / raw)
  To: mpe; +Cc: paulus, linuxppc-dev, Ariel Marcovitch, linux-kernel

This is a bug that can cause early crashes in configurations with a
.exit.text section smaller than a page and a .init.text section that
ends in the beginning of a physical page (this is kinda random, which
might explain why this wasn't really encountered before).

The init sections are ordered like this:
	.init.text
	.exit.text
	.init.data

Currently, these sections aren't page aligned.

Because the init code is mapped read-only at runtime and because the
.init.text section can potentially reside on the same physical page as
.init.data, the beginning of .init.data might be mapped read-only along
with .init.text.

Then when the kernel tries to modify a variable in .init.data (like
kthreadd_done, used in kernel_init()) the kernel panics.

To avoid this, I made these sections page aligned.

Fixes: 060ef9d89d18 ("powerpc32: PAGE_EXEC required for inittext")
Signed-off-by: Ariel Marcovitch <ariel.marcovitch@gmail.com>
---
 arch/powerpc/kernel/vmlinux.lds.S | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 326e113d2e45..e3a7c90c03f4 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -179,6 +179,11 @@ SECTIONS
 #endif
 	} :text
 
+	/* .init.text is made RO and .exit.text is not, so we must
+	 * ensure these sections reside in separate physical pages.
+	 */
+	. = ALIGN(PAGE_SIZE);
+
 	/* .exit.text is discarded at runtime, not link time,
 	 * to deal with references from __bug_table
 	 */
@@ -186,6 +191,8 @@ SECTIONS
 		EXIT_TEXT
 	}
 
+	. = ALIGN(PAGE_SIZE);
+
 	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
 		INIT_DATA
 	}

base-commit: 1398820fee515873379809a6415930ad0764b2f6
-- 
2.17.1


^ permalink raw reply related

* [PATCH] powerpc/ps3: use dma_mapping_error()
From: Vincent Stehlé @ 2020-12-13 18:26 UTC (permalink / raw)
  To: linuxppc-dev, linux-kernel
  Cc: Geoff Levand, Geert Uytterhoeven, Vincent Stehlé

The DMA address returned by dma_map_single() should be checked with
dma_mapping_error(). Fix the ps3stor_setup() function accordingly.

Fixes: 80071802cb9c ("[POWERPC] PS3: Storage Driver Core")
Signed-off-by: Vincent Stehlé <vincent.stehle@laposte.net>
Cc: Geoff Levand <geoff@infradead.org>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
---
 drivers/ps3/ps3stor_lib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ps3/ps3stor_lib.c b/drivers/ps3/ps3stor_lib.c
index 333ba83006e48..a12a1ad9b5fe3 100644
--- a/drivers/ps3/ps3stor_lib.c
+++ b/drivers/ps3/ps3stor_lib.c
@@ -189,7 +189,7 @@ int ps3stor_setup(struct ps3_storage_device *dev, irq_handler_t handler)
 	dev->bounce_lpar = ps3_mm_phys_to_lpar(__pa(dev->bounce_buf));
 	dev->bounce_dma = dma_map_single(&dev->sbd.core, dev->bounce_buf,
 					 dev->bounce_size, DMA_BIDIRECTIONAL);
-	if (!dev->bounce_dma) {
+	if (dma_mapping_error(&dev->sbd.core, dev->bounce_dma)) {
 		dev_err(&dev->sbd.core, "%s:%u: map DMA region failed\n",
 			__func__, __LINE__);
 		error = -ENODEV;
-- 
2.29.2


^ permalink raw reply related

* Re: [PATCH v12 00/31] Speculative page faults
From: Joel Fernandes @ 2020-12-14  2:03 UTC (permalink / raw)
  To: Chinwen Chang
  Cc: jack, sergey.senozhatsky.work, peterz, Will Deacon, mhocko,
	linux-mm, paulus, Punit Agrawal, hpa, Michel Lespinasse,
	Alexei Starovoitov, Andrea Arcangeli, ak, Minchan Kim,
	aneesh.kumar, x86, Matthew Wilcox, Daniel Jordan, Ingo Molnar,
	zhong jiang, David Rientjes, paulmck, npiggin, sj38.park,
	Jerome Glisse, dave, kemi.wang, kirill, Thomas Gleixner,
	Laurent Dufour, Haiyan Song, Ganesh Mahendran, Yang Shi,
	Mike Rapoport, linuxppc-dev, linux-kernel, Sergey Senozhatsky,
	miles.chen, vinayak menon, akpm, Tim Chen, haren
In-Reply-To: <1594099897.30360.58.camel@mtkswgap22>

On Tue, Jul 07, 2020 at 01:31:37PM +0800, Chinwen Chang wrote:
[..]
> > > Hi Laurent,
> > > 
> > > We merged SPF v11 and some patches from v12 into our platforms. After
> > > several experiments, we observed SPF has obvious improvements on the
> > > launch time of applications, especially for those high-TLP ones,
> > > 
> > > # launch time of applications(s):
> > > 
> > > package           version      w/ SPF      w/o SPF      improve(%)
> > > ------------------------------------------------------------------
> > > Baidu maps        10.13.3      0.887       0.98         9.49
> > > Taobao            8.4.0.35     1.227       1.293        5.10
> > > Meituan           9.12.401     1.107       1.543        28.26
> > > WeChat            7.0.3        2.353       2.68         12.20
> > > Honor of Kings    1.43.1.6     6.63        6.713        1.24
> > 
> > That's great news, thanks for reporting this!
> > 
> > > 
> > > By the way, we have verified our platforms with those patches and
> > > achieved the goal of mass production.
> > 
> > Another good news!
> > For my information, what is your targeted hardware?
> > 
> > Cheers,
> > Laurent.
> 
> Hi Laurent,
> 
> Our targeted hardware belongs to ARM64 multi-core series.

Hello!

I was trying to develop an intuition about why does SPF give improvement for
you on small CPU systems. This is just a high-level theory but:

1. Assume the improvement is because of elimination of "blocking" on
mmap_sem.
Could it be that the mmap_sem is acquired in write-mode unnecessarily in some
places, thus causing blocking on mmap_sem in other paths? If so, is it
feasible to convert such usages to acquiring them in read-mode?

2. Assume the improvement is because of lesser read-side contention on
mmap_sem.
On small CPU systems, I would not expect reducing cache-line bouncing to give
such a dramatic improvement in performance as you are seeing.

Thanks for any insight on this!

- Joel


^ permalink raw reply

* [powerpc:next] BUILD SUCCESS WITH WARNING dddc4ef92d1ce92987da1d6926cdfa99e8acb622
From: kernel test robot @ 2020-12-14  2:54 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next
branch HEAD: dddc4ef92d1ce92987da1d6926cdfa99e8acb622  KVM: PPC: Book3S HV: XIVE: Add a comment regarding VP numbering

Warning reports:

https://lore.kernel.org/linuxppc-dev/202012042220.zO7hSFT2-lkp@intel.com

Warning in current branch:

arch/powerpc/kernel/vdso32/vgettimeofday.c:13:5: warning: no previous prototype for function '__c_kernel_clock_gettime64' [-Wmissing-prototypes]

Warning ids grouped by kconfigs:

clang_recent_errors
`-- powerpc64-randconfig-r025-20201213
    `-- arch-powerpc-kernel-vdso32-vgettimeofday.c:warning:no-previous-prototype-for-function-__c_kernel_clock_gettime64

elapsed time: 1767m

configs tested: 138
configs skipped: 48

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
nios2                            alldefconfig
sh                          polaris_defconfig
m68k                        mvme16x_defconfig
openrisc                            defconfig
mips                        qi_lb60_defconfig
arm                           tegra_defconfig
mips                           ip27_defconfig
s390                             alldefconfig
mips                       bmips_be_defconfig
arm                          exynos_defconfig
arm                          collie_defconfig
arc                          axs101_defconfig
sh                          urquell_defconfig
sh                        dreamcast_defconfig
mips                          rm200_defconfig
sh                               j2_defconfig
powerpc                     pseries_defconfig
mips                        bcm63xx_defconfig
mips                        nlm_xlp_defconfig
arm                        shmobile_defconfig
powerpc                 linkstation_defconfig
sh                        apsh4ad0a_defconfig
arc                           tb10x_defconfig
riscv                             allnoconfig
arm                        multi_v5_defconfig
sh                                  defconfig
powerpc                      pcm030_defconfig
m68k                          atari_defconfig
mips                      loongson3_defconfig
arm                           efm32_defconfig
arm                      footbridge_defconfig
powerpc                      ppc6xx_defconfig
powerpc                    amigaone_defconfig
powerpc                       ebony_defconfig
x86_64                              defconfig
alpha                            allyesconfig
mips                      maltasmvp_defconfig
mips                malta_kvm_guest_defconfig
c6x                        evmc6678_defconfig
powerpc                     rainier_defconfig
powerpc                          g5_defconfig
mips                            e55_defconfig
powerpc                      pmac32_defconfig
sh                          lboxre2_defconfig
sh                           se7206_defconfig
i386                             alldefconfig
xtensa                  audio_kc705_defconfig
arc                 nsimosci_hs_smp_defconfig
powerpc                 mpc832x_mds_defconfig
powerpc                   currituck_defconfig
arm                          pxa910_defconfig
arm                          imote2_defconfig
h8300                       h8s-sim_defconfig
microblaze                      mmu_defconfig
sh                           se7724_defconfig
m68k                         apollo_defconfig
mips                       capcella_defconfig
powerpc                mpc7448_hpc2_defconfig
m68k                          hp300_defconfig
powerpc                    klondike_defconfig
xtensa                    smp_lx200_defconfig
sparc64                             defconfig
ia64                         bigsur_defconfig
parisc                generic-32bit_defconfig
csky                                defconfig
arm                       multi_v4t_defconfig
sh                         ecovec24_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
alpha                               defconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                               tinyconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a003-20201213
x86_64               randconfig-a006-20201213
x86_64               randconfig-a002-20201213
x86_64               randconfig-a005-20201213
x86_64               randconfig-a004-20201213
x86_64               randconfig-a001-20201213
i386                 randconfig-a001-20201213
i386                 randconfig-a004-20201213
i386                 randconfig-a003-20201213
i386                 randconfig-a002-20201213
i386                 randconfig-a005-20201213
i386                 randconfig-a006-20201213
i386                 randconfig-a014-20201213
i386                 randconfig-a013-20201213
i386                 randconfig-a012-20201213
i386                 randconfig-a011-20201213
i386                 randconfig-a016-20201213
i386                 randconfig-a015-20201213
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a016-20201213
x86_64               randconfig-a012-20201213
x86_64               randconfig-a013-20201213
x86_64               randconfig-a015-20201213
x86_64               randconfig-a014-20201213
x86_64               randconfig-a011-20201213

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14  4:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrV5BzXuUYm5YAoEKPZZPfLrbHckvwBHzWKrxZS8hqzHEg@mail.gmail.com>

Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>
> 
>> I'm still going to persue shoot-lazies for the merge window. As you
>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>> Your change is common code, but a significant complexity (which
>> affects all archs) so needs a lot more review and testing at this
>> point.
> 
> I don't think it's ready for this merge window.

Yes next one I meant (aka this one for development perspective :)).

> I read the early
> patches again, and I think they make the membarrier code worse, not
> better.

Mathieu and I disagree, so we are at an impasse. I addressed your 
comment about not being able to do the additional core sync avoidance 
from the exit tlb call (you can indeed do so in your arch code) and 
about exit_lazy_tlb being a call into the scheduler (it's not) and
about the arch code not being able to reconcile lazy tlb mm with the
core scheduler code (you can).

I fundamentally think the core sync is an issue with what the membarrier
/ arch specifics are doing with lazy tlb mm switching, and not something
the core scheduler needs to know about at all. I don't see the big
problem with essentially moving it from an explicit call to 
exit_lazy_tlb (which from scheduler POV describes better what it is 
doing, not how).

> I'm not fundamentally opposed to the shoot-lazies concept,
> but it needs more thought and it needs a cleaner foundation.

Well shoot lazies actually doesn't really rely on that membarrier
change at all, it just came as a nice looking cleanup so that part
can be dropped from the series. It's not really foundational.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 2/2] powerpc/64s: Trim offlined CPUs from mm_cpumasks
From: Nicholas Piggin @ 2020-12-14  4:15 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Peter Zijlstra, Aneesh Kumar K.V, Linux Kernel Mailing List,
	Anton Vorontsov, Thomas Gleixner, linuxppc-dev
In-Reply-To: <CAMuHMdUdorW03=mipgm92SXNPBZO5owW1Wp6_SacRDZ7fOe9gw@mail.gmail.com>

Excerpts from Geert Uytterhoeven's message of December 10, 2020 7:06 pm:
> Hi Nicholas,
> 
> On Fri, Nov 20, 2020 at 4:01 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> When offlining a CPU, powerpc/64s does not flush TLBs, rather it just
>> leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs
>> to manage its TLBs.
>>
>> However the exit_flush_lazy_tlbs() function expects that after
>> returning, all CPUs (except self) have flushed TLBs for that mm, in
>> which case TLBIEL can be used for this flush. This breaks for offline
>> CPUs because they don't get the IPI to flush their TLB. This can lead
>> to stale translations.
>>
>> Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs
>> before going offline.
>>
>> These offlined CPU bits stuck in the cpumask also prevents the cpumask
>> from being trimmed back to local mode, which means continual broadcast
>> IPIs or TLBIEs are needed for TLB flushing. This patch prevents that
>> situation too.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> 
> Thanks for your patch!
> 
>> --- a/arch/powerpc/platforms/powermac/smp.c
>> +++ b/arch/powerpc/platforms/powermac/smp.c
>> @@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void)
>>
>>         mpic_cpu_set_priority(0xf);
>>
>> +       cleanup_cpu_mmu_context();
>> +
> 
> I guess this change broke pmac32_defconfig+SMP in v5.10-rc7?
> 
> arch/powerpc/platforms/powermac/smp.c: error: implicit
> declaration of function 'cleanup_cpu_mmu_context'
> [-Werror=implicit-function-declaration]:  => 914:2
> 
> http://kisskb.ellerman.id.au/kisskb/buildresult/14423174/

Hey, yeah it does thanks for catching it. This patch fixes it for me

---
From a9b5ec92ffac975e81c6d7db6ff2b1486b2723f7 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 14 Dec 2020 13:52:39 +1000
Subject: [PATCH] powerpc/32s: Fix cleanup_cpu_mmu_context() compile bug

32s has no tlbiel_all() defined, so just disable the cleanup with a
comment.

Fixes: 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from mm_cpumasks")
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/platforms/powermac/smp.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index adae2a6712e1..66ef5f8f4445 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -911,7 +911,16 @@ static int smp_core99_cpu_disable(void)
 
 	mpic_cpu_set_priority(0xf);
 
+	/*
+	 * Would be nice for consistency if all platforms clear mm_cpumask and
+	 * flush TLBs on unplug, but the TLB invalidation bug described in
+	 * commit 01b0f0eae081 ("powerpc/64s: Trim offlined CPUs from
+	 * mm_cpumasks") only applies to 64s and for now we only have the TLB
+	 * flush code for that platform.
+	 */
+#ifdef CONFIG_PPC64
 	cleanup_cpu_mmu_context();
+#endif
 
 	return 0;
 }
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH 2/8] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-12-14  5:53 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <1607918323.6muyu2l982.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of December 14, 2020 2:07 pm:
> Excerpts from Andy Lutomirski's message of December 11, 2020 10:11 am:
>>> On Dec 5, 2020, at 7:59 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>
>> 
>>> I'm still going to persue shoot-lazies for the merge window. As you
>>> see it's about a dozen lines and a if (IS_ENABLED(... in core code.
>>> Your change is common code, but a significant complexity (which
>>> affects all archs) so needs a lot more review and testing at this
>>> point.
>> 
>> I don't think it's ready for this merge window.
> 
> Yes next one I meant (aka this one for development perspective :)).
> 
>> I read the early
>> patches again, and I think they make the membarrier code worse, not
>> better.
> 
> Mathieu and I disagree, so we are at an impasse.

Well actually not really, I went and cut out the exit_lazy_tlb stuff
from the patch series, those are better to be untangled anyway. I think 
an earlier version had something in exit_lazy_tlb for the mm refcounting 
change but it's not required now anyway.

I'll split them out and just work on the shoot lazies series for now, I
might revisit exit_lazy_tlb after the dust settles from that and the
current membarrier changes. I'll test and repost shortly.

Thanks,
Nick

^ permalink raw reply

* [PATCH v2 0/5] shoot lazy tlbs
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev

This is another rebase, on top of mainline now (don't need the
asm-generic tree), and without any x86 or membarrier changes.
This makes the series far smaller and more manageable and
without the controversial bits.

Thanks,
Nick

Nicholas Piggin (5):
  lazy tlb: introduce lazy mm refcount helper functions
  lazy tlb: allow lazy tlb mm switching to be configurable
  lazy tlb: shoot lazies, a non-refcounting lazy tlb option
  powerpc: use lazy mm refcount helper functions
  powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN

 arch/Kconfig                         | 30 ++++++++++
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/Kconfig                 |  1 +
 arch/powerpc/kernel/smp.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 +-
 fs/exec.c                            |  4 +-
 include/linux/sched/mm.h             | 20 +++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/fork.c                        | 52 ++++++++++++++++
 kernel/kthread.c                     | 11 ++--
 kernel/sched/core.c                  | 88 ++++++++++++++++++++--------
 kernel/sched/sched.h                 |  4 +-
 13 files changed, 184 insertions(+), 38 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH v2 1/5] lazy tlb: introduce lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Add explicit _lazy_tlb annotated functions for lazy mm refcounting.
This makes things a bit more explicit, and allows explicit refcounting
to be removed if it is not used.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm/mach-rpc/ecard.c            |  2 +-
 arch/powerpc/mm/book3s64/radix_tlb.c |  4 ++--
 fs/exec.c                            |  4 ++--
 include/linux/sched/mm.h             | 11 +++++++++++
 kernel/cpu.c                         |  2 +-
 kernel/exit.c                        |  2 +-
 kernel/kthread.c                     | 11 +++++++----
 kernel/sched/core.c                  | 15 ++++++++-------
 8 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/arch/arm/mach-rpc/ecard.c b/arch/arm/mach-rpc/ecard.c
index 827b50f1c73e..1b4a41aad793 100644
--- a/arch/arm/mach-rpc/ecard.c
+++ b/arch/arm/mach-rpc/ecard.c
@@ -253,7 +253,7 @@ static int ecard_init_mm(void)
 	current->mm = mm;
 	current->active_mm = mm;
 	activate_mm(active_mm, mm);
-	mmdrop(active_mm);
+	mmdrop_lazy_tlb(active_mm);
 	ecard_init_pgtables(mm);
 	return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index b487b489d4b6..74708aef333e 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -658,10 +658,10 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	if (current->active_mm == mm) {
 		WARN_ON_ONCE(current->mm != NULL);
 		/* Is a kernel thread and is using mm as the lazy tlb */
-		mmgrab(&init_mm);
+		mmgrab_lazy_tlb(&init_mm);
 		current->active_mm = &init_mm;
 		switch_mm_irqs_off(mm, &init_mm, current);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
 
 	atomic_dec(&mm->context.active_cpus);
diff --git a/fs/exec.c b/fs/exec.c
index 547a2390baf5..56fc23dcbe4d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1028,9 +1028,9 @@ static int exec_mmap(struct mm_struct *mm)
 		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
 		mm_update_next_owner(old_mm);
 		mmput(old_mm);
-		return 0;
+	} else {
+		mmdrop_lazy_tlb(active_mm);
 	}
-	mmdrop(active_mm);
 	return 0;
 }
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..94a117160083 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm)
 		__mmdrop(mm);
 }
 
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+	mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+	mmdrop(mm);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..a54cdfa08d71 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -576,7 +576,7 @@ static int finish_cpu(unsigned int cpu)
 	 */
 	if (mm != &init_mm)
 		idle->active_mm = &init_mm;
-	mmdrop(mm);
+	mmdrop_lazy_tlb(mm);
 	return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3711a74fcf4a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void exit_mm(void)
 		__set_current_state(TASK_RUNNING);
 		mmap_read_lock(mm);
 	}
-	mmgrab(mm);
+	mmgrab_lazy_tlb(mm);
 	BUG_ON(mm != current->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(current);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..da189e0d26ed 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1240,14 +1240,14 @@ void kthread_use_mm(struct mm_struct *mm)
 	WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
 	WARN_ON_ONCE(tsk->mm);
 
+	mmgrab(mm);
+
 	task_lock(tsk);
 	/* Hold off tlb flush IPIs while switching mm's */
 	local_irq_disable();
 	active_mm = tsk->active_mm;
-	if (active_mm != mm) {
-		mmgrab(mm);
+	if (active_mm != mm)
 		tsk->active_mm = mm;
-	}
 	tsk->mm = mm;
 	switch_mm_irqs_off(active_mm, mm, tsk);
 	local_irq_enable();
@@ -1257,7 +1257,7 @@ void kthread_use_mm(struct mm_struct *mm)
 #endif
 
 	if (active_mm != mm)
-		mmdrop(active_mm);
+		mmdrop_lazy_tlb(active_mm);
 
 	to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1280,10 +1280,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
 	sync_mm_rss(mm);
 	local_irq_disable();
 	tsk->mm = NULL;
+	mmgrab_lazy_tlb(mm);
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
 	local_irq_enable();
 	task_unlock(tsk);
+
+	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(kthread_unuse_mm);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e7e453492cff..c2f8ea43d29b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3629,13 +3629,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * rq->curr, before returning to userspace, so provide them here:
 	 *
 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-	 *   provided by mmdrop(),
+	 *   provided by mmdrop_lazy_tlb(),
 	 * - a sync_core for SYNC_CORE.
 	 */
 	if (mm) {
 		membarrier_mm_sync_core_before_usermode(mm);
-		mmdrop(mm);
+		mmdrop_lazy_tlb(mm);
 	}
+
 	if (unlikely(prev_state == TASK_DEAD)) {
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);
@@ -3739,9 +3740,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	/*
 	 * kernel -> kernel   lazy + transfer active
-	 *   user -> kernel   lazy + mmgrab() active
+	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
 	 *
-	 * kernel ->   user   switch + mmdrop() active
+	 * kernel ->   user   switch + mmdrop_lazy_tlb() active
 	 *   user ->   user   switch
 	 */
 	if (!next->mm) {                                // to kernel
@@ -3749,7 +3750,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 		next->active_mm = prev->active_mm;
 		if (prev->mm)                           // from user
-			mmgrab(prev->active_mm);
+			mmgrab_lazy_tlb(prev->active_mm);
 		else
 			prev->active_mm = NULL;
 	} else {                                        // to user
@@ -3765,7 +3766,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop() in finish_task_switch(). */
+			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
 			rq->prev_mm = prev->active_mm;
 			prev->active_mm = NULL;
 		}
@@ -7206,7 +7207,7 @@ void __init sched_init(void)
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 2/5] lazy tlb: allow lazy tlb mm switching to be configurable
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Add CONFIG_MMU_LAZY_TLB which can be configured out to disable
the lazy tlb mechanism entirely, and switches to init_mm when
switching to a kernel thread.

NOMMU systems could easily go without this and save a bit of code
and the refcount atomics, because their mm switch is a no-op. They
have not been switched over by default because the arch code needs
to be audited and tested for lazy tlb mm refcounting and converted
to _lazy_tlb refcounting if necessary.

CONFIG_MMU_LAZY_TLB_REFCOUNT is also added, but it must always
be enabled if CONFIG_MMU_LAZY_TLB is enabled until the next patch
which provides an alternate scheme.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig             | 17 +++++++++
 include/linux/sched/mm.h | 13 +++++--
 kernel/sched/core.c      | 75 ++++++++++++++++++++++++++++++----------
 kernel/sched/sched.h     |  4 ++-
 4 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..84faaba66364 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -430,6 +430,23 @@ config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	  irqs disabled over activate_mm. Architectures that do IPI based TLB
 	  shootdowns should enable this.
 
+# Should make this depend on MMU, because there is little use for lazy mm switching
+# with NOMMU. Must audit NOMMU architecture code for lazy mm refcounting first.
+config MMU_LAZY_TLB
+	def_bool y
+	help
+	  Enable "lazy TLB" mmu context switching for kernel threads.
+	  If this is disabled then switching to a kernel thread always
+	  switches to init_mm. If mm switches are inexpensive or free
+	  (in the case of NOMMU) then this could be disabled.
+
+config MMU_LAZY_TLB_REFCOUNT
+	def_bool y
+	depends on MMU_LAZY_TLB
+	help
+	  This must be enabled if MMU_LAZY_TLB is enabled until the next
+	  patch.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 94a117160083..5edf8e942c84 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -52,12 +52,21 @@ static inline void mmdrop(struct mm_struct *mm)
 /* Helpers for lazy TLB mm refcounting */
 static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
 {
-	mmgrab(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
+		mmgrab(mm);
 }
 
 static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
 {
-	mmdrop(mm);
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
+		mmdrop(mm);
+	} else {
+		/*
+		 * mmdrop_lazy_tlb must provide a full memory barrier, see the
+		 * membarrier comment finish_task_switch which relies on this.
+		 */
+		smp_mb();
+	}
 }
 
 /**
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2f8ea43d29b..9c1dc9406e4b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3579,7 +3579,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
-	struct mm_struct *mm = rq->prev_mm;
+	struct mm_struct *mm = NULL;
 	long prev_state;
 
 	/*
@@ -3598,7 +3598,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		      current->comm, current->pid, preempt_count()))
 		preempt_count_set(FORK_PREEMPT_COUNT);
 
-	rq->prev_mm = NULL;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	mm = rq->prev_lazy_mm;
+	rq->prev_lazy_mm = NULL;
+#endif
 
 	/*
 	 * A task struct has one reference for the use as "current".
@@ -3722,22 +3725,10 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	calculate_sigpending();
 }
 
-/*
- * context_switch - switch to the new MM and the new thread's register state.
- */
-static __always_inline struct rq *
-context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next, struct rq_flags *rf)
+static __always_inline void
+context_switch_mm(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
 {
-	prepare_task_switch(rq, prev, next);
-
-	/*
-	 * For paravirt, this is coupled with an exit in switch_to to
-	 * combine the page table reload and the switch backend into
-	 * one hypercall.
-	 */
-	arch_start_context_switch(prev);
-
 	/*
 	 * kernel -> kernel   lazy + transfer active
 	 *   user -> kernel   lazy + mmgrab_lazy_tlb() active
@@ -3766,11 +3757,57 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {                        // from kernel
-			/* will mmdrop_lazy_tlb() in finish_task_switch(). */
-			rq->prev_mm = prev->active_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+			/* Will mmdrop_lazy_tlb() in finish_task_switch(). */
+			rq->prev_lazy_mm = prev->active_mm;
 			prev->active_mm = NULL;
+#else
+			/*
+			 * Without MMU_LAZY_REFCOUNT there is no lazy
+			 * tracking (because no rq->prev_lazy_mm) in
+			 * finish_task_switch, so no mmdrop_lazy_tlb(),
+			 * so no memory barrier for membarrier (see the
+			 * membarrier comment in finish_task_switch()).
+			 * Do it here.
+			 */
+			smp_mb();
+#endif
 		}
 	}
+}
+
+static __always_inline void
+context_switch_mm_nolazy(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	if (!next->mm)
+		next->active_mm = &init_mm;
+	membarrier_switch_mm(rq, prev->active_mm, next->active_mm);
+	switch_mm_irqs_off(prev->active_mm, next->active_mm, next);
+	if (!prev->mm)
+		prev->active_mm = NULL;
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline struct rq *
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next, struct rq_flags *rf)
+{
+	prepare_task_switch(rq, prev, next);
+
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB))
+		context_switch_mm(rq, prev, next);
+	else
+		context_switch_mm_nolazy(rq, prev, next);
 
 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index df80bfcea92e..3b72aec5a2f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -950,7 +950,9 @@ struct rq {
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-	struct mm_struct	*prev_mm;
+#ifdef CONFIG_MMU_LAZY_TLB_REFCOUNT
+	struct mm_struct	*prev_lazy_mm;
+#endif
 
 	unsigned int		clock_update_flags;
 	u64			clock;
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

On big systems, the mm refcount can become highly contented when doing
a lot of context switching with threaded applications (particularly
switching between the idle thread and an application thread).

Abandoning lazy tlb slows switching down quite a bit in the important
user->idle->user cases, so instead implement a non-refcounted scheme
that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
any remaining lazy ones.

Shootdown IPIs are some concern, but they have not been observed to be
a big problem with this scheme (the powerpc implementation generated
314 additional interrupts on a 144 CPU system during a kernel compile).
There are a number of strategies that could be employed to reduce IPIs
if they turn out to be a problem for some workload.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig  | 17 +++++++++++++++--
 kernel/fork.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 84faaba66364..e69c974369cc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -443,9 +443,22 @@ config MMU_LAZY_TLB
 config MMU_LAZY_TLB_REFCOUNT
 	def_bool y
 	depends on MMU_LAZY_TLB
+	depends on !MMU_LAZY_TLB_SHOOTDOWN
 	help
-	  This must be enabled if MMU_LAZY_TLB is enabled until the next
-	  patch.
+	  This refcounts the mm that is used as the lazy TLB mm when switching
+	  switching to a kernel thread.
+
+config MMU_LAZY_TLB_SHOOTDOWN
+	bool
+	depends on MMU_LAZY_TLB
+	help
+	  Instead of refcounting the "lazy tlb" mm struct, which can cause
+	  contention with multi-threaded apps on large multiprocessor systems,
+	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
+	  switch to init_mm if they were using the to-be-freed mm as the lazy
+	  tlb. To implement this, architectures must use _lazy_tlb variants of
+	  mm refcounting, and mm_cpumask must include at least all possible
+	  CPUs in which mm might be lazy.
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..74b972d2d8a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,6 +669,53 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+}
+
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void shoot_lazy_tlbs(struct mm_struct *mm)
+{
+	if (IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * IPI overheads have not found to be expensive, but they could
+		 * be reduced in a number of possible ways, for example (in
+		 * roughly increasing order of complexity):
+		 * - A batch of mms requiring IPIs could be gathered and freed
+		 *   at once.
+		 * - CPUs could store their active mm somewhere that can be
+		 *   remotely checked without a lock, to filter out
+		 *   false-positives in the cpumask.
+		 * - After mm_users or mm_count reaches zero, switching away
+		 *   from the mm could clear mm_cpumask to reduce some IPIs
+		 *   (some batching or delaying would help).
+		 * - A delayed freeing and RCU-like quiescing sequence based on
+		 *   mm switching to avoid IPIs completely.
+		 */
+		on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+		if (IS_ENABLED(CONFIG_DEBUG_VM))
+			on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+	} else {
+		/*
+		 * In this case, lazy tlb mms are refounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+	}
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -678,7 +725,12 @@ void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	shoot_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
+
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_subscriptions_destroy(mm);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 4/5] powerpc: use lazy mm refcount helper functions
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

Use _lazy_tlb functions for lazy mm refcounting in powerpc, to prepare
to move to MMU_LAZY_TLB_SHOOTDOWN.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8c2857cbd960..93c0eaa6f4bf 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1395,7 +1395,7 @@ void start_secondary(void *unused)
 {
 	unsigned int cpu = raw_smp_processor_id();
 
-	mmgrab(&init_mm);
+	mmgrab_lazy_tlb(&init_mm);
 	current->active_mm = &init_mm;
 
 	smp_store_cpu_info(cpu);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v2 5/5] powerpc/64s: enable MMU_LAZY_TLB_SHOOTDOWN
From: Nicholas Piggin @ 2020-12-14  6:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: linux-arch, Nicholas Piggin, linux-mm, Andy Lutomirski,
	linuxppc-dev
In-Reply-To: <20201214065312.270062-1-npiggin@gmail.com>

On a 16-socket 192-core POWER8 system, a context switching benchmark
with as many software threads as CPUs (so each switch will go in and
out of idle), upstream can achieve a rate of about 1 million context
switches per second. After this patch it goes up to 118 million.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5181872f9452..356138bdb5bb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -232,6 +232,7 @@ config PPC
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE
 	select MMU_GATHER_PAGE_SIZE
+	select MMU_LAZY_TLB_SHOOTDOWN		if PPC_BOOK3S_64
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
 	select HAVE_SYSCALL_TRACEPOINTS
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v2 3/5] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Randy Dunlap @ 2020-12-14  7:04 UTC (permalink / raw)
  To: Nicholas Piggin, linux-kernel
  Cc: linux-arch, linux-mm, linuxppc-dev, Andy Lutomirski
In-Reply-To: <20201214065312.270062-4-npiggin@gmail.com>

On 12/13/20 10:53 PM, Nicholas Piggin wrote:
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 84faaba66364..e69c974369cc 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -443,9 +443,22 @@ config MMU_LAZY_TLB
>  config MMU_LAZY_TLB_REFCOUNT
>  	def_bool y
>  	depends on MMU_LAZY_TLB
> +	depends on !MMU_LAZY_TLB_SHOOTDOWN
>  	help
> -	  This must be enabled if MMU_LAZY_TLB is enabled until the next
> -	  patch.
> +	  This refcounts the mm that is used as the lazy TLB mm when switching
> +	  switching to a kernel thread.

duplicate "switching".

> +
> +config MMU_LAZY_TLB_SHOOTDOWN
> +	bool
> +	depends on MMU_LAZY_TLB
> +	help
> +	  Instead of refcounting the "lazy tlb" mm struct, which can cause
> +	  contention with multi-threaded apps on large multiprocessor systems,
> +	  this option causes __mmdrop to IPI all CPUs in the mm_cpumask and
> +	  switch to init_mm if they were using the to-be-freed mm as the lazy
> +	  tlb. To implement this, architectures must use _lazy_tlb variants of
> +	  mm refcounting, and mm_cpumask must include at least all possible
> +	  CPUs in which mm might be lazy.
>  
>  config ARCH_HAVE_NMI_SAFE_CMPXCHG
>  	bool


-- 


^ permalink raw reply

* [PATCH] powerpc/kup: Mark the kuap/keup function non __init
From: Aneesh Kumar K.V @ 2020-12-14  7:13 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V

Kernel call these functions on cpu online and hence they should
not be marked __init.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s32/mmu.c   | 4 ++--
 arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
 arch/powerpc/mm/init-common.c    | 2 +-
 arch/powerpc/mm/nohash/8xx.c     | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 23f60e97196e..8d9e90a99b51 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Execution Prevention\n");
 
@@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Access Protection\n");
 
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 2b7ded396db4..f1c6f264ed91 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	if (disabled)
 		return;
diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
index afdebb95bcae..c71af3978496 100644
--- a/arch/powerpc/mm/init-common.c
+++ b/arch/powerpc/mm/init-common.c
@@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
 }
 early_param("nosmap", parse_nosmap);
 
-void __ref setup_kup(void)
+void setup_kup(void)
 {
 	setup_kuep(disable_kuep);
 	setup_kuap(disable_kuap);
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 231ca95f9ffb..9fba29b95b5a 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	pr_info("Activating Kernel Userspace Access Protection\n");
 
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH] powerpc/kup: Mark the kuap/keup function non __init
From: Christophe Leroy @ 2020-12-14  7:22 UTC (permalink / raw)
  To: Aneesh Kumar K.V, linuxppc-dev, mpe
In-Reply-To: <20201214071306.346399-1-aneesh.kumar@linux.ibm.com>



Le 14/12/2020 à 08:13, Aneesh Kumar K.V a écrit :
> Kernel call these functions on cpu online and hence they should
> not be marked __init.

This is PPC64 only.

See commit 
https://github.com/linuxppc/linux/commit/67d53f30e23ec66aa7bbdd1592d5e64d46876190#diff-9799ddc8e77e666295031a560afc2a754d2f5fa0ddfb96335495b26a07511ad4

Christophe

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   arch/powerpc/mm/book3s32/mmu.c   | 4 ++--
>   arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
>   arch/powerpc/mm/init-common.c    | 2 +-
>   arch/powerpc/mm/nohash/8xx.c     | 4 ++--
>   4 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
> index 23f60e97196e..8d9e90a99b51 100644
> --- a/arch/powerpc/mm/book3s32/mmu.c
> +++ b/arch/powerpc/mm/book3s32/mmu.c
> @@ -449,7 +449,7 @@ void __init print_system_hash_info(void)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Execution Prevention\n");
>   
> @@ -459,7 +459,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Access Protection\n");
>   
> diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
> index 2b7ded396db4..f1c6f264ed91 100644
> --- a/arch/powerpc/mm/book3s64/pkeys.c
> +++ b/arch/powerpc/mm/book3s64/pkeys.c
> @@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	if (disabled)
>   		return;
> @@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	if (disabled)
>   		return;
> diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c
> index afdebb95bcae..c71af3978496 100644
> --- a/arch/powerpc/mm/init-common.c
> +++ b/arch/powerpc/mm/init-common.c
> @@ -47,7 +47,7 @@ static int __init parse_nosmap(char *p)
>   }
>   early_param("nosmap", parse_nosmap);
>   
> -void __ref setup_kup(void)
> +void setup_kup(void)
>   {
>   	setup_kuep(disable_kuep);
>   	setup_kuap(disable_kuap);
> diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
> index 231ca95f9ffb..9fba29b95b5a 100644
> --- a/arch/powerpc/mm/nohash/8xx.c
> +++ b/arch/powerpc/mm/nohash/8xx.c
> @@ -245,7 +245,7 @@ void set_context(unsigned long id, pgd_t *pgd)
>   }
>   
>   #ifdef CONFIG_PPC_KUEP
> -void __init setup_kuep(bool disabled)
> +void setup_kuep(bool disabled)
>   {
>   	if (disabled)
>   		return;
> @@ -257,7 +257,7 @@ void __init setup_kuep(bool disabled)
>   #endif
>   
>   #ifdef CONFIG_PPC_KUAP
> -void __init setup_kuap(bool disabled)
> +void setup_kuap(bool disabled)
>   {
>   	pr_info("Activating Kernel Userspace Access Protection\n");
>   
> 

^ permalink raw reply

* [PATCH v2] powerpc/book3s/kup: Mark the kuap/keup function non __init
From: Aneesh Kumar K.V @ 2020-12-14  8:01 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V

The kernel call these functions on cpu online and hence they should
not be marked __init.

Fixes: 3b47b7549ead ("powerpc/book3s64/kuap: Move KUAP related function outside radix")
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s64/pkeys.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index 2b7ded396db4..f1c6f264ed91 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -251,7 +251,7 @@ void __init pkey_early_init_devtree(void)
 }
 
 #ifdef CONFIG_PPC_KUEP
-void __init setup_kuep(bool disabled)
+void setup_kuep(bool disabled)
 {
 	if (disabled)
 		return;
@@ -277,7 +277,7 @@ void __init setup_kuep(bool disabled)
 #endif
 
 #ifdef CONFIG_PPC_KUAP
-void __init setup_kuap(bool disabled)
+void setup_kuap(bool disabled)
 {
 	if (disabled)
 		return;
-- 
2.28.0


^ permalink raw reply related

* [PATCH kernel v3] powerpc/kuap: Restore AMR after replaying soft interrupts
From: Alexey Kardashevskiy @ 2020-12-14  8:39 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, Nicholas Piggin

Since de78a9c "powerpc: Add a framework for Kernel Userspace Access
Protection", user access helpers call user_{read|write}_access_{begin|end}
when user space access is allowed.

890274c "powerpc/64s: Implement KUAP for Radix MMU" made the mentioned
helpers program a AMR special register to allow such access for a short
period of time, most of the time AMR is expected to block user memory
access by the kernel.

Since the code accesses the user space memory, unsafe_get_user()
calls might_fault() which calls arch_local_irq_restore() if either
CONFIG_PROVE_LOCKING or CONFIG_DEBUG_ATOMIC_SLEEP is enabled.
arch_local_irq_restore() then attempts to replay pending soft interrupts
as KUAP regions have hardware interrupts enabled.
If a pending interrupt happens to do user access (performance interrupts
do that), it enables access for a short period of time so after returning
from the replay, the user access state remains blocked and if a user page
fault happens - "Bug: Read fault blocked by AMR!" appears and SIGSEGV is
sent.

This saves/restores AMR when replaying interrupts.

This adds a check if AMR was not blocked when before replaying interrupts.

Found by syzkaller. The call stack for the bug is:

copy_from_user_nofault+0xf8/0x250
perf_callchain_user_64+0x3d8/0x8d0
perf_callchain_user+0x38/0x50
get_perf_callchain+0x28c/0x300
perf_callchain+0xb0/0x130
perf_prepare_sample+0x364/0xbf0
perf_event_output_forward+0xe0/0x280
__perf_event_overflow+0xa4/0x240
perf_swevent_hrtimer+0x1d4/0x1f0
__hrtimer_run_queues+0x328/0x900
hrtimer_interrupt+0x128/0x350
timer_interrupt+0x180/0x600
replay_soft_interrupts+0x21c/0x4f0
arch_local_irq_restore+0x94/0x150
lock_is_held_type+0x140/0x200
___might_sleep+0x220/0x330
__might_fault+0x88/0x120
do_strncpy_from_user+0x108/0x2b0
strncpy_from_user+0x1d0/0x2a0
getname_flags+0x88/0x2c0
do_sys_openat2+0x2d4/0x5f0
do_sys_open+0xcc/0x140
system_call_exception+0x160/0x240
system_call_common+0xf0/0x27c

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
---
Changes:
v3:
* do not block/unblock if AMR was blocked
* reverted move of AMR_KUAP_***
* added kuap_check_amr

v2:
* fixed compile on hash
* moved get/set to arch_local_irq_restore
* block KUAP before replaying

---

This is an example:

------------[ cut here ]------------
Bug: Read fault blocked by AMR!
WARNING: CPU: 0 PID: 1603 at /home/aik/p/kernel/arch/powerpc/include/asm/book3s/64/kup-radix.h:145 __do_page_fau

Modules linked in:
CPU: 0 PID: 1603 Comm: amr Not tainted 5.10.0-rc6_v5.10-rc6_a+fstn1 #24
NIP:  c00000000009ece8 LR: c00000000009ece4 CTR: 0000000000000000
REGS: c00000000dc63560 TRAP: 0700   Not tainted  (5.10.0-rc6_v5.10-rc6_a+fstn1)
MSR:  8000000000021033 <SF,ME,IR,DR,RI,LE>  CR: 28002888  XER: 20040000
CFAR: c0000000001fa928 IRQMASK: 1
GPR00: c00000000009ece4 c00000000dc637f0 c000000002397600 000000000000001f
GPR04: c0000000020eb318 0000000000000000 c00000000dc63494 0000000000000027
GPR08: c00000007fe4de68 c00000000dfe9180 0000000000000000 0000000000000001
GPR12: 0000000000002000 c0000000030a0000 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 bfffffffffffffff
GPR20: 0000000000000000 c0000000134a4020 c0000000019c2218 0000000000000fe0
GPR24: 0000000000000000 0000000000000000 c00000000d106200 0000000040000000
GPR28: 0000000000000000 0000000000000300 c00000000dc63910 c000000001946730
NIP [c00000000009ece8] __do_page_fault+0xb38/0xde0
LR [c00000000009ece4] __do_page_fault+0xb34/0xde0
Call Trace:
[c00000000dc637f0] [c00000000009ece4] __do_page_fault+0xb34/0xde0 (unreliable)
[c00000000dc638a0] [c00000000000c968] handle_page_fault+0x10/0x2c
--- interrupt: 300 at strncpy_from_user+0x290/0x440
    LR = strncpy_from_user+0x284/0x440
[c00000000dc63ba0] [c000000000c3dcb0] strncpy_from_user+0x2f0/0x440 (unreliable)
[c00000000dc63c30] [c00000000068b888] getname_flags+0x88/0x2c0
[c00000000dc63c90] [c000000000662a44] do_sys_openat2+0x2d4/0x5f0
[c00000000dc63d30] [c00000000066560c] do_sys_open+0xcc/0x140
[c00000000dc63dc0] [c000000000045e10] system_call_exception+0x160/0x240
[c00000000dc63e20] [c00000000000da60] system_call_common+0xf0/0x27c
Instruction dump:
409c0048 3fe2ff5b 3bfff128 fac10060 fae10068 482f7a85 60000000 3c62ff5b
7fe4fb78 3863f250 4815bbd9 60000000 <0fe00000> 3c62ff5b 3863f2b8 4815c8b5
irq event stamp: 254
hardirqs last  enabled at (253): [<c000000000019550>] arch_local_irq_restore+0xa0/0x150
hardirqs last disabled at (254): [<c000000000008a10>] data_access_common_virt+0x1b0/0x1d0
softirqs last  enabled at (0): [<c0000000001f6d5c>] copy_process+0x78c/0x2120
softirqs last disabled at (0): [<0000000000000000>] 0x0
---[ end trace ba98aec5151f3aeb ]---
---
 arch/powerpc/kernel/irq.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 7d0f7682d01d..92f28ad78d6d 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -311,6 +311,23 @@ void replay_soft_interrupts(void)
 	}
 }
 
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP)
+static inline void replay_soft_interrupts_irqrestore(void)
+{
+	unsigned long kuap_state = get_kuap();
+
+	if (kuap_state != AMR_KUAP_BLOCKED)
+		set_kuap(AMR_KUAP_BLOCKED);
+
+	replay_soft_interrupts();
+
+	if (kuap_state != AMR_KUAP_BLOCKED)
+		set_kuap(kuap_state);
+}
+#else
+#define replay_soft_interrupts_irqrestore() replay_soft_interrupts()
+#endif
+
 notrace void arch_local_irq_restore(unsigned long mask)
 {
 	unsigned char irq_happened;
@@ -320,6 +337,14 @@ notrace void arch_local_irq_restore(unsigned long mask)
 	if (mask)
 		return;
 
+	/*
+	 * It fires if anything calls local_irq_enable or restore when
+	 * KUAP is enabled, and the code handles that just fine by saving
+	 * and re-locking AMR but we would like to remove those calls,
+	 * hence the warning.
+	 */
+	kuap_check_amr();
+
 	/*
 	 * From this point onward, we can take interrupts, preempt,
 	 * etc... unless we got hard-disabled. We check if an event
@@ -373,7 +398,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
 	irq_soft_mask_set(IRQS_ALL_DISABLED);
 	trace_hardirqs_off();
 
-	replay_soft_interrupts();
+	replay_soft_interrupts_irqrestore();
 	local_paca->irq_happened = 0;
 
 	trace_hardirqs_on();
-- 
2.17.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox