LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 15/18] powerpc/embedded6xx/mpc7448: Move PHB discovery
From: Oliver O'Halloran @ 2020-11-03  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103043523.916109-1-oohall@gmail.com>

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
compile tested with mpc7448_hpc2_defconfig
---
 arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
index b95c3380d2b5..5565647dc879 100644
--- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
+++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c
@@ -58,16 +58,14 @@ int mpc7448_hpc2_exclude_device(struct pci_controller *hose,
 		return PCIBIOS_SUCCESSFUL;
 }
 
-static void __init mpc7448_hpc2_setup_arch(void)
+static void __init mpc7448_hpc2_setup_pci(void)
 {
+#ifdef CONFIG_PCI
 	struct device_node *np;
 	if (ppc_md.progress)
-		ppc_md.progress("mpc7448_hpc2_setup_arch():set_bridge", 0);
-
-	tsi108_csr_vir_base = get_vir_csrbase();
+		ppc_md.progress("mpc7448_hpc2_setup_pci():set_bridge", 0);
 
 	/* setup PCI host bridge */
-#ifdef CONFIG_PCI
 	for_each_compatible_node(np, "pci", "tsi108-pci")
 		tsi108_setup_pci(np, MPC7448HPC2_PCI_CFG_PHYS, 0);
 
@@ -75,6 +73,11 @@ static void __init mpc7448_hpc2_setup_arch(void)
 	if (ppc_md.progress)
 		ppc_md.progress("tsi108: resources set", 0x100);
 #endif
+}
+
+static void __init mpc7448_hpc2_setup_arch(void)
+{
+	tsi108_csr_vir_base = get_vir_csrbase();
 
 	printk(KERN_INFO "MPC7448HPC2 (TAIGA) Platform\n");
 	printk(KERN_INFO
@@ -181,6 +184,7 @@ define_machine(mpc7448_hpc2){
 	.name 			= "MPC7448 HPC2",
 	.probe 			= mpc7448_hpc2_probe,
 	.setup_arch 		= mpc7448_hpc2_setup_arch,
+	.discover_phbs		= mpc7448_hpc2_setup_pci,
 	.init_IRQ 		= mpc7448_hpc2_init_IRQ,
 	.show_cpuinfo 		= mpc7448_hpc2_show_cpuinfo,
 	.get_irq 		= mpic_get_irq,
-- 
2.26.2


^ permalink raw reply related

* [PATCH 16/18] powerpc/embedded6xx/mvme5100: Move PHB discovery
From: Oliver O'Halloran @ 2020-11-03  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103043523.916109-1-oohall@gmail.com>

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
compile tested with mvme5100_defconfig
---
 arch/powerpc/platforms/embedded6xx/mvme5100.c   | 13 ++++++++-----
 arch/powerpc/platforms/embedded6xx/storcenter.c |  8 ++++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/embedded6xx/mvme5100.c b/arch/powerpc/platforms/embedded6xx/mvme5100.c
index 1cd488daa0bf..c06a0490d157 100644
--- a/arch/powerpc/platforms/embedded6xx/mvme5100.c
+++ b/arch/powerpc/platforms/embedded6xx/mvme5100.c
@@ -154,17 +154,19 @@ static const struct of_device_id mvme5100_of_bus_ids[] __initconst = {
  */
 static void __init mvme5100_setup_arch(void)
 {
-	struct device_node *np;
-
 	if (ppc_md.progress)
 		ppc_md.progress("mvme5100_setup_arch()", 0);
 
-	for_each_compatible_node(np, "pci", "hawk-pci")
-		mvme5100_add_bridge(np);
-
 	restart = ioremap(BOARD_MODRST_REG, 4);
 }
 
+static void __init mvme5100_setup_pci(void)
+{
+	struct device_node *np;
+
+	for_each_compatible_node(np, "pci", "hawk-pci")
+		mvme5100_add_bridge(np);
+}
 
 static void mvme5100_show_cpuinfo(struct seq_file *m)
 {
@@ -205,6 +207,7 @@ define_machine(mvme5100) {
 	.name			= "MVME5100",
 	.probe			= mvme5100_probe,
 	.setup_arch		= mvme5100_setup_arch,
+	.discover_phbs		= mvme5100_setup_pci,
 	.init_IRQ		= mvme5100_pic_init,
 	.show_cpuinfo		= mvme5100_show_cpuinfo,
 	.get_irq		= mpic_get_irq,
diff --git a/arch/powerpc/platforms/embedded6xx/storcenter.c b/arch/powerpc/platforms/embedded6xx/storcenter.c
index e346ddcef45e..e188b90f7016 100644
--- a/arch/powerpc/platforms/embedded6xx/storcenter.c
+++ b/arch/powerpc/platforms/embedded6xx/storcenter.c
@@ -65,14 +65,17 @@ static int __init storcenter_add_bridge(struct device_node *dev)
 }
 
 static void __init storcenter_setup_arch(void)
+{
+	printk(KERN_INFO "IOMEGA StorCenter\n");
+}
+
+static void __init storcenter_setup_pci(void)
 {
 	struct device_node *np;
 
 	/* Lookup PCI host bridges */
 	for_each_compatible_node(np, "pci", "mpc10x-pci")
 		storcenter_add_bridge(np);
-
-	printk(KERN_INFO "IOMEGA StorCenter\n");
 }
 
 /*
@@ -117,6 +120,7 @@ define_machine(storcenter){
 	.name 			= "IOMEGA StorCenter",
 	.probe 			= storcenter_probe,
 	.setup_arch 		= storcenter_setup_arch,
+	.discover_phbs 		= storcenter_setup_pci,
 	.init_IRQ 		= storcenter_init_IRQ,
 	.get_irq 		= mpic_get_irq,
 	.restart 		= storcenter_restart,
-- 
2.26.2


^ permalink raw reply related

* [PATCH 17/18] powerpc/pasemi: Move PHB discovery
From: Oliver O'Halloran @ 2020-11-03  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103043523.916109-1-oohall@gmail.com>

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
compile tested with pasemi_defconfig
---
 arch/powerpc/platforms/pasemi/setup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pasemi/setup.c b/arch/powerpc/platforms/pasemi/setup.c
index b612474f8f8e..376797eb7894 100644
--- a/arch/powerpc/platforms/pasemi/setup.c
+++ b/arch/powerpc/platforms/pasemi/setup.c
@@ -144,8 +144,6 @@ static void __init pas_setup_arch(void)
 	/* Setup SMP callback */
 	smp_ops = &pas_smp_ops;
 #endif
-	/* Lookup PCI hosts */
-	pas_pci_init();
 
 	/* Remap SDC register for doing reset */
 	/* XXXOJN This should maybe come out of the device tree */
@@ -446,6 +444,7 @@ define_machine(pasemi) {
 	.name			= "PA Semi PWRficient",
 	.probe			= pas_probe,
 	.setup_arch		= pas_setup_arch,
+	.discover_phbs		= pas_pci_init,
 	.init_IRQ		= pas_init_IRQ,
 	.get_irq		= mpic_get_irq,
 	.restart		= pas_restart,
-- 
2.26.2


^ permalink raw reply related

* [PATCH 18/18] powerpc/powermac: Move PHB discovery
From: Oliver O'Halloran @ 2020-11-03  4:35 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103043523.916109-1-oohall@gmail.com>

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
compile tested with pmac32_defconfig and g5_defconfig
---
 arch/powerpc/platforms/powermac/setup.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c
index 2e2cc0c75d87..86aee3f2483f 100644
--- a/arch/powerpc/platforms/powermac/setup.c
+++ b/arch/powerpc/platforms/powermac/setup.c
@@ -298,9 +298,6 @@ static void __init pmac_setup_arch(void)
 		of_node_put(ic);
 	}
 
-	/* Lookup PCI hosts */
-	pmac_pci_init();
-
 #ifdef CONFIG_PPC32
 	ohare_init();
 	l2cr_init();
@@ -600,6 +597,7 @@ define_machine(powermac) {
 	.name			= "PowerMac",
 	.probe			= pmac_probe,
 	.setup_arch		= pmac_setup_arch,
+	.discover_phbs		= pmac_pci_init,
 	.show_cpuinfo		= pmac_show_cpuinfo,
 	.init_IRQ		= pmac_pic_init,
 	.get_irq		= NULL,	/* changed later */
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH v2] powerpc/pci: unmap legacy INTx interrupts when a PHB is removed
From: Oliver O'Halloran @ 2020-11-03  4:40 UTC (permalink / raw)
  To: Cédric Le Goater
  Cc: Stephen Rothwell, Alexey Kardashevskiy, Linux Kernel Mailing List,
	Qian Cai, Linux-Next Mailing List, linuxppc-dev
In-Reply-To: <3497b725-9108-9f63-9cc2-ac7b1dd06c09@kaod.org>

On Tue, Nov 3, 2020 at 1:39 AM Cédric Le Goater <clg@kaod.org> wrote:
>
> On 10/14/20 4:55 AM, Alexey Kardashevskiy wrote:
> >
> > How do you remove PHBs exactly? There is no such thing in the powernv platform, I thought someone added this and you are fixing it but no. PHBs on powernv are created at the boot time and there is no way to remove them, you can only try removing all the bridges.
>
> yes. I noticed that later when proposing the fix for the double
> free.
>
> > So what exactly are you doing?
>
> What you just said above, with the commands :
>
>   echo 1 >  /sys/devices/pci0031\:00/0031\:00\:00.0/remove
>   echo 1 >  /sys/devices/pci0031\:00/pci_bus/0031\:00/rescan

Right, so that'll remove the root port device (and Bus 01 beneath it),
but the PHB itself is still there. If it was removed the root bus
would also disappear.

^ permalink raw reply

* [PATCH 1/3] selftests/powerpc: Hoist helper code out of eeh-basic
From: Oliver O'Halloran @ 2020-11-03  4:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran

Hoist some of the useful test environment checking and prep code into
eeh-functions.sh so they can be reused in other tests.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 .../selftests/powerpc/eeh/eeh-basic.sh        | 39 ++-------------
 .../selftests/powerpc/eeh/eeh-functions.sh    | 48 +++++++++++++++++++
 2 files changed, 51 insertions(+), 36 deletions(-)
 mode change 100755 => 100644 tools/testing/selftests/powerpc/eeh/eeh-functions.sh

diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
index 0d783e1065c8..16d00555f13e 100755
--- a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -1,28 +1,13 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0-only
 
-KSELFTESTS_SKIP=4
-
 . ./eeh-functions.sh
 
-if ! eeh_supported ; then
-	echo "EEH not supported on this system, skipping"
-	exit $KSELFTESTS_SKIP;
-fi
-
-if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
-   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
-	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
-	exit $KSELFTESTS_SKIP;
-fi
+eeh_test_prep # NB: may exit
 
 pre_lspci=`mktemp`
 lspci > $pre_lspci
 
-# Bump the max freeze count to something absurd so we don't
-# trip over it while breaking things.
-echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
-
 # record the devices that we break in here. Assuming everything
 # goes to plan we should get them back once the recover process
 # is finished.
@@ -30,34 +15,16 @@ devices=""
 
 # Build up a list of candidate devices.
 for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
-	# skip bridges since we can't recover them (yet...)
-	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
-		echo "$dev, Skipped: bridge"
+	if ! eeh_can_break $dev ; then
 		continue;
 	fi
 
-	# Skip VFs for now since we don't have a reliable way
-	# to break them.
+	# Skip VFs for now since we don't have a reliable way to break them.
 	if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
 		echo "$dev, Skipped: virtfn"
 		continue;
 	fi
 
-	if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
-		echo "$dev, Skipped: ahci doesn't support recovery"
-		continue
-	fi
-
-	# Don't inject errosr into an already-frozen PE. This happens with
-	# PEs that contain multiple PCI devices (e.g. multi-function cards)
-	# and injecting new errors during the recovery process will probably
-	# result in the recovery failing and the device being marked as
-	# failed.
-	if ! pe_ok $dev ; then
-		echo "$dev, Skipped: Bad initial PE state"
-		continue;
-	fi
-
 	echo "$dev, Added"
 
 	# Add to this list of device to check
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
old mode 100755
new mode 100644
index 00dc32c0ed75..9b1bcc1fd4ad
--- a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0-only
 
+export KSELFTESTS_SKIP=4
+
 pe_ok() {
 	local dev="$1"
 	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
@@ -39,6 +41,52 @@ eeh_supported() {
 	grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
 }
 
+eeh_test_prep() {
+	if ! eeh_supported ; then
+		echo "EEH not supported on this system, skipping"
+		exit $KSELFTESTS_SKIP;
+	fi
+
+	if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
+	   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
+		echo "debugfs EEH testing files are missing. Is debugfs mounted?"
+		exit $KSELFTESTS_SKIP;
+	fi
+
+	# Bump the max freeze count to something absurd so we don't
+	# trip over it while breaking things.
+	echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
+}
+
+eeh_can_break() {
+	# skip bridges since we can't recover them (yet...)
+	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
+		echo "$dev, Skipped: bridge"
+		return 1;
+	fi
+
+	# The ahci driver doesn't support error recovery. If the ahci device
+	# happens to be hosting the root filesystem, and then we go and break
+	# it the system will generally go down. We should probably fix that
+	# at some point
+	if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
+		echo "$dev, Skipped: ahci doesn't support recovery"
+		return 1;
+	fi
+
+	# Don't inject errosr into an already-frozen PE. This happens with
+	# PEs that contain multiple PCI devices (e.g. multi-function cards)
+	# and injecting new errors during the recovery process will probably
+	# result in the recovery failing and the device being marked as
+	# failed.
+	if ! pe_ok $dev ; then
+		echo "$dev, Skipped: Bad initial PE state"
+		return 1;
+	fi
+
+	return 0
+}
+
 eeh_one_dev() {
 	local dev="$1"
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 2/3] selftests/powerpc: Use stderr for debug messages in eeh-functions
From: Oliver O'Halloran @ 2020-11-03  4:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103044503.917128-1-oohall@gmail.com>

We want to use stdout to return lists of devices, etc so log debug / status
messages to stderr rather than stdout.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 .../selftests/powerpc/eeh/eeh-functions.sh    | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
index 9b1bcc1fd4ad..32e5b7fbf18a 100644
--- a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -3,6 +3,10 @@
 
 export KSELFTESTS_SKIP=4
 
+log() {
+	echo >/dev/stderr $*
+}
+
 pe_ok() {
 	local dev="$1"
 	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
@@ -49,7 +53,7 @@ eeh_test_prep() {
 
 	if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
 	   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
-		echo "debugfs EEH testing files are missing. Is debugfs mounted?"
+		log "debugfs EEH testing files are missing. Is debugfs mounted?"
 		exit $KSELFTESTS_SKIP;
 	fi
 
@@ -61,7 +65,7 @@ eeh_test_prep() {
 eeh_can_break() {
 	# skip bridges since we can't recover them (yet...)
 	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
-		echo "$dev, Skipped: bridge"
+		log "$dev, Skipped: bridge"
 		return 1;
 	fi
 
@@ -70,7 +74,7 @@ eeh_can_break() {
 	# it the system will generally go down. We should probably fix that
 	# at some point
 	if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
-		echo "$dev, Skipped: ahci doesn't support recovery"
+		log "$dev, Skipped: ahci doesn't support recovery"
 		return 1;
 	fi
 
@@ -80,7 +84,7 @@ eeh_can_break() {
 	# result in the recovery failing and the device being marked as
 	# failed.
 	if ! pe_ok $dev ; then
-		echo "$dev, Skipped: Bad initial PE state"
+		log "$dev, Skipped: Bad initial PE state"
 		return 1;
 	fi
 
@@ -94,7 +98,7 @@ eeh_one_dev() {
 	# testing so check that the argument is a well-formed sysfs device
 	# name.
 	if ! test -e /sys/bus/pci/devices/$dev/ ; then
-		echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
+		log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
 		return 1;
 	fi
 
@@ -118,16 +122,16 @@ eeh_one_dev() {
 		if pe_ok $dev ; then
 			break;
 		fi
-		echo "$dev, waited $i/${max_wait}"
+		log "$dev, waited $i/${max_wait}"
 		sleep 1
 	done
 
 	if ! pe_ok $dev ; then
-		echo "$dev, Failed to recover!"
+		log "$dev, Failed to recover!"
 		return 1;
 	fi
 
-	echo "$dev, Recovered after $i seconds"
+	log "$dev, Recovered after $i seconds"
 	return 0;
 }
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH 3/3] selftests/powerpc: Add VF recovery tests
From: Oliver O'Halloran @ 2020-11-03  4:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103044503.917128-1-oohall@gmail.com>

The basic EEH test ignores VFs since the way the eeh_dev_break debugfs
interface works means that if multiple VFs are enabled we may cause errors
on all of them. However, we can work around that by only enabling a
single VF at a time.

This patch adds some infrastructure for finding SR-IOV capable devices and
enabling / disabling VFs so we can exercise the VF specific EEH recovery
paths. Two new tests are added, one for testing EEH aware devices and one
for EEH un-aware VFs.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 .../selftests/powerpc/eeh/eeh-functions.sh    | 108 ++++++++++++++++++
 .../selftests/powerpc/eeh/eeh-vf-aware.sh     |  45 ++++++++
 .../selftests/powerpc/eeh/eeh-vf-unaware.sh   |  35 ++++++
 3 files changed, 188 insertions(+)
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh

diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
index 32e5b7fbf18a..70daa3925dcb 100644
--- a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -135,3 +135,111 @@ eeh_one_dev() {
 	return 0;
 }
 
+eeh_has_driver() {
+	test -e /sys/bus/pci/devices/$1/driver;
+	return $?
+}
+
+eeh_can_recover() {
+	# we'll get an IO error if the device's current driver doesn't support
+	# error recovery
+	echo $1 > '/sys/kernel/debug/powerpc/eeh_dev_can_recover' 2>/dev/null
+
+	return $?
+}
+
+eeh_find_all_pfs() {
+	devices=""
+
+	# SR-IOV on pseries requires hypervisor support, so check for that
+	is_pseries=""
+	if grep -q pSeries /proc/cpuinfo ; then
+		if [ ! -f /proc/device-tree/rtas/ibm,open-sriov-allow-unfreeze ] ||
+		   [ ! -f /proc/device-tree/rtas/ibm,open-sriov-map-pe-number ] ; then
+			return 1;
+		fi
+
+		is_pseries="true"
+	fi
+
+	for dev in `ls -1 /sys/bus/pci/devices/` ; do
+		sysfs="/sys/bus/pci/devices/$dev"
+		if [ ! -e "$sysfs/sriov_numvfs" ] ; then
+			continue
+		fi
+
+		# skip unsupported PFs on pseries
+		if [ -z "$is_pseries" ] &&
+		   [ ! -f "$sysfs/of_node/ibm,is-open-sriov-pf" ] &&
+		   [ ! -f "$sysfs/of_node/ibm,open-sriov-vf-bar-info" ] ; then
+			continue;
+		fi
+
+		# no driver, no vfs
+		if ! eeh_has_driver $dev ; then
+			continue
+		fi
+
+		devices="$devices $dev"
+	done
+
+	if [ -z "$devices" ] ; then
+		return 1;
+	fi
+
+	echo $devices
+	return 0;
+}
+
+# attempts to enable one VF on each PF so we can do VF specific tests.
+# stdout: list of enabled VFs, one per line
+# return code: 0 if vfs are found, 1 otherwise
+eeh_enable_vfs() {
+	pf_list="$(eeh_find_all_pfs)"
+
+	vfs=0
+	for dev in $pf_list ; do
+		pf_sysfs="/sys/bus/pci/devices/$dev"
+
+		# make sure we have a single VF
+		echo 0 > "$pf_sysfs/sriov_numvfs"
+		echo 1 > "$pf_sysfs/sriov_numvfs"
+		if [ "$?" != 0 ] ; then
+			log "Unable to enable VFs on $pf, skipping"
+			continue;
+		fi
+
+		vf="$(basename $(realpath "$pf_sysfs/virtfn0"))"
+		if [ $? != 0 ] ; then
+			log "unable to find enabled vf on $pf"
+			echo 0 > "$pf_sysfs/sriov_numvfs"
+			continue;
+		fi
+
+		if ! eeh_can_break $vf ; then
+			log "skipping "
+
+			echo 0 > "$pf_sysfs/sriov_numvfs"
+			continue;
+		fi
+
+		vfs="$((vfs + 1))"
+		echo $vf
+	done
+
+	test "$vfs" != 0
+	return $?
+}
+
+eeh_disable_vfs() {
+	pf_list="$(eeh_find_all_pfs)"
+	if [ -z "$pf_list" ] ; then
+		return 1;
+	fi
+
+	for dev in $pf_list ; do
+		echo 0 > "/sys/bus/pci/devices/$dev/sriov_numvfs"
+	done
+
+	return 0;
+}
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh b/tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
new file mode 100755
index 000000000000..874c11953bb6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+. ./eeh-functions.sh
+
+eeh_test_prep # NB: may exit
+
+vf_list="$(eeh_enable_vfs)";
+if $? != 0 ; then
+	log "No usable VFs found. Skipping EEH unaware VF test"
+	exit $KSELFTESTS_SKIP;
+fi
+
+log "Enabled VFs: $vf_list"
+
+tested=0
+passed=0
+for vf in $vf_list ; do
+	log "Testing $vf"
+
+	if ! eeh_can_recover $vf ; then
+		log "Driver for $vf doesn't support error recovery, skipping"
+		continue;
+	fi
+
+	tested="$((tested + 1))"
+
+	log "Breaking $vf..."
+	if ! eeh_one_dev $vf ; then
+		log "$vf failed to recover"
+		continue;
+	fi
+
+	passed="$((passed + 1))"
+done
+
+eeh_disable_vfs
+
+if [ "$tested" == 0 ] ; then
+	echo "No VFs with EEH aware drivers found, skipping"
+	exit $KSELFTESTS_SKIP
+fi
+
+test "$failed" != 0
+exit $?;
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh b/tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh
new file mode 100755
index 000000000000..8a4c147b9d43
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+. ./eeh-functions.sh
+
+eeh_test_prep # NB: may exit
+
+vf_list="$(eeh_enable_vfs)";
+if $? != 0 ; then
+	log "No usable VFs found. Skipping EEH unaware VF test"
+	exit $KSELFTESTS_SKIP;
+fi
+
+log "Enabled VFs: $vf_list"
+
+failed=0
+for vf in $vf_list ; do
+	log "Testing $vf"
+
+	if eeh_can_recover $vf ; then
+		log "Driver for $vf supports error recovery. Unbinding..."
+		echo "$vf" > /sys/bus/pci/devices/$vf/driver/unbind
+	fi
+
+	log "Breaking $vf..."
+	if ! eeh_one_dev $vf ; then
+		log "$vf failed to recover"
+		failed="$((failed + 1))"
+	fi
+done
+
+eeh_disable_vfs
+
+test "$failed" != 0
+exit $?;
-- 
2.26.2


^ permalink raw reply related

* [PATCH 1/2] powerpc/eeh: Rework pci_dev lookup in debugfs attributes
From: Oliver O'Halloran @ 2020-11-03  5:15 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran

Pull the string -> pci_dev lookup stuff into a helper function. No functional change.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/kernel/eeh.c | 71 ++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 813713c9120c..f9182ff57804 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1596,6 +1596,35 @@ static int proc_eeh_show(struct seq_file *m, void *v)
 }
 
 #ifdef CONFIG_DEBUG_FS
+
+
+static struct pci_dev *eeh_debug_lookup_pdev(struct file *filp,
+					     const char __user *user_buf,
+					     size_t count, loff_t *ppos)
+{
+	uint32_t domain, bus, dev, fn;
+	struct pci_dev *pdev;
+	char buf[20];
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
+	if (!ret)
+		return ERR_PTR(-EFAULT);
+
+	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+	if (ret != 4) {
+		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+		return ERR_PTR(-EINVAL);
+	}
+
+	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+	if (!pdev)
+		return ERR_PTR(-ENODEV);
+
+	return pdev;
+}
+
 static int eeh_enable_dbgfs_set(void *data, u64 val)
 {
 	if (val)
@@ -1688,26 +1717,13 @@ static ssize_t eeh_dev_check_write(struct file *filp,
 				const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
-	uint32_t domain, bus, dev, fn;
 	struct pci_dev *pdev;
 	struct eeh_dev *edev;
-	char buf[20];
 	int ret;
 
-	memset(buf, 0, sizeof(buf));
-	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
-	if (!ret)
-		return -EFAULT;
-
-	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
-	if (ret != 4) {
-		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
-		return -EINVAL;
-	}
-
-	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
-	if (!pdev)
-		return -ENODEV;
+	pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
 	edev = pci_dev_to_eeh_dev(pdev);
 	if (!edev) {
@@ -1717,8 +1733,8 @@ static ssize_t eeh_dev_check_write(struct file *filp,
 	}
 
 	ret = eeh_dev_check_failure(edev);
-	pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
-			domain, bus, dev, fn, ret);
+	pci_info(pdev, "eeh_dev_check_failure(%s) = %d\n",
+			pci_name(pdev), ret);
 
 	pci_dev_put(pdev);
 
@@ -1829,25 +1845,12 @@ static ssize_t eeh_dev_break_write(struct file *filp,
 				const char __user *user_buf,
 				size_t count, loff_t *ppos)
 {
-	uint32_t domain, bus, dev, fn;
 	struct pci_dev *pdev;
-	char buf[20];
 	int ret;
 
-	memset(buf, 0, sizeof(buf));
-	ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
-	if (!ret)
-		return -EFAULT;
-
-	ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
-	if (ret != 4) {
-		pr_err("%s: expected 4 args, got %d\n", __func__, ret);
-		return -EINVAL;
-	}
-
-	pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
-	if (!pdev)
-		return -ENODEV;
+	pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
 	ret = eeh_debugfs_break_device(pdev);
 	pci_dev_put(pdev);
-- 
2.26.2


^ permalink raw reply related

* [PATCH 2/2] powerpc/eeh: Add a debugfs interface to check if a driver supports recovery
From: Oliver O'Halloran @ 2020-11-03  5:15 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Oliver O'Halloran
In-Reply-To: <20201103051512.919333-1-oohall@gmail.com>

If a PCI device's current driver implements the error handling callbacks
EEH can use them to recover the device after an error occurs. For devices
without the error handling callbacks we recover them by removing the device
and re-scanning it so the PCI core puts the device back into a known good
state.

Currently there's no way for userspace to determine if the driver supports
recovery or not which makes it difficult to write automated tests for EEH.
This patch addresses that by adding a debugfs interface for querying if
a specific device can be recovered or not.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
---
 arch/powerpc/kernel/eeh.c | 50 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f9182ff57804..cd60bc1c8701 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1868,6 +1868,53 @@ static const struct file_operations eeh_dev_break_fops = {
 	.read   = eeh_debugfs_dev_usage,
 };
 
+static ssize_t eeh_dev_can_recover(struct file *filp,
+				   const char __user *user_buf,
+				   size_t count, loff_t *ppos)
+{
+	struct pci_driver *drv;
+	struct pci_dev *pdev;
+	size_t ret;
+
+	pdev = eeh_debug_lookup_pdev(filp, user_buf, count, ppos);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	/*
+	 * In order for error recovery to work the driver needs to implement
+	 * .error_detected(), so it can quiesce IO to the device, and
+	 * .slot_reset() so it can re-initialise the device after a reset.
+	 *
+	 * Ideally they'd implement .resume() too, but some drivers which
+	 * we need to support (notably IPR) don't so I guess we can tolerate
+	 * that.
+	 *
+	 * .mmio_enabled() is mostly there as a work-around for devices which
+	 * take forever to re-init after a hot reset. Implementing that is
+	 * strictly optional.
+	 */
+	drv = pci_dev_driver(pdev);
+	if (drv &&
+	    drv->err_handler &&
+	    drv->err_handler->error_detected &&
+	    drv->err_handler->slot_reset) {
+		ret = count;
+	} else {
+		ret = -EOPNOTSUPP;
+	}
+
+	pci_dev_put(pdev);
+
+	return ret;
+}
+
+static const struct file_operations eeh_dev_can_recover_fops = {
+	.open	= simple_open,
+	.llseek	= no_llseek,
+	.write	= eeh_dev_can_recover,
+	.read   = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1892,6 +1939,9 @@ static int __init eeh_init_proc(void)
 		debugfs_create_file_unsafe("eeh_force_recover", 0600,
 				powerpc_debugfs_root, NULL,
 				&eeh_force_recover_fops);
+		debugfs_create_file_unsafe("eeh_dev_can_recover", 0600,
+				powerpc_debugfs_root, NULL,
+				&eeh_dev_can_recover_fops);
 		eeh_cache_debugfs_init();
 #endif
 	}
-- 
2.26.2


^ permalink raw reply related

* [Bug 209869] Kernel 5.10-rc1 fails to boot on a PowerMac G4 3,6 at an early stage
From: bugzilla-daemon @ 2020-11-03  7:36 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-209869-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=209869

Christophe Leroy (christophe.leroy@csgroup.eu) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |christophe.leroy@csgroup.eu

--- Comment #1 from Christophe Leroy (christophe.leroy@csgroup.eu) ---
Could you try reverting commit
https://github.com/linuxppc/linux/commit/69a1593abdbcf03a76367320d929a8ae7a5e3d71
?

I got another report from someone who has the same problem and bisected it to
that commit.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH] powerpc/32s: Setup the early hash table at all time.
From: Christophe Leroy @ 2020-11-03  7:41 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <87pn4zc0zl.fsf@igel.home>

Hi Andreas,

Le 30/10/2020 à 14:11, Andreas Schwab a écrit :
> #
> # Automatically generated file; DO NOT EDIT.
> # Linux/powerpc 5.10.0-rc1 Kernel Configuration
> #

I tried again on QEMU with both pmac32_defconfig and your config, and it boots.

I really can't understand what the problem is, because that patch only activates at all times 
something that has been working well when CONFIG_KASAN is set.

Would you mind checking that with that patch reverted, you are able to boot a kernel built with 
CONFIG_KASAN ?

Thanks
Christophe

^ permalink raw reply

* Re: [PATCH] powerpc/32s: Setup the early hash table at all time.
From: Andreas Schwab @ 2020-11-03  8:43 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <1f8494cd-36db-e3a2-8ea4-28fb976468e7@csgroup.eu>

On Nov 03 2020, Christophe Leroy wrote:

> I tried again on QEMU with both pmac32_defconfig and your config, and it boots.

Isn't it quite naïve to think that qemu provides an adequate test
environment for such lowlevel stuff?

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply

* Re: [PATCH] powerpc/32s: Setup the early hash table at all time.
From: Andreas Schwab @ 2020-11-03  8:56 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Paul Mackerras, linuxppc-dev, linux-kernel
In-Reply-To: <1f8494cd-36db-e3a2-8ea4-28fb976468e7@csgroup.eu>

On Nov 03 2020, Christophe Leroy wrote:

> Would you mind checking that with that patch reverted, you are able to
> boot a kernel built with CONFIG_KASAN ?

That doesn't exist.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply

* Re: [PATCH v1 4/4] powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations
From: Michal Hocko @ 2020-11-03  9:23 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Wei Yang, linux-kernel, linux-mm, Paul Mackerras, Rashmica Gupta,
	linuxppc-dev, Andrew Morton, Mike Rapoport, Oscar Salvador
In-Reply-To: <20201029162718.29910-5-david@redhat.com>

On Thu 29-10-20 17:27:18, David Hildenbrand wrote:
> Let's use alloc_contig_pages() for allocating memory and remove the
> linear mapping manually via arch_remove_linear_mapping(). Mark all pages
> PG_offline, such that they will definitely not get touched - e.g.,
> when hibernating. When freeing memory, try to revert what we did.
> 
> The original idea was discussed in:
>  https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com
> 
> This is similar to CONFIG_DEBUG_PAGEALLOC handling on other
> architectures, whereby only single pages are unmapped from the linear
> mapping. Let's mimic what memory hot(un)plug would do with the linear
> mapping.
> 
> We now need MEMORY_HOTPLUG and CONTIG_ALLOC as dependencies.
> 
> Simple test under QEMU TCG (10GB RAM, single NUMA node):
> 
> sh-5.0# mount -t debugfs none /sys/kernel/debug/
> sh-5.0# cat /sys/devices/system/memory/block_size_bytes
> 40000000
> sh-5.0# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
> [   71.052836][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
> sh-5.0# echo 0x80000000 > /sys/kernel/debug/powerpc/memtrace/enable
> [   75.424302][  T356] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
> [   75.430549][  T356] memtrace: Freed trace memory back on node 0
> [   75.604520][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
> sh-5.0# echo 0x100000000 > /sys/kernel/debug/powerpc/memtrace/enable
> [   80.418835][  T356] radix-mmu: Mapped 0x0000000080000000-0x0000000100000000 with 64.0 KiB pages
> [   80.430493][  T356] memtrace: Freed trace memory back on node 0
> [   80.433882][  T356] memtrace: Failed to allocate trace memory on node 0
> sh-5.0# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
> [   91.920158][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
> 
> Note 1: We currently won't be allocating from ZONE_MOVABLE - because our
> 	pages are not movable. However, as we don't run with any memory
> 	hot(un)plug mechanism around, we could make an exception to
> 	increase the chance of allocations succeeding.
> 
> Note 2: PG_reserved isn't sufficient. E.g., kernel_page_present() used
> 	along PG_reserved in hibernation code will always return "true"
> 	on powerpc, resulting in the pages getting touched. It's too
> 	generic - e.g., indicates boot allocations.
> 
> Note 3: For now, we keep using memory_block_size_bytes() as minimum
> 	granularity. I'm not able to come up with a better guess (most
> 	probably, doing it on a section basis could be possible).
> 
> Suggested-by: Michal Hocko <mhocko@kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Cc: Rashmica Gupta <rashmica.g@gmail.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Mike Rapoport <rppt@kernel.org>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>

Thanks! This looks like a move in the right direction. I cannot really
judge implementation details because I am not familiar with the code.
I have only one tiny concern:
[...]
> -/* called with device_hotplug_lock held */
> -static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +static u64 memtrace_alloc_node(u32 nid, u64 size)
>  {
> -	const unsigned long start = PFN_PHYS(start_pfn);
> -	const unsigned long size = PFN_PHYS(nr_pages);
> +	const unsigned long nr_pages = PHYS_PFN(size);
> +	unsigned long pfn, start_pfn;
> +	struct page *page;
>  
> -	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
> -		return false;
> -
> -	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
> -			   change_memblock_state);
> -
> -	if (offline_pages(start_pfn, nr_pages)) {
> -		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
> -				   change_memblock_state);
> -		return false;
> -	}
> +	/*
> +	 * Trace memory needs to be aligned to the size, which is guaranteed
> +	 * by alloc_contig_pages().
> +	 */
> +	page = alloc_contig_pages(nr_pages, __GFP_THISNODE | __GFP_NOWARN,
> +				  nid, NULL);

__GFP_THISNODE without other modifiers looks suspicious. I suspect you
want to enforce node locality and exclude movable zones by this. While
this works it is an antipattern. I would rather use GFP_KERNEL |
__GFP_THISNODE | __GFP_NOWARN to be more in line with other gfp usage.

If for no other reason, we want to be able to work inside a normal
compaction context (compared to effectively GFP_NOIO which the above
implies). Also this looks like a sleepable context.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH v1 4/4] powernv/memtrace: don't abuse memory hot(un)plug infrastructure for memory allocations
From: David Hildenbrand @ 2020-11-03  9:29 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Wei Yang, linux-kernel, linux-mm, Paul Mackerras, Rashmica Gupta,
	linuxppc-dev, Andrew Morton, Mike Rapoport, Oscar Salvador
In-Reply-To: <20201103092309.GD21990@dhcp22.suse.cz>

On 03.11.20 10:23, Michal Hocko wrote:
> On Thu 29-10-20 17:27:18, David Hildenbrand wrote:
>> Let's use alloc_contig_pages() for allocating memory and remove the
>> linear mapping manually via arch_remove_linear_mapping(). Mark all pages
>> PG_offline, such that they will definitely not get touched - e.g.,
>> when hibernating. When freeing memory, try to revert what we did.
>>
>> The original idea was discussed in:
>>   https://lkml.kernel.org/r/48340e96-7e6b-736f-9e23-d3111b915b6e@redhat.com
>>
>> This is similar to CONFIG_DEBUG_PAGEALLOC handling on other
>> architectures, whereby only single pages are unmapped from the linear
>> mapping. Let's mimic what memory hot(un)plug would do with the linear
>> mapping.
>>
>> We now need MEMORY_HOTPLUG and CONTIG_ALLOC as dependencies.
>>
>> Simple test under QEMU TCG (10GB RAM, single NUMA node):
>>
>> sh-5.0# mount -t debugfs none /sys/kernel/debug/
>> sh-5.0# cat /sys/devices/system/memory/block_size_bytes
>> 40000000
>> sh-5.0# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
>> [   71.052836][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
>> sh-5.0# echo 0x80000000 > /sys/kernel/debug/powerpc/memtrace/enable
>> [   75.424302][  T356] radix-mmu: Mapped 0x0000000080000000-0x00000000c0000000 with 64.0 KiB pages
>> [   75.430549][  T356] memtrace: Freed trace memory back on node 0
>> [   75.604520][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
>> sh-5.0# echo 0x100000000 > /sys/kernel/debug/powerpc/memtrace/enable
>> [   80.418835][  T356] radix-mmu: Mapped 0x0000000080000000-0x0000000100000000 with 64.0 KiB pages
>> [   80.430493][  T356] memtrace: Freed trace memory back on node 0
>> [   80.433882][  T356] memtrace: Failed to allocate trace memory on node 0
>> sh-5.0# echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable
>> [   91.920158][  T356] memtrace: Allocated trace memory on node 0 at 0x0000000080000000
>>
>> Note 1: We currently won't be allocating from ZONE_MOVABLE - because our
>> 	pages are not movable. However, as we don't run with any memory
>> 	hot(un)plug mechanism around, we could make an exception to
>> 	increase the chance of allocations succeeding.
>>
>> Note 2: PG_reserved isn't sufficient. E.g., kernel_page_present() used
>> 	along PG_reserved in hibernation code will always return "true"
>> 	on powerpc, resulting in the pages getting touched. It's too
>> 	generic - e.g., indicates boot allocations.
>>
>> Note 3: For now, we keep using memory_block_size_bytes() as minimum
>> 	granularity. I'm not able to come up with a better guess (most
>> 	probably, doing it on a section basis could be possible).
>>
>> Suggested-by: Michal Hocko <mhocko@kernel.org>
>> Cc: Michael Ellerman <mpe@ellerman.id.au>
>> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
>> Cc: Paul Mackerras <paulus@samba.org>
>> Cc: Rashmica Gupta <rashmica.g@gmail.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Mike Rapoport <rppt@kernel.org>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
> 
> Thanks! This looks like a move into the right direction. I cannot really
> judge implementation details because I am not familiar with the code.
> I have only one tiny concern:
> [...]
>> -/* called with device_hotplug_lock held */
>> -static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
>> +static u64 memtrace_alloc_node(u32 nid, u64 size)
>>   {
>> -	const unsigned long start = PFN_PHYS(start_pfn);
>> -	const unsigned long size = PFN_PHYS(nr_pages);
>> +	const unsigned long nr_pages = PHYS_PFN(size);
>> +	unsigned long pfn, start_pfn;
>> +	struct page *page;
>>   
>> -	if (walk_memory_blocks(start, size, NULL, check_memblock_online))
>> -		return false;
>> -
>> -	walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
>> -			   change_memblock_state);
>> -
>> -	if (offline_pages(start_pfn, nr_pages)) {
>> -		walk_memory_blocks(start, size, (void *)MEM_ONLINE,
>> -				   change_memblock_state);
>> -		return false;
>> -	}
>> +	/*
>> +	 * Trace memory needs to be aligned to the size, which is guaranteed
>> +	 * by alloc_contig_pages().
>> +	 */
>> +	page = alloc_contig_pages(nr_pages, __GFP_THISNODE | __GFP_NOWARN,
>> +				  nid, NULL);
> 
> __GFP_THISNODE without other modifiers looks suspicious. I suspect you
> want to enfore node locality and exclude movable zones by this. While
> this works it is an antipattern. I would rather use GFP_KERNEL |
> __GFP_THISNODE | __GFP_NOWARN to be more in line with other gfp usage.

Agreed GFP_KERNEL should be the right thing to do here.

> 
> If for no other reasons we want to be able to work inside a normal
> compaction context (comparing to effectively GFP_NOIO which the above
> implies). Also this looks like a sleepable context.
> 

Yes it is. Thanks!

-- 
Thanks,

David / dhildenb


^ permalink raw reply

* [RESEND PATCH] kernel/watchdog: Fix watchdog_allowed_mask not used warning
From: Santosh Sivaraj @ 2020-11-03  9:32 UTC (permalink / raw)
  To: Linux Kernel, Michael Ellerman, linuxppc-dev
  Cc: pmladek, Santosh Sivaraj, bala24

Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled.

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
---

Original patch is here:
https://lore.kernel.org/lkml/20190807014417.9418-1-santosh@fossix.org/

A similar patch was also sent by Balamuruhan and reviewed by Petr.
https://lkml.org/lkml/2020/8/20/1030

 kernel/watchdog.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b22ad13..71109065bd8e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
 static int __read_mostly nmi_watchdog_available;
 
-static struct cpumask watchdog_allowed_mask __read_mostly;
-
 struct cpumask watchdog_cpumask __read_mostly;
 unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
 
@@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)
 int __read_mostly sysctl_softlockup_all_cpu_backtrace;
 #endif
 
+static struct cpumask watchdog_allowed_mask __read_mostly;
+
 /* Global variables, exported for sysctl */
 unsigned int __read_mostly softlockup_panic =
 			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
-- 
2.26.2


^ permalink raw reply related

* Re: [RESEND PATCH] kernel/watchdog: Fix watchdog_allowed_mask not used warning
From: Christophe Leroy @ 2020-11-03  9:55 UTC (permalink / raw)
  To: Santosh Sivaraj, Linux Kernel, Michael Ellerman, linuxppc-dev
  Cc: pmladek, Thomas Gleixner, bala24
In-Reply-To: <20201103093235.655665-1-santosh@fossix.org>



Le 03/11/2020 à 10:32, Santosh Sivaraj a écrit :
> Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled.
> 
> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>

I think maybe you should add a Fixes: tag (Towards https://github.com/linuxppc/linux/commit/7feeb9cd ?)

And copy Thomas (tglx)

Christophe

> ---
> 
> Original patch is here:
> https://lore.kernel.org/lkml/20190807014417.9418-1-santosh@fossix.org/
> 
> A similar patch was also sent by Balamuruhan and reviewed by Petr.
> https://lkml.org/lkml/2020/8/20/1030
> 
>   kernel/watchdog.c | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 5abb5b22ad13..71109065bd8e 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;
>   int __read_mostly watchdog_thresh = 10;
>   static int __read_mostly nmi_watchdog_available;
>   
> -static struct cpumask watchdog_allowed_mask __read_mostly;
> -
>   struct cpumask watchdog_cpumask __read_mostly;
>   unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
>   
> @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)
>   int __read_mostly sysctl_softlockup_all_cpu_backtrace;
>   #endif
>   
> +static struct cpumask watchdog_allowed_mask __read_mostly;
> +
>   /* Global variables, exported for sysctl */
>   unsigned int __read_mostly softlockup_panic =
>   			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
> 

^ permalink raw reply

* Re: [RESEND PATCH] kernel/watchdog: Fix watchdog_allowed_mask not used warning
From: Petr Mladek @ 2020-11-03 10:15 UTC (permalink / raw)
  To: Santosh Sivaraj
  Cc: Linux Kernel, bala24, Thomas Gleixner, linuxppc-dev,
	Andrew Morton
In-Reply-To: <214be993-96ec-82b0-b841-c80f7e7faefb@csgroup.eu>

On Tue 2020-11-03 10:55:11, Christophe Leroy wrote:
> 
> 
> Le 03/11/2020 à 10:32, Santosh Sivaraj a écrit :
> > Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled.
> > 
> > Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
> 
> I think maybe you should add a Fixes: tag (Towards https://github.com/linuxppc/linux/commit/7feeb9cd ?)
> 
> And copy Thomas (tglx)

And also Andrew, who usually takes patches in this area.

Finally, you should add my Reviewed-by that I provided for the first
resend. Sigh, I have missed that the important people were not in CC.

Best Regards,
Petr

> Christophe
> 
> > ---
> > 
> > Original patch is here:
> > https://lore.kernel.org/lkml/20190807014417.9418-1-santosh@fossix.org/
> > 
> > A similar patch was also sent by Balamuruhan and reviewed by Petr.
> > https://lkml.org/lkml/2020/8/20/1030
> > 
> >   kernel/watchdog.c | 4 ++--
> >   1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> > index 5abb5b22ad13..71109065bd8e 100644
> > --- a/kernel/watchdog.c
> > +++ b/kernel/watchdog.c
> > @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;
> >   int __read_mostly watchdog_thresh = 10;
> >   static int __read_mostly nmi_watchdog_available;
> > -static struct cpumask watchdog_allowed_mask __read_mostly;
> > -
> >   struct cpumask watchdog_cpumask __read_mostly;
> >   unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
> > @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)
> >   int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> >   #endif
> > +static struct cpumask watchdog_allowed_mask __read_mostly;
> > +
> >   /* Global variables, exported for sysctl */
> >   unsigned int __read_mostly softlockup_panic =
> >   			CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
> > 

^ permalink raw reply

* Re: [PATCH 11/11 v2] ftrace: Add recording of functions that caused recursion
From: Petr Mladek @ 2020-11-03 10:40 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Anton Vorontsov, linux-doc, Peter Zijlstra,
	Sebastian Andrzej Siewior, Kamalesh Babulal, James E.J. Bottomley,
	Guo Ren, H. Peter Anvin, live-patching, Miroslav Benes,
	Ingo Molnar, linux-s390, Joe Lawrence, Jonathan Corbet,
	Mauro Carvalho Chehab, Helge Deller, x86, linux-csky,
	Christian Borntraeger, Kees Cook, Vasily Gorbik, Heiko Carstens,
	Jiri Kosina, Borislav Petkov, Josh Poimboeuf, Thomas Gleixner,
	Tony Luck, linux-parisc, linux-kernel, Masami Hiramatsu,
	Colin Cross, Paul Mackerras, Andrew Morton, linuxppc-dev
In-Reply-To: <20201102120907.457ad2f7@gandalf.local.home>

On Mon 2020-11-02 12:09:07, Steven Rostedt wrote:
> On Mon, 2 Nov 2020 17:41:47 +0100
> Petr Mladek <pmladek@suse.com> wrote:
> 
> > On Fri 2020-10-30 17:31:53, Steven Rostedt wrote:
> > > From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
> > > 
> > > This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file
> > > "recursed_functions" all the functions that caused recursion while a
> > > callback to the function tracer was running.
> > >   
> > 
> > > --- /dev/null
> > > +++ b/kernel/trace/trace_recursion_record.c
> > > +	if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE)
> > > +		return;
> > > +
> > > +	for (i = index - 1; i >= 0; i--) {
> > > +		if (recursed_functions[i].ip == ip) {
> > > +			cached_function = ip;
> > > +			return;
> > > +		}
> > > +	}
> > > +
> > > +	cached_function = ip;
> > > +
> > > +	/*
> > > +	 * We only want to add a function if it hasn't been added before.
> > > +	 * Add to the current location before incrementing the count.
> > > +	 * If it fails to add, then increment the index (save in i)
> > > +	 * and try again.
> > > +	 */
> > > +	old = cmpxchg(&recursed_functions[index].ip, 0, ip);
> > > +	if (old != 0) {
> > > +		/* Did something else already added this for us? */
> > > +		if (old == ip)
> > > +			return;
> > > +		/* Try the next location (use i for the next index) */
> > > +		i = index + 1;  
> > 
> > What about
> > 
> > 		index++;
> > 
> > We basically want to run the code again with index + 1 limit.
> 
> But something else could update nr_records, and we want to use that if
> nr_records is greater than i.
> 
> Now, we could swap the use case, and have
> 
> 	int index = 0;
> 
> 	[..]
> 	i = atomic_read(&nr_records);
> 	if (i > index)
> 		index = i;
> 
> 	[..]
> 
> 		index++;
> 		goto again;
> 
> 
> > 
> > Maybe, it even does not make sense to check the array again
> > and we should just try to store the value into the next slot.
> 
> We do this dance to prevent duplicates.

I see.

My code was wrong. It reserved a slot for the new "ip" by cmpxchg
on nr_records. The "ip" was stored later so that any parallel
call might not see that it is a duplicate.

Your code reserves the slot by cmpxchg of "ip".
Any parallel call would fail to take the slot and see
the "ip" in the next iteration.

Best Regards,
Petr

^ permalink raw reply

* [patch V3 02/37] highmem: Remove unused functions
From: Thomas Gleixner @ 2020-11-03  9:27 UTC (permalink / raw)
  To: LKML
  Cc: Juri Lelli, linux-aio, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, virtualization, Ben Segall,
	Chris Mason, Huang Rui, Paul Mackerras, Gerd Hoffmann,
	Daniel Bristot de Oliveira, sparclinux, Vincent Chen,
	Christoph Hellwig, Vincent Guittot, Paul McKenney, Max Filippov,
	x86, Russell King, linux-csky, Ingo Molnar, David Airlie,
	VMware Graphics, Mel Gorman, nouveau, Dave Airlie, linux-snps-arc,
	Ben Skeggs, linux-xtensa, Arnd Bergmann, intel-gfx,
	Roland Scheidegger, Josef Bacik, Steven Rostedt, Linus Torvalds,
	Alexander Viro, spice-devel, David Sterba, Rodrigo Vivi,
	Dietmar Eggemann, linux-arm-kernel, Jani Nikula, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, linux-mm,
	Vineet Gupta, linux-mips, Christian Koenig, Benjamin LaHaise,
	Daniel Vetter, linux-fsdevel, Andrew Morton, linuxppc-dev,
	David S. Miller, linux-btrfs, Greentime Hu
In-Reply-To: <20201103092712.714480842@linutronix.de>

Nothing uses totalhigh_pages_dec() and totalhigh_pages_set().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
V3: New patch
---
 include/linux/highmem.h |   10 ----------
 1 file changed, 10 deletions(-)

--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -104,21 +104,11 @@ static inline void totalhigh_pages_inc(v
 	atomic_long_inc(&_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_dec(void)
-{
-	atomic_long_dec(&_totalhigh_pages);
-}
-
 static inline void totalhigh_pages_add(long count)
 {
 	atomic_long_add(count, &_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_set(long val)
-{
-	atomic_long_set(&_totalhigh_pages, val);
-}
-
 void kmap_flush_unused(void);
 
 struct page *kmap_to_page(void *addr);


^ permalink raw reply

* [patch V3 01/37] mm/highmem: Un-EXPORT __kmap_atomic_idx()
From: Thomas Gleixner @ 2020-11-03  9:27 UTC (permalink / raw)
  To: LKML
  Cc: Juri Lelli, linux-aio, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, virtualization, Ben Segall,
	Chris Mason, Huang Rui, Paul Mackerras, Gerd Hoffmann,
	Daniel Bristot de Oliveira, sparclinux, Vincent Chen,
	Christoph Hellwig, Vincent Guittot, Paul McKenney, Max Filippov,
	x86, Russell King, linux-csky, Ingo Molnar, David Airlie,
	VMware Graphics, Mel Gorman, nouveau, Dave Airlie, linux-snps-arc,
	Ben Skeggs, linux-xtensa, Arnd Bergmann, intel-gfx,
	Roland Scheidegger, Josef Bacik, Steven Rostedt, Linus Torvalds,
	Alexander Viro, spice-devel, David Sterba, Rodrigo Vivi,
	Dietmar Eggemann, linux-arm-kernel, Jani Nikula, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, linux-mm,
	Vineet Gupta, linux-mips, Christian Koenig, Benjamin LaHaise,
	Daniel Vetter, linux-fsdevel, Andrew Morton, linuxppc-dev,
	David S. Miller, linux-btrfs, Greentime Hu
In-Reply-To: <20201103092712.714480842@linutronix.de>

Nothing in modules can use that.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-mm@kvack.org
---
 mm/highmem.c |    2 --
 1 file changed, 2 deletions(-)

--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -108,8 +108,6 @@ static inline wait_queue_head_t *get_pkm
 atomic_long_t _totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(_totalhigh_pages);
 
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
 unsigned int nr_free_highpages (void)
 {
 	struct zone *zone;


^ permalink raw reply

* [patch V3 03/37] fs: Remove asm/kmap_types.h includes
From: Thomas Gleixner @ 2020-11-03  9:27 UTC (permalink / raw)
  To: LKML
  Cc: Juri Lelli, linux-aio, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, virtualization, Ben Segall, linux-mm,
	Huang Rui, Paul Mackerras, Gerd Hoffmann,
	Daniel Bristot de Oliveira, sparclinux, Vincent Chen,
	Christoph Hellwig, Vincent Guittot, Paul McKenney, Max Filippov,
	x86, Russell King, linux-csky, Ingo Molnar, David Airlie,
	VMware Graphics, Mel Gorman, nouveau, Dave Airlie, linux-snps-arc,
	Ben Skeggs, linux-xtensa, Arnd Bergmann, intel-gfx,
	Roland Scheidegger, Josef Bacik, Steven Rostedt, Linus Torvalds,
	Alexander Viro, spice-devel, David Sterba, Rodrigo Vivi,
	Dietmar Eggemann, linux-arm-kernel, Jani Nikula, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, Chris Mason,
	Vineet Gupta, linux-mips, Christian Koenig, Benjamin LaHaise,
	Daniel Vetter, linux-fsdevel, Andrew Morton, linuxppc-dev,
	David S. Miller, linux-btrfs, Greentime Hu
In-Reply-To: <20201103092712.714480842@linutronix.de>

Historical leftovers from the time when kmap() had fixed slots.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-aio@kvack.org
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: David Sterba <dsterba@suse.com>
Cc: linux-btrfs@vger.kernel.org
---
 fs/aio.c         |    1 -
 fs/btrfs/ctree.h |    1 -
 2 files changed, 2 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -43,7 +43,6 @@
 #include <linux/mount.h>
 #include <linux/pseudo_fs.h>
 
-#include <asm/kmap_types.h>
 #include <linux/uaccess.h>
 #include <linux/nospec.h>
 
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,7 +17,6 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <trace/events/btrfs.h>
-#include <asm/kmap_types.h>
 #include <asm/unaligned.h>
 #include <linux/pagemap.h>
 #include <linux/btrfs.h>


^ permalink raw reply

* [patch V3 00/37] mm/highmem: Preemptible variant of kmap_atomic & friends
From: Thomas Gleixner @ 2020-11-03  9:27 UTC (permalink / raw)
  To: LKML
  Cc: Juri Lelli, linux-aio, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, virtualization, Ben Segall,
	Chris Mason, Huang Rui, Paul Mackerras, Gerd Hoffmann,
	Daniel Bristot de Oliveira, sparclinux, Vincent Chen,
	Christoph Hellwig, Vincent Guittot, Paul McKenney, Max Filippov,
	x86, Russell King, linux-csky, Ingo Molnar, David Airlie,
	VMware Graphics, Mel Gorman, nouveau, Dave Airlie, linux-snps-arc,
	Ben Skeggs, linux-xtensa, Arnd Bergmann, intel-gfx,
	Roland Scheidegger, Josef Bacik, Steven Rostedt, Linus Torvalds,
	Alexander Viro, spice-devel, David Sterba, Rodrigo Vivi,
	Dietmar Eggemann, linux-arm-kernel, Jani Nikula, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, linux-mm,
	Vineet Gupta, linux-mips, Christian Koenig, Benjamin LaHaise,
	Daniel Vetter, linux-fsdevel, Andrew Morton, linuxppc-dev,
	David S. Miller, linux-btrfs, Greentime Hu

Following up to the discussion in:

  https://lore.kernel.org/r/20200914204209.256266093@linutronix.de

and the second version of this:

  https://lore.kernel.org/r/20201029221806.189523375@linutronix.de

this series provides a preemptible variant of kmap_atomic & related
interfaces.

This is achieved by:

 - Removing the RT dependency from migrate_disable/enable()

 - Consolidating all kmap atomic implementations in generic code including
   a useful version of the CONFIG_DEBUG_HIGHMEM which provides guard pages
   between the individual maps instead of just increasing the map size.

 - Switching from per CPU storage of the kmap index to a per task storage

 - Adding a pteval array to the per task storage which contains the ptevals
   of the currently active temporary kmaps

 - Adding context switch code which checks whether the outgoing or the
   incoming task has active temporary kmaps. If so, the outgoing task's
   kmaps are removed and the incoming task's kmaps are restored.

 - Adding new interfaces k[un]map_local*() which are not disabling
   preemption and can be called from any context (except NMI).

   Contrary to kmap() which provides preemptible and "persistent" mappings,
   these interfaces are meant to replace the temporary mappings provided by
   kmap_atomic*() today.

This makes it possible to get rid of conditional mapping choices and to have
preemptible short term mappings on 64bit which are today enforced to be
non-preemptible due to the highmem constraints. It clearly puts overhead on
the highmem users, but highmem is slow anyway.

This is not a wholesale conversion which makes kmap_atomic magically
preemptible because there might be usage sites which rely on the implicit
preempt disable. So this needs to be done on a case by case basis and the
call sites converted to kmap_local().

Note, that this is only tested on X86 and completely untested on all other
architectures (at least it compiles except on csky which does not compile
with the newest cross tools from kernel.org independent of this change).

The lot is available from

   git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git highmem

It is based on Peter Zijlstra's migrate disable branch which is close to being
merged into the tip tree, but still not finalized:

   git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git sched/migrate-disable

The series has the following parts:

    Patches  1 - 22: Consolidation work which is independent of the scheduler
    	       	     changes

		     79 files changed, 595 insertions(+), 1296 deletions(-)

    Patch   23:      Needs to be folded back into the sched/migrate-disable

    Patches 24 - 26: The preemptible kmap_local() implementation

    	       	     9 files changed, 283 insertions(+), 57 deletions(-)

    Patches 27 - 37: Cleanup of the less common kmap/io_map_atomic users

    	       	     19 files changed, 114 insertions(+), 256 deletions(-)

Vs. merging this pile:

If everyone agrees, I'd like to take the first part (1-22) through tip so
that the preemptible implementation can be sorted in tip once the scheduler
prerequisites are there. The initial cleanups (27-37) might have to wait if
there are conflicts vs. the drm/gpu tree. We'll see.

From what I can tell kmap_atomic() can be removed altogether and
completely replaced by kmap_local(). Most of the usage sites are trivial and
just doing memcpy(), memset() or trivial operations on the temporarily
mapped page. The interesting ones are those which do either conditional
stuff or have copy_.*_user_inatomic() inside. As shown with the crash and
drm/gpu cleanups this allows simplifying the code quite a bit.

Changes vs. V2:

  - Remove the migrate disable from kmap_local and only issue that when
    there is an actual highmem mapping. (Linus)
  - Reordered the series so the consolidation is upfront
  - Get rid of kmap_types.h and the associated cruft
  - Fixup documentation and add function documentation for kmap_*
  - Split out the internal implementation into a separate header
  - More cleanups - removal of unused functions
  - Replace a few of the less frequently used kmap_atomic and
    io_mapping_map_atomic variants and remove those interfaces.

Thanks,

	tglx
---
 arch/alpha/include/asm/kmap_types.h                   |   15 
 arch/arc/include/asm/kmap_types.h                     |   14 
 arch/arm/include/asm/kmap_types.h                     |   10 
 arch/arm/mm/highmem.c                                 |  121 -------
 arch/ia64/include/asm/kmap_types.h                    |   13 
 arch/microblaze/mm/highmem.c                          |   78 ----
 arch/mips/include/asm/kmap_types.h                    |   13 
 arch/nds32/mm/highmem.c                               |   48 --
 arch/parisc/include/asm/kmap_types.h                  |   13 
 arch/powerpc/include/asm/kmap_types.h                 |   13 
 arch/powerpc/mm/highmem.c                             |   67 ----
 arch/sh/include/asm/kmap_types.h                      |   15 
 arch/sparc/include/asm/kmap_types.h                   |   11 
 arch/sparc/mm/highmem.c                               |  115 -------
 arch/um/include/asm/kmap_types.h                      |   13 
 arch/x86/include/asm/kmap_types.h                     |   13 
 b/Documentation/driver-api/io-mapping.rst             |   92 ++---
 b/arch/arc/Kconfig                                    |    1 
 b/arch/arc/include/asm/highmem.h                      |   26 +
 b/arch/arc/mm/highmem.c                               |   54 ---
 b/arch/arm/Kconfig                                    |    1 
 b/arch/arm/include/asm/fixmap.h                       |    4 
 b/arch/arm/include/asm/highmem.h                      |   33 +-
 b/arch/arm/mm/Makefile                                |    1 
 b/arch/arm/mm/cache-feroceon-l2.c                     |    6 
 b/arch/arm/mm/cache-xsc3l2.c                          |    4 
 b/arch/csky/Kconfig                                   |    1 
 b/arch/csky/include/asm/fixmap.h                      |    4 
 b/arch/csky/include/asm/highmem.h                     |    6 
 b/arch/csky/mm/highmem.c                              |   75 ----
 b/arch/microblaze/Kconfig                             |    1 
 b/arch/microblaze/include/asm/fixmap.h                |    4 
 b/arch/microblaze/include/asm/highmem.h               |    6 
 b/arch/microblaze/mm/Makefile                         |    1 
 b/arch/microblaze/mm/init.c                           |    6 
 b/arch/mips/Kconfig                                   |    1 
 b/arch/mips/include/asm/fixmap.h                      |    4 
 b/arch/mips/include/asm/highmem.h                     |    6 
 b/arch/mips/kernel/crash_dump.c                       |   42 --
 b/arch/mips/mm/highmem.c                              |   77 ----
 b/arch/mips/mm/init.c                                 |    4 
 b/arch/nds32/Kconfig.cpu                              |    1 
 b/arch/nds32/include/asm/fixmap.h                     |    4 
 b/arch/nds32/include/asm/highmem.h                    |   22 -
 b/arch/nds32/mm/Makefile                              |    1 
 b/arch/openrisc/mm/init.c                             |    1 
 b/arch/openrisc/mm/ioremap.c                          |    1 
 b/arch/powerpc/Kconfig                                |    1 
 b/arch/powerpc/include/asm/fixmap.h                   |    4 
 b/arch/powerpc/include/asm/highmem.h                  |    7 
 b/arch/powerpc/mm/Makefile                            |    1 
 b/arch/powerpc/mm/mem.c                               |    7 
 b/arch/sh/include/asm/fixmap.h                        |    8 
 b/arch/sh/mm/init.c                                   |    8 
 b/arch/sparc/Kconfig                                  |    1 
 b/arch/sparc/include/asm/highmem.h                    |    8 
 b/arch/sparc/include/asm/vaddrs.h                     |    4 
 b/arch/sparc/mm/Makefile                              |    3 
 b/arch/sparc/mm/srmmu.c                               |    2 
 b/arch/um/include/asm/fixmap.h                        |    1 
 b/arch/x86/Kconfig                                    |    3 
 b/arch/x86/include/asm/fixmap.h                       |    5 
 b/arch/x86/include/asm/highmem.h                      |   13 
 b/arch/x86/include/asm/iomap.h                        |   13 
 b/arch/x86/include/asm/paravirt_types.h               |    1 
 b/arch/x86/kernel/crash_dump_32.c                     |   48 --
 b/arch/x86/mm/highmem_32.c                            |   59 ---
 b/arch/x86/mm/init_32.c                               |   15 
 b/arch/x86/mm/iomap_32.c                              |   57 ---
 b/arch/xtensa/Kconfig                                 |    1 
 b/arch/xtensa/include/asm/fixmap.h                    |    4 
 b/arch/xtensa/include/asm/highmem.h                   |   12 
 b/arch/xtensa/mm/highmem.c                            |   46 --
 b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c      |    7 
 b/drivers/gpu/drm/i915/i915_gem.c                     |   40 --
 b/drivers/gpu/drm/i915/selftests/i915_gem.c           |    4 
 b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c       |    8 
 b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h |    8 
 b/drivers/gpu/drm/qxl/qxl_image.c                     |   18 -
 b/drivers/gpu/drm/qxl/qxl_ioctl.c                     |   27 -
 b/drivers/gpu/drm/qxl/qxl_object.c                    |   12 
 b/drivers/gpu/drm/qxl/qxl_object.h                    |    4 
 b/drivers/gpu/drm/qxl/qxl_release.c                   |    4 
 b/drivers/gpu/drm/ttm/ttm_bo_util.c                   |   20 -
 b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c                |   30 -
 b/fs/aio.c                                            |    1 
 b/fs/btrfs/ctree.h                                    |    1 
 b/include/asm-generic/Kbuild                          |    2 
 b/include/asm-generic/kmap_size.h                     |   12 
 b/include/linux/highmem-internal.h                    |  210 ++++++++++++
 b/include/linux/highmem.h                             |  294 ++++++------------
 b/include/linux/io-mapping.h                          |   28 -
 b/include/linux/kernel.h                              |   21 -
 b/include/linux/preempt.h                             |   38 --
 b/include/linux/sched.h                               |   11 
 b/kernel/entry/common.c                               |    2 
 b/kernel/fork.c                                       |    1 
 b/kernel/sched/core.c                                 |   63 +++
 b/kernel/sched/sched.h                                |    4 
 b/lib/smp_processor_id.c                              |    2 
 b/mm/Kconfig                                          |    3 
 b/mm/highmem.c                                        |  255 ++++++++++++++-
 include/asm-generic/kmap_types.h                      |   11 
 103 files changed, 959 insertions(+), 1576 deletions(-)

^ permalink raw reply

* [patch V3 04/37] sh/highmem: Remove all traces of unused cruft
From: Thomas Gleixner @ 2020-11-03  9:27 UTC (permalink / raw)
  To: LKML
  Cc: Juri Lelli, linux-aio, Peter Zijlstra, Sebastian Andrzej Siewior,
	Joonas Lahtinen, dri-devel, virtualization, Ben Segall,
	Chris Mason, Huang Rui, Paul Mackerras, Gerd Hoffmann,
	Daniel Bristot de Oliveira, sparclinux, Vincent Chen,
	Christoph Hellwig, Vincent Guittot, Paul McKenney, Max Filippov,
	x86, Russell King, linux-csky, Ingo Molnar, David Airlie,
	VMware Graphics, Mel Gorman, nouveau, Dave Airlie, linux-snps-arc,
	Ben Skeggs, linux-xtensa, Arnd Bergmann, intel-gfx,
	Roland Scheidegger, Josef Bacik, Steven Rostedt, Linus Torvalds,
	Alexander Viro, spice-devel, David Sterba, Rodrigo Vivi,
	Dietmar Eggemann, linux-arm-kernel, Jani Nikula, Chris Zankel,
	Michal Simek, Thomas Bogendoerfer, Nick Hu, linux-mm,
	Vineet Gupta, linux-mips, Christian Koenig, Benjamin LaHaise,
	Daniel Vetter, linux-fsdevel, Andrew Morton, linuxppc-dev,
	David S. Miller, linux-btrfs, Greentime Hu
In-Reply-To: <20201103092712.714480842@linutronix.de>

For whatever reason, SH has highmem bits all over the place but does
not enable it via Kconfig. Remove the bitrot.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/sh/include/asm/fixmap.h     |    8 --------
 arch/sh/include/asm/kmap_types.h |   15 ---------------
 arch/sh/mm/init.c                |    8 --------
 3 files changed, 31 deletions(-)

--- a/arch/sh/include/asm/fixmap.h
+++ b/arch/sh/include/asm/fixmap.h
@@ -13,9 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <asm/page.h>
-#ifdef CONFIG_HIGHMEM
-#include <asm/kmap_types.h>
-#endif
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -53,11 +50,6 @@ enum fixed_addresses {
 	FIX_CMAP_BEGIN,
 	FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1,
 
-#ifdef CONFIG_HIGHMEM
-	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
-	FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
-#endif
-
 #ifdef CONFIG_IOREMAP_FIXED
 	/*
 	 * FIX_IOREMAP entries are useful for mapping physical address
--- a/arch/sh/include/asm/kmap_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __SH_KMAP_TYPES_H
-#define __SH_KMAP_TYPES_H
-
-/* Dummy header just to define km_type. */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include <asm-generic/kmap_types.h>
-
-#undef __WITH_KM_FENCE
-
-#endif
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -362,9 +362,6 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 	pr_info("virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#ifdef CONFIG_HIGHMEM
-		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#endif
 		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
 		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB) (cached)\n"
 #ifdef CONFIG_UNCACHED_MAPPING
@@ -376,11 +373,6 @@ void __init mem_init(void)
 		FIXADDR_START, FIXADDR_TOP,
 		(FIXADDR_TOP - FIXADDR_START) >> 10,
 
-#ifdef CONFIG_HIGHMEM
-		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-		(LAST_PKMAP*PAGE_SIZE) >> 10,
-#endif
-
 		(unsigned long)VMALLOC_START, VMALLOC_END,
 		(VMALLOC_END - VMALLOC_START) >> 20,
 


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox