LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 4.4 10/12] powerpc/5200: dts: fix memory node unit name
From: Sasha Levin @ 2021-11-09 22:24 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sasha Levin, Rob Herring, pawel.moll, ijc+devicetree, devicetree,
	robh+dt, paulus, galak, mark.rutland, Anatolij Gustschin,
	linuxppc-dev
In-Reply-To: <20211109222426.1236575-1-sashal@kernel.org>

From: Anatolij Gustschin <agust@denx.de>

[ Upstream commit aed2886a5e9ffc8269a4220bff1e9e030d3d2eb1 ]

Fixes build warnings:
Warning (unit_address_vs_reg): /memory: node has a reg or ranges property, but no unit name

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20211013220532.24759-4-agust@denx.de
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/boot/dts/charon.dts    | 2 +-
 arch/powerpc/boot/dts/digsy_mtc.dts | 2 +-
 arch/powerpc/boot/dts/lite5200.dts  | 2 +-
 arch/powerpc/boot/dts/lite5200b.dts | 2 +-
 arch/powerpc/boot/dts/media5200.dts | 2 +-
 arch/powerpc/boot/dts/mpc5200b.dtsi | 2 +-
 arch/powerpc/boot/dts/o2d.dts       | 2 +-
 arch/powerpc/boot/dts/o2d.dtsi      | 2 +-
 arch/powerpc/boot/dts/o2dnt2.dts    | 2 +-
 arch/powerpc/boot/dts/o3dnt.dts     | 2 +-
 arch/powerpc/boot/dts/pcm032.dts    | 2 +-
 arch/powerpc/boot/dts/tqm5200.dts   | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/boot/dts/charon.dts b/arch/powerpc/boot/dts/charon.dts
index 0e00e508eaa6a..1c8fe20752e6a 100644
--- a/arch/powerpc/boot/dts/charon.dts
+++ b/arch/powerpc/boot/dts/charon.dts
@@ -39,7 +39,7 @@
 		};
 	};
 
-	memory {
+	memory@0 {
 		device_type = "memory";
 		reg = <0x00000000 0x08000000>;	// 128MB
 	};
diff --git a/arch/powerpc/boot/dts/digsy_mtc.dts b/arch/powerpc/boot/dts/digsy_mtc.dts
index 955bff629df3c..bf511255f3ae8 100644
--- a/arch/powerpc/boot/dts/digsy_mtc.dts
+++ b/arch/powerpc/boot/dts/digsy_mtc.dts
@@ -20,7 +20,7 @@
 	model = "intercontrol,digsy-mtc";
 	compatible = "intercontrol,digsy-mtc";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x02000000>;	// 32MB
 	};
 
diff --git a/arch/powerpc/boot/dts/lite5200.dts b/arch/powerpc/boot/dts/lite5200.dts
index 179a1785d6454..18d137a3393f0 100644
--- a/arch/powerpc/boot/dts/lite5200.dts
+++ b/arch/powerpc/boot/dts/lite5200.dts
@@ -36,7 +36,7 @@
 		};
 	};
 
-	memory {
+	memory@0 {
 		device_type = "memory";
 		reg = <0x00000000 0x04000000>;	// 64MB
 	};
diff --git a/arch/powerpc/boot/dts/lite5200b.dts b/arch/powerpc/boot/dts/lite5200b.dts
index 5abb46c5cc951..29419cf81e044 100644
--- a/arch/powerpc/boot/dts/lite5200b.dts
+++ b/arch/powerpc/boot/dts/lite5200b.dts
@@ -35,7 +35,7 @@
 		led4 { gpios = <&gpio_simple 2 1>; };
 	};
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x10000000>;	// 256MB
 	};
 
diff --git a/arch/powerpc/boot/dts/media5200.dts b/arch/powerpc/boot/dts/media5200.dts
index b5413cb85f134..3d57463bc49da 100644
--- a/arch/powerpc/boot/dts/media5200.dts
+++ b/arch/powerpc/boot/dts/media5200.dts
@@ -36,7 +36,7 @@
 		};
 	};
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x08000000>;	// 128MB RAM
 	};
 
diff --git a/arch/powerpc/boot/dts/mpc5200b.dtsi b/arch/powerpc/boot/dts/mpc5200b.dtsi
index 969b2200b2f97..ecfba675b5611 100644
--- a/arch/powerpc/boot/dts/mpc5200b.dtsi
+++ b/arch/powerpc/boot/dts/mpc5200b.dtsi
@@ -37,7 +37,7 @@
 		};
 	};
 
-	memory: memory {
+	memory: memory@0 {
 		device_type = "memory";
 		reg = <0x00000000 0x04000000>;	// 64MB
 	};
diff --git a/arch/powerpc/boot/dts/o2d.dts b/arch/powerpc/boot/dts/o2d.dts
index 9f6dd4d889b32..5a676e8141caf 100644
--- a/arch/powerpc/boot/dts/o2d.dts
+++ b/arch/powerpc/boot/dts/o2d.dts
@@ -16,7 +16,7 @@
 	model = "ifm,o2d";
 	compatible = "ifm,o2d";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x08000000>;  // 128MB
 	};
 
diff --git a/arch/powerpc/boot/dts/o2d.dtsi b/arch/powerpc/boot/dts/o2d.dtsi
index cf073e693f24d..1b4df5f64b580 100644
--- a/arch/powerpc/boot/dts/o2d.dtsi
+++ b/arch/powerpc/boot/dts/o2d.dtsi
@@ -23,7 +23,7 @@
 	model = "ifm,o2d";
 	compatible = "ifm,o2d";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x04000000>;	// 64MB
 	};
 
diff --git a/arch/powerpc/boot/dts/o2dnt2.dts b/arch/powerpc/boot/dts/o2dnt2.dts
index a0f5b97a4f06e..5184c461a205f 100644
--- a/arch/powerpc/boot/dts/o2dnt2.dts
+++ b/arch/powerpc/boot/dts/o2dnt2.dts
@@ -16,7 +16,7 @@
 	model = "ifm,o2dnt2";
 	compatible = "ifm,o2d";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x08000000>;  // 128MB
 	};
 
diff --git a/arch/powerpc/boot/dts/o3dnt.dts b/arch/powerpc/boot/dts/o3dnt.dts
index acce49326491b..045b901719245 100644
--- a/arch/powerpc/boot/dts/o3dnt.dts
+++ b/arch/powerpc/boot/dts/o3dnt.dts
@@ -16,7 +16,7 @@
 	model = "ifm,o3dnt";
 	compatible = "ifm,o2d";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x04000000>;  // 64MB
 	};
 
diff --git a/arch/powerpc/boot/dts/pcm032.dts b/arch/powerpc/boot/dts/pcm032.dts
index 96b139bf50e9c..ac3f53c1a1f5b 100644
--- a/arch/powerpc/boot/dts/pcm032.dts
+++ b/arch/powerpc/boot/dts/pcm032.dts
@@ -26,7 +26,7 @@
 	model = "phytec,pcm032";
 	compatible = "phytec,pcm032";
 
-	memory {
+	memory@0 {
 		reg = <0x00000000 0x08000000>;	// 128MB
 	};
 
diff --git a/arch/powerpc/boot/dts/tqm5200.dts b/arch/powerpc/boot/dts/tqm5200.dts
index 1db07f6cf133c..68b9e8240fb5b 100644
--- a/arch/powerpc/boot/dts/tqm5200.dts
+++ b/arch/powerpc/boot/dts/tqm5200.dts
@@ -36,7 +36,7 @@
 		};
 	};
 
-	memory {
+	memory@0 {
 		device_type = "memory";
 		reg = <0x00000000 0x04000000>;	// 64MB
 	};
-- 
2.33.0


^ permalink raw reply related

* [PATCH AUTOSEL 4.4 12/12] powerpc/dcr: Use cmplwi instead of 3-argument cmpli
From: Sasha Levin @ 2021-11-09 22:24 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Sasha Levin, Nick Desaulniers, paulus, linuxppc-dev
In-Reply-To: <20211109222426.1236575-1-sashal@kernel.org>

From: Michael Ellerman <mpe@ellerman.id.au>

[ Upstream commit fef071be57dc43679a32d5b0e6ee176d6f12e9f2 ]

In dcr-low.S we use cmpli with three arguments, instead of four
arguments as defined in the ISA:

	cmpli	cr0,r3,1024

This appears to be a PPC440-ism, looking at the "PPC440x5 CPU Core
User’s Manual" it shows cmpli having no L field, but implied to be 0 due
to the core being 32-bit. It mentions that the ISA defines four
arguments and recommends using cmplwi.

It also corresponds to the old POWER instruction set, which had no L
field there, a reserved bit instead.

dcr-low.S is only built 32-bit, because it is only built when
DCR_NATIVE=y, which is only selected by 40x and 44x. Looking at the
generated code (with gcc/gas) we see cmplwi as expected.

Although gas is happy with the 3-argument version when building for
32-bit, the LLVM assembler is not and errors out with:

  arch/powerpc/sysdev/dcr-low.S:27:10: error: invalid operand for instruction
   cmpli 0,%r3,1024; ...
           ^

Switch to the cmplwi extended opcode, which avoids any confusion when
reading the ISA, fixes the issue with the LLVM assembler, and also means
the code could be built 64-bit in future (though that's very unlikely).

Reported-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
BugLink: https://github.com/ClangBuiltLinux/linux/issues/1419
Link: https://lore.kernel.org/r/20211014024424.528848-1-mpe@ellerman.id.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/powerpc/sysdev/dcr-low.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/dcr-low.S b/arch/powerpc/sysdev/dcr-low.S
index d3098ef1404a2..3943d19d5f63b 100644
--- a/arch/powerpc/sysdev/dcr-low.S
+++ b/arch/powerpc/sysdev/dcr-low.S
@@ -14,7 +14,7 @@
 #include <asm/bug.h>
 
 #define DCR_ACCESS_PROLOG(table) \
-	cmpli	cr0,r3,1024;	 \
+	cmplwi	cr0,r3,1024;	 \
 	rlwinm  r3,r3,4,18,27;   \
 	lis     r5,table@h;      \
 	ori     r5,r5,table@l;   \
-- 
2.33.0


^ permalink raw reply related

* Re: [PASEMI] Nemo board doesn't recognize any ATA disks with the pci-v5.16 updates
From: Krzysztof Wilczyński @ 2021-11-09 22:40 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Jens Axboe, Darren Stevens, mad skateman,
	linux-pci@vger.kernel.org, Damien Le Moal, Olof Johansson,
	R.T.Dickinson, Christian Zigotzky,
	bhelgaas@google.com >> Bjorn Helgaas, Matthew Leaman,
	linuxppc-dev, Christian Zigotzky
In-Reply-To: <20211109165848.GA1155989@bhelgaas>

[+CC Adding Jens and Damien to get their opinion about the problem at hand]

Hello Jens and Damien,

Sorry to bother both of you, but we are having a problem that most
definitely requires someone with an extensive expertise in storage,
as per the quoted message from Christian below:

> > > The Nemo board [1] doesn't recognize any ATA disks with the pci-v5.16
> > > updates [2].
> > >
> > > Error messages:
> > >
> > > ata4.00: gc timeout cmd 0xec
> > > ata4.00: failed to IDENTIFY (I/O error, error_mask=0x4)
> > > ata1.00: gc timeout cmd 0xec
> > > ata1.00: failed to IDENTIFY (I/O error, error_mask=0x4)
> > > ata3.00: gc timeout cmd 0xec
> > > ata3.00: failed to IDENTIFY (I/O error, error_mask=0x4)

The error message is also not very detailed and we aren't really sure what
the issue coming from the PCI sub-system might be causing or leading to
this.

> > >
> > > I was able to revert the new pci-v5.16 updates [2]. After a new compiling,
> > > the kernel recognize all ATA disks correctly.
> > >
> > > Could you please check the pci-v5.16 updates [2]?
> > >
> > > Please find attached the kernel config.
> > >
> > > Thanks,
> > > Christian
> > >
> > > [1] https://en.wikipedia.org/wiki/AmigaOne_X1000
> > > [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c5c62ddf88c34bc83b66e4ac9beb2bb0e1887d4
> 
> Sorry for the breakage, and thank you very much for the report.  Can
> you please collect the complete dmesg logs before and after the
> pci-v5.16 changes and the "sudo lspci -vv" output from before the
> changes?
> 
> You can attach them at https://bugzilla.kernel.org if you don't have
> a better place to put them.
> 
> You could attach the kernel config there, too, since it didn't make it
> to the mailing list (vger may discard them -- see
> http://vger.kernel.org/majordomo-info.html).

Bjorn and I looked at which commits that went with a recent Pull Request
from us might be causing this, but we are a little bit at loss, and were
hoping that you could give us a hand in troubleshooting this.

Thank you in advance!

	Krzysztof



^ permalink raw reply

* Re: [PASEMI] Nemo board doesn't recognize any ATA disks with the pci-v5.16 updates
From: Arnd Bergmann @ 2021-11-09 23:05 UTC (permalink / raw)
  To: Krzysztof Wilczyński
  Cc: Jens Axboe, Darren Stevens, R.T.Dickinson,
	linux-pci@vger.kernel.org, Damien Le Moal,
	bhelgaas@google.com >> Bjorn Helgaas, Bjorn Helgaas,
	mad skateman, Christian Zigotzky, Olof Johansson, Matthew Leaman,
	linuxppc-dev, Christian Zigotzky
In-Reply-To: <YYr4x1xWfptXRmqt@rocinante>

On Tue, Nov 9, 2021 at 11:40 PM Krzysztof Wilczyński <kw@linux.com> wrote:
> >
> > You could attach the kernel config there, too, since it didn't make it
> > to the mailing list (vger may discard them -- see
> > http://vger.kernel.org/majordomo-info.html).
>
> Bjorn and I looked at which commits that went with a recent Pull Request
> from us might be causing this, but we are a little bit at loss, and were
> hoping that you could give us a hand in troubleshooting this.

For reference, these are the patches in that branch that touch any
interesting files,
as most of the contents are for pci-controller drivers that are not used on
powerpc at all:

$ git log --no-merges --oneline 512b7931ad05..dda4b381f05d
arch/powerpc/ drivers/of/  drivers/pci/*.[ch]  include/linux/
acd61ffb2f16 PCI: Add ACS quirk for Pericom PI7C9X2G switches
978fd0056e19 PCI: of: Allow matching of an interrupt-map local to a PCI device
041284181226 of/irq: Allow matching of an interrupt-map local to an
interrupt controller
0ab8d0f6ae3f irqdomain: Make of_phandle_args_to_fwspec() generally available
5ec0a6fcb60e PCI: Do not enable AtomicOps on VFs
7a41ae80bdcb PCI: pci-bridge-emul: Fix emulation of W1C bits
fd1ae23b495b PCI: Prefer 'unsigned int' over bare 'unsigned'
ff5d3bb6e16d PCI: Remove redundant 'rc' initialization
3331325c6347 PCI/VPD: Use pci_read_vpd_any() in pci_vpd_size()
e1b0d0bb2032 PCI: Re-enable Downstream Port LTR after reset or hotplug
ac8e3cef588c PCI/sysfs: Explicitly show first MSI IRQ for 'irq'
88dee3b0efe4 PCI: Remove unused pci_pool wrappers
b5f9c644eb1b PCI: Remove struct pci_dev->driver
2a4d9408c9e8 PCI: Use to_pci_driver() instead of pci_dev->driver
4141127c44a9 powerpc/eeh: Use to_pci_driver() instead of pci_dev->driver
f9a6c8ad4922 PCI/ERR: Reduce compile time for CONFIG_PCIEAER=n
43e85554d4ed xen/pcifront: Use to_pci_driver() instead of pci_dev->driver
34ab316d7287 xen/pcifront: Drop pcifront_common_process() tests of pcidev, pdrv
9f37ab0412eb PCI/switchtec: Add check of event support
5a72431ec318 powerpc/eeh: Use dev_driver_string() instead of struct
pci_dev->driver->name
ae232f0970ea PCI: Drop pci_device_probe() test of !pci_dev->driver
097d9d414433 PCI: Drop pci_device_remove() test of pci_dev->driver
8e9028b3790d PCI: Return NULL for to_pci_driver(NULL)
357df2fc0066 PCI: Use unsigned to match sscanf("%x") in pci_dev_str_match_path()
bf2928c7a284 PCI/VPD: Add pci_read/write_vpd_any()
b2105b9f39b5 PCI: Correct misspelled and remove duplicated words
7c3855c423b1 PCI: Coalesce host bridge contiguous apertures
e0f7b1922358 PCI: Use kstrtobool() directly, sans strtobool() wrapper
36f354ec7bf9 PCI/sysfs: Return -EINVAL consistently from "store" functions
95e83e219d68 PCI/sysfs: Check CAP_SYS_ADMIN before parsing user input
af9d82626c8f PCI/ACPI: Remove OSC_PCI_SUPPORT_MASKS and OSC_PCI_CONTROL_MASKS
9a0a1417d3bb PCI: Tidy comments
06dc660e6eb8 PCI: Rename pcibios_add_device() to pcibios_device_add()
e3f4bd3462f6 PCI: Mark Atheros QCA6174 to avoid bus reset
3a19407913e8 PCI/P2PDMA: Apply bus offset correctly in DMA address calculation

Out of these, I agree that most of them seem harmless, these would
be the ones I'd try to look at more closely, or maybe revert for testing:

978fd0056e19 PCI: of: Allow matching of an interrupt-map local to a PCI device
041284181226 of/irq: Allow matching of an interrupt-map local to an
interrupt controller
e1b0d0bb2032 PCI: Re-enable Downstream Port LTR after reset or hotplug
7c3855c423b1 PCI: Coalesce host bridge contiguous apertures
3a19407913e8 PCI/P2PDMA: Apply bus offset correctly in DMA address calculation

       Arnd

^ permalink raw reply

* Re: [PASEMI] Nemo board doesn't recognize any ATA disks with the pci-v5.16 updates
From: Krzysztof Wilczyński @ 2021-11-09 23:18 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Jens Axboe, Darren Stevens, R.T.Dickinson,
	linux-pci@vger.kernel.org, Damien Le Moal,
	bhelgaas@google.com >> Bjorn Helgaas, Bjorn Helgaas,
	mad skateman, Christian Zigotzky, Olof Johansson, Matthew Leaman,
	linuxppc-dev, Robert Święcki, Christian Zigotzky
In-Reply-To: <CAK8P3a3fB7KmqWcaenn04WMps=jucV8wOZs=AZcNFHR+o+Bvyg@mail.gmail.com>

[+CC Adding Robert for visibility]

Hi Arnd,

Thank you looking at this!  Much appreciated.

> > > You could attach the kernel config there, too, since it didn't make it
> > > to the mailing list (vger may discard them -- see
> > > http://vger.kernel.org/majordomo-info.html).
> >
> > Bjorn and I looked at which commits that went with a recent Pull Request
> > from us might be causing this, but we are a little bit at loss, and were
> > hoping that you could give us a hand in troubleshooting this.
> 
> For reference, these are the patches in that branch that touch any
> interesting files,
> as most of the contents are for pci-controller drivers that are not used on
> powerpc at all:
> 
> $ git log --no-merges --oneline 512b7931ad05..dda4b381f05d
> arch/powerpc/ drivers/of/  drivers/pci/*.[ch]  include/linux/
> acd61ffb2f16 PCI: Add ACS quirk for Pericom PI7C9X2G switches
> 978fd0056e19 PCI: of: Allow matching of an interrupt-map local to a PCI device
> 041284181226 of/irq: Allow matching of an interrupt-map local to an
> interrupt controller
> 0ab8d0f6ae3f irqdomain: Make of_phandle_args_to_fwspec() generally available
> 5ec0a6fcb60e PCI: Do not enable AtomicOps on VFs
> 7a41ae80bdcb PCI: pci-bridge-emul: Fix emulation of W1C bits
> fd1ae23b495b PCI: Prefer 'unsigned int' over bare 'unsigned'
> ff5d3bb6e16d PCI: Remove redundant 'rc' initialization
> 3331325c6347 PCI/VPD: Use pci_read_vpd_any() in pci_vpd_size()
> e1b0d0bb2032 PCI: Re-enable Downstream Port LTR after reset or hotplug
> ac8e3cef588c PCI/sysfs: Explicitly show first MSI IRQ for 'irq'
> 88dee3b0efe4 PCI: Remove unused pci_pool wrappers
> b5f9c644eb1b PCI: Remove struct pci_dev->driver
> 2a4d9408c9e8 PCI: Use to_pci_driver() instead of pci_dev->driver
> 4141127c44a9 powerpc/eeh: Use to_pci_driver() instead of pci_dev->driver
> f9a6c8ad4922 PCI/ERR: Reduce compile time for CONFIG_PCIEAER=n
> 43e85554d4ed xen/pcifront: Use to_pci_driver() instead of pci_dev->driver
> 34ab316d7287 xen/pcifront: Drop pcifront_common_process() tests of pcidev, pdrv
> 9f37ab0412eb PCI/switchtec: Add check of event support
> 5a72431ec318 powerpc/eeh: Use dev_driver_string() instead of struct
> pci_dev->driver->name
> ae232f0970ea PCI: Drop pci_device_probe() test of !pci_dev->driver
> 097d9d414433 PCI: Drop pci_device_remove() test of pci_dev->driver
> 8e9028b3790d PCI: Return NULL for to_pci_driver(NULL)
> 357df2fc0066 PCI: Use unsigned to match sscanf("%x") in pci_dev_str_match_path()
> bf2928c7a284 PCI/VPD: Add pci_read/write_vpd_any()
> b2105b9f39b5 PCI: Correct misspelled and remove duplicated words
> 7c3855c423b1 PCI: Coalesce host bridge contiguous apertures
> e0f7b1922358 PCI: Use kstrtobool() directly, sans strtobool() wrapper
> 36f354ec7bf9 PCI/sysfs: Return -EINVAL consistently from "store" functions
> 95e83e219d68 PCI/sysfs: Check CAP_SYS_ADMIN before parsing user input
> af9d82626c8f PCI/ACPI: Remove OSC_PCI_SUPPORT_MASKS and OSC_PCI_CONTROL_MASKS
> 9a0a1417d3bb PCI: Tidy comments
> 06dc660e6eb8 PCI: Rename pcibios_add_device() to pcibios_device_add()
> e3f4bd3462f6 PCI: Mark Atheros QCA6174 to avoid bus reset
> 3a19407913e8 PCI/P2PDMA: Apply bus offset correctly in DMA address calculation
> 
> Out of these, I agree that most of them seem harmless, these would
> be the ones I'd try to look at more closely, or maybe revert for testing:
> 
> 978fd0056e19 PCI: of: Allow matching of an interrupt-map local to a PCI device
> 041284181226 of/irq: Allow matching of an interrupt-map local to an
> interrupt controller
> e1b0d0bb2032 PCI: Re-enable Downstream Port LTR after reset or hotplug
> 7c3855c423b1 PCI: Coalesce host bridge contiguous apertures
> 3a19407913e8 PCI/P2PDMA: Apply bus offset correctly in DMA address calculation

Robert would you be able build a kernel without the patches Arnd singled
out as potential curlprits?  Might be expidite some troubleshooting saving
a lot of time doing bisect.

I wonder if this will help you with the following problem:
  https://lore.kernel.org/linux-pci/CAP145pjO9zdGgutHP=of0H+L1=nSz097zf73i7ZYm2-NWuwHhQ@mail.gmail.com/

	Krzysztof

^ permalink raw reply

* [PATCH] powerpc: Fix sigset_t copy
From: Finn Thain @ 2021-11-09 23:47 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Christophe Leroy
  Cc: linuxppc-dev, linux-kernel, Christopher M. Riedl

From: Christophe Leroy <christophe.leroy@csgroup.eu>

The conversion from __copy_from_user() to __get_user() introduced a
regression in __get_user_sigset() in v5.13. The bug was subsequently
copied and pasted in unsafe_get_user_sigset().

The regression was reported by users of the Xorg packages distributed in
Debian/powerpc --

    "The symptoms are that the fb screen goes blank, with the backlight
    remaining on and no errors logged in /var/log; wdm (or startx) run
    with no effect (I tried logging in in the blind, with no effect).
    And they are hard to kill, requiring 'kill -KILL ...'"

Fix the regression by casting the __get_user() assignment lvalue to u64
so that the entire struct gets copied.

Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christopher M. Riedl <cmr@bluescreens.de>
Link: https://lore.kernel.org/linuxppc-dev/FEtBUOuFPMN4zJy4bIOqz6C4xoliCbTxS7VtMKD6UZkbvEbycUceRgGAd7e9-trRdwVN3hWAbQi0qrNx8Zgn8niTQf2KPVdw-W35czDIaeQ=@protonmail.com/
Fixes: 887f3ceb51cd ("powerpc/signal32: Convert do_setcontext[_tm]() to user access block")
Fixes: d3ccc9781560 ("powerpc/signal: Use __get_user() to copy sigset_t")
Reported-and-tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Finn Thain <fthain@linux-m68k.org>
---
Christophe, I hope this change is the one you wanted to see upstream (?).
If it is acceptable please add your signed-off-by tag.
---
 arch/powerpc/kernel/signal.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 1f07317964e4..44e736b88e91 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -23,10 +23,10 @@ static inline int __get_user_sigset(sigset_t *dst, const sigset_t __user *src)
 {
 	BUILD_BUG_ON(sizeof(sigset_t) != sizeof(u64));
 
-	return __get_user(dst->sig[0], (u64 __user *)&src->sig[0]);
+	return __get_user(*(u64 *)&dst->sig[0], (u64 __user *)&src->sig[0]);
 }
 #define unsafe_get_user_sigset(dst, src, label) \
-	unsafe_get_user((dst)->sig[0], (u64 __user *)&(src)->sig[0], label)
+	unsafe_get_user(*(u64 *)&(dst)->sig[0], (u64 __user *)&(src)->sig[0], label)
 
 #ifdef CONFIG_VSX
 extern unsigned long copy_vsx_to_user(void __user *to,
-- 
2.26.3


^ permalink raw reply related

* [PATCH] ASoC: imx-hdmi: add put_device() after of_find_device_by_node()
From: cgel.zte @ 2021-11-10  0:29 UTC (permalink / raw)
  To: nicoleotsuka
  Cc: alsa-devel, linuxppc-dev, linux-kernel, Xiubo.Lee, festevam,
	s.hauer, tiwai, lgirdwood, perex, broonie, linux-imx, kernel,
	Ye Guojin, shawnguo, shengjiu.wang, Zeal Robot, linux-arm-kernel

From: Ye Guojin <ye.guojin@zte.com.cn>

This was found by coccicheck:
./sound/soc/fsl/imx-hdmi.c,209,1-7,ERROR  missing put_device; call
of_find_device_by_node on line 119, but without a corresponding object
release within this function.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Ye Guojin <ye.guojin@zte.com.cn>
---
 sound/soc/fsl/imx-hdmi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sound/soc/fsl/imx-hdmi.c b/sound/soc/fsl/imx-hdmi.c
index f10359a28800..929f69b758af 100644
--- a/sound/soc/fsl/imx-hdmi.c
+++ b/sound/soc/fsl/imx-hdmi.c
@@ -145,6 +145,8 @@ static int imx_hdmi_probe(struct platform_device *pdev)
 	data->dai.capture_only = false;
 	data->dai.init = imx_hdmi_init;
 
+	put_device(&cpu_pdev->dev);
+
 	if (of_node_name_eq(cpu_np, "sai")) {
 		data->cpu_priv.sysclk_id[1] = FSL_SAI_CLK_MAST1;
 		data->cpu_priv.sysclk_id[0] = FSL_SAI_CLK_MAST1;
-- 
2.25.1


^ permalink raw reply related

* [PATCH v7 1/5] powerpc: Allow clearing and restoring registers independent of saved breakpoint state
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr
In-Reply-To: <20211110003717.1150965-1-jniethe5@gmail.com>

For the coming temporary mm used for instruction patching, the
breakpoint registers need to be cleared to prevent them from
accidentally being triggered. As soon as the patching is done, the
breakpoints will be restored. The breakpoint state is stored in the per
cpu variable current_brk[]. Add a pause_breakpoints() function which will
clear the breakpoint registers without touching the state in
current_bkr[]. Add a pair function unpause_breakpoints() which will move
the state in current_brk[] back to the registers.

Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v7: New to series
---
 arch/powerpc/include/asm/debug.h |  2 ++
 arch/powerpc/kernel/process.c    | 36 +++++++++++++++++++++++++++++---
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86a14736c76c..83f2dc3785e8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -46,6 +46,8 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
+void pause_breakpoints(void);
+void unpause_breakpoints(void);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 406d7ee9e322..22ed72430683 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -688,6 +688,7 @@ DEFINE_INTERRUPT_HANDLER(do_break)
 
 static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk[HBP_NUM_MAX]);
 
+
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 /*
  * Set the debug registers back to their default "safe" values.
@@ -865,10 +866,8 @@ static inline int set_breakpoint_8xx(struct arch_hw_breakpoint *brk)
 	return 0;
 }
 
-void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
+static void ____set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 {
-	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
-
 	if (dawr_enabled())
 		// Power8 or later
 		set_dawr(nr, brk);
@@ -882,6 +881,12 @@ void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 		WARN_ON_ONCE(1);
 }
 
+void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
+{
+	memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
+	____set_breakpoint(nr, brk);
+}
+
 /* Check if we have DAWR or DABR hardware */
 bool ppc_breakpoint_available(void)
 {
@@ -894,6 +899,31 @@ bool ppc_breakpoint_available(void)
 }
 EXPORT_SYMBOL_GPL(ppc_breakpoint_available);
 
+/* Disable the breakpoint in hardware without touching current_brk[] */
+void pause_breakpoints(void)
+{
+	struct arch_hw_breakpoint brk = {0};
+	int i;
+
+	if (!ppc_breakpoint_available())
+		return;
+
+	for (i = 0; i < nr_wp_slots(); i++)
+		____set_breakpoint(i, &brk);
+}
+
+/* Renable the breakpoint in hardware from current_brk[] */
+void unpause_breakpoints(void)
+{
+	int i;
+
+	if (!ppc_breakpoint_available())
+		return;
+
+	for (i = 0; i < nr_wp_slots(); i++)
+		____set_breakpoint(i, this_cpu_ptr(&current_brk[i]));
+}
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 
 static inline bool tm_enabled(struct task_struct *tsk)
-- 
2.25.1


^ permalink raw reply related

* [PATCH v7 0/5] Use per-CPU temporary mappings for patching on Radix MMU
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr

This is a revision of Chris' series to introduces a per cpu temporary mm to be
used for patching with strict rwx on radix mmus.

The previous version of the series is here:
https://lore.kernel.org/linuxppc-dev/20210911022904.30962-1-cmr@bluescreens.de/

v7: - introduce helper functions for clearing and restoring breakpoint
      registers when using the temporary mm
    - use a new patch_instruction_mm() function instead of needing repeated
      conditionals and a struct to save state to work within
      do_patch_instruction() 
    - include a ptesync after setting the pte

Christopher M. Riedl (4):
  powerpc/64s: Introduce temporary mm for Radix MMU
  powerpc: Rework and improve STRICT_KERNEL_RWX patching
  powerpc: Use WARN_ON and fix check in poking_init
  powerpc/64s: Initialize and use a temporary mm for patching on Radix

Jordan Niethe (1):
  powerpc: Allow clearing and restoring registers independent of saved
    breakpoint state

 arch/powerpc/include/asm/debug.h |   2 +
 arch/powerpc/kernel/process.c    |  36 ++++++-
 arch/powerpc/lib/code-patching.c | 162 +++++++++++++++++++++++++++----
 3 files changed, 176 insertions(+), 24 deletions(-)

-- 
2.25.1


^ permalink raw reply

* [PATCH v7 2/5] powerpc/64s: Introduce temporary mm for Radix MMU
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr
In-Reply-To: <20211110003717.1150965-1-jniethe5@gmail.com>

From: "Christopher M. Riedl" <cmr@bluescreens.de>

x86 supports the notion of a temporary mm which restricts access to
temporary PTEs to a single CPU. A temporary mm is useful for situations
where a CPU needs to perform sensitive operations (such as patching a
STRICT_KERNEL_RWX kernel) requiring temporary mappings without exposing
said mappings to other CPUs. Another benefit is that other CPU TLBs do
not need to be flushed when the temporary mm is torn down.

Mappings in the temporary mm can be set in the userspace portion of the
address-space.

Interrupts must be disabled while the temporary mm is in use. HW
breakpoints, which may have been set by userspace as watchpoints on
addresses now within the temporary mm, are saved and disabled when
loading the temporary mm. The HW breakpoints are restored when unloading
the temporary mm. All HW breakpoints are indiscriminately disabled while
the temporary mm is in use - this may include breakpoints set by perf.

Based on x86 implementation:

commit cefa929c034e
("x86/mm: Introduce temporary mm structs")

Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v7: - use breakpoint_pause()/breakpoint_unpause()
    - simplify the temp mm struct, don't need init_temp_mm()
---
 arch/powerpc/lib/code-patching.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index c5ed98823835..29a30c3068ff 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -17,6 +17,9 @@
 #include <asm/code-patching.h>
 #include <asm/setup.h>
 #include <asm/inst.h>
+#include <asm/mmu_context.h>
+#include <asm/debug.h>
+#include <asm/tlb.h>
 
 static int __patch_instruction(u32 *exec_addr, struct ppc_inst instr, u32 *patch_addr)
 {
@@ -45,6 +48,32 @@ int raw_patch_instruction(u32 *addr, struct ppc_inst instr)
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
+
+struct temp_mm_state {
+	struct mm_struct *mm;
+};
+
+static inline struct temp_mm_state start_using_temp_mm(struct mm_struct *mm)
+{
+	struct temp_mm_state temp_state;
+
+	lockdep_assert_irqs_disabled();
+	temp_state.mm = current->active_mm;
+	switch_mm_irqs_off(current->active_mm, mm, current);
+
+	WARN_ON(!mm_is_thread_local(mm));
+
+	pause_breakpoints();
+	return temp_state;
+}
+
+static inline void stop_using_temp_mm(struct temp_mm_state prev_state)
+{
+	lockdep_assert_irqs_disabled();
+	switch_mm_irqs_off(current->active_mm, prev_state.mm, current);
+	unpause_breakpoints();
+}
+
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 
 static int text_area_cpu_up(unsigned int cpu)
-- 
2.25.1


^ permalink raw reply related

* [PATCH v7 3/5] powerpc: Rework and improve STRICT_KERNEL_RWX patching
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr
In-Reply-To: <20211110003717.1150965-1-jniethe5@gmail.com>

From: "Christopher M. Riedl" <cmr@bluescreens.de>

Rework code-patching with STRICT_KERNEL_RWX to prepare for a later patch
which uses a temporary mm for patching under the Book3s64 Radix MMU.
Make improvements by adding a WARN_ON when the patchsite doesn't match
after patching and return the error from __patch_instruction() properly.

Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v7: still pass addr to map_patch_area()
---
 arch/powerpc/lib/code-patching.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 29a30c3068ff..d586bf9c7581 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -75,6 +75,7 @@ static inline void stop_using_temp_mm(struct temp_mm_state prev_state)
 }
 
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
+static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
 
 static int text_area_cpu_up(unsigned int cpu)
 {
@@ -87,6 +88,7 @@ static int text_area_cpu_up(unsigned int cpu)
 		return -1;
 	}
 	this_cpu_write(text_poke_area, area);
+	this_cpu_write(cpu_patching_addr, (unsigned long)area->addr);
 
 	return 0;
 }
@@ -172,11 +174,10 @@ static inline int unmap_patch_area(unsigned long addr)
 
 static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 {
-	int err;
+	int err, rc = 0;
 	u32 *patch_addr = NULL;
 	unsigned long flags;
 	unsigned long text_poke_addr;
-	unsigned long kaddr = (unsigned long)addr;
 
 	/*
 	 * During early early boot patch_instruction is called
@@ -188,15 +189,13 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 
 	local_irq_save(flags);
 
-	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-	if (map_patch_area(addr, text_poke_addr)) {
-		err = -1;
+	text_poke_addr = __this_cpu_read(cpu_patching_addr);
+	err = map_patch_area(addr, text_poke_addr);
+	if (err)
 		goto out;
-	}
-
-	patch_addr = (u32 *)(text_poke_addr + (kaddr & ~PAGE_MASK));
 
-	__patch_instruction(addr, instr, patch_addr);
+	patch_addr = (u32 *)(text_poke_addr | offset_in_page(addr));
+	rc = __patch_instruction(addr, instr, patch_addr);
 
 	err = unmap_patch_area(text_poke_addr);
 	if (err)
@@ -204,8 +203,9 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 
 out:
 	local_irq_restore(flags);
+	WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
 
-	return err;
+	return rc ? rc : err;
 }
 #else /* !CONFIG_STRICT_KERNEL_RWX */
 
-- 
2.25.1


^ permalink raw reply related

* [PATCH v7 4/5] powerpc: Use WARN_ON and fix check in poking_init
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr
In-Reply-To: <20211110003717.1150965-1-jniethe5@gmail.com>

From: "Christopher M. Riedl" <cmr@bluescreens.de>

The latest kernel docs list BUG_ON() as 'deprecated' and that they
should be replaced with WARN_ON() (or pr_warn()) when possible. The
BUG_ON() in poking_init() warrants a WARN_ON() rather than a pr_warn()
since the error condition is deemed "unreachable".

Also take this opportunity to fix the failure check in the WARN_ON():
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, ...) returns a positive integer
on success and a negative integer on failure.

Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v7: no change
---
 arch/powerpc/lib/code-patching.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index d586bf9c7581..aa466e4930ec 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -99,16 +99,11 @@ static int text_area_cpu_down(unsigned int cpu)
 	return 0;
 }
 
-/*
- * Although BUG_ON() is rude, in this case it should only happen if ENOMEM, and
- * we judge it as being preferable to a kernel that will crash later when
- * someone tries to use patch_instruction().
- */
 void __init poking_init(void)
 {
-	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+	WARN_ON(cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
 		"powerpc/text_poke:online", text_area_cpu_up,
-		text_area_cpu_down));
+		text_area_cpu_down) < 0);
 }
 
 /*
-- 
2.25.1


^ permalink raw reply related

* [PATCH v7 5/5] powerpc/64s: Initialize and use a temporary mm for patching on Radix
From: Jordan Niethe @ 2021-11-10  0:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Jordan Niethe, cmr
In-Reply-To: <20211110003717.1150965-1-jniethe5@gmail.com>

From: "Christopher M. Riedl" <cmr@bluescreens.de>

When code patching a STRICT_KERNEL_RWX kernel the page containing the
address to be patched is temporarily mapped as writeable. Currently, a
per-cpu vmalloc patch area is used for this purpose. While the patch
area is per-cpu, the temporary page mapping is inserted into the kernel
page tables for the duration of patching. The mapping is exposed to CPUs
other than the patching CPU - this is undesirable from a hardening
perspective. Use a temporary mm instead which keeps the mapping local to
the CPU doing the patching.

Use the `poking_init` init hook to prepare a temporary mm and patching
address. Initialize the temporary mm by copying the init mm. Choose a
randomized patching address inside the temporary mm userspace address
space. The patching address is randomized between PAGE_SIZE and
DEFAULT_MAP_WINDOW-PAGE_SIZE.

Bits of entropy with 64K page size on BOOK3S_64:

        bits of entropy = log2(DEFAULT_MAP_WINDOW_USER64 / PAGE_SIZE)

        PAGE_SIZE=64K, DEFAULT_MAP_WINDOW_USER64=128TB
        bits of entropy = log2(128TB / 64K)
	bits of entropy = 31

The upper limit is DEFAULT_MAP_WINDOW due to how the Book3s64 Hash MMU
operates - by default the space above DEFAULT_MAP_WINDOW is not
available. Currently the Hash MMU does not use a temporary mm so
technically this upper limit isn't necessary; however, a larger
randomization range does not further "harden" this overall approach and
future work may introduce patching with a temporary mm on Hash as well.

Randomization occurs only once during initialization at boot for each
possible CPU in the system.

Introduce a new function, patch_instruction_mm(), to perform the
patching with a temporary mapping with write permissions at
patching_addr. Map the page with PAGE_KERNEL to set EAA[0] for the PTE
which ignores the AMR (so no need to unlock/lock KUAP) according to
PowerISA v3.0b Figure 35 on Radix.

Based on x86 implementation:

commit 4fc19708b165
("x86/alternatives: Initialize temporary mm for patching")

and:

commit b3fd8e83ada0
("x86/alternatives: Use temporary mm for text poking")

Signed-off-by: Christopher M. Riedl <cmr@bluescreens.de>
Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
---
v7: - Change to patch_instruction_mm() instead of map_patch_mm() and
       unmap_patch_mm()
    - include ptesync
---
 arch/powerpc/lib/code-patching.c | 106 +++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index aa466e4930ec..7722dec4a914 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -11,6 +11,7 @@
 #include <linux/cpuhotplug.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/random.h>
 
 #include <asm/tlbflush.h>
 #include <asm/page.h>
@@ -76,6 +77,7 @@ static inline void stop_using_temp_mm(struct temp_mm_state prev_state)
 
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
 static DEFINE_PER_CPU(unsigned long, cpu_patching_addr);
+static DEFINE_PER_CPU(struct mm_struct *, cpu_patching_mm);
 
 static int text_area_cpu_up(unsigned int cpu)
 {
@@ -99,8 +101,48 @@ static int text_area_cpu_down(unsigned int cpu)
 	return 0;
 }
 
+static __always_inline void __poking_init_temp_mm(void)
+{
+	int cpu;
+	spinlock_t *ptl;
+	pte_t *ptep;
+	struct mm_struct *patching_mm;
+	unsigned long patching_addr;
+
+	for_each_possible_cpu(cpu) {
+		patching_mm = copy_init_mm();
+		WARN_ON(!patching_mm);
+		per_cpu(cpu_patching_mm, cpu) = patching_mm;
+
+		/*
+		 * Choose a randomized, page-aligned address from the range:
+		 * [PAGE_SIZE, DEFAULT_MAP_WINDOW - PAGE_SIZE] The lower
+		 * address bound is PAGE_SIZE to avoid the zero-page.  The
+		 * upper address bound is DEFAULT_MAP_WINDOW - PAGE_SIZE to
+		 * stay under DEFAULT_MAP_WINDOW with the Book3s64 Hash MMU.
+		 */
+		patching_addr = PAGE_SIZE + ((get_random_long() & PAGE_MASK) %
+					     (DEFAULT_MAP_WINDOW - 2 * PAGE_SIZE));
+		per_cpu(cpu_patching_addr, cpu) = patching_addr;
+
+		/*
+		 * PTE allocation uses GFP_KERNEL which means we need to
+		 * pre-allocate the PTE here because we cannot do the
+		 * allocation during patching when IRQs are disabled.
+		 */
+		ptep = get_locked_pte(patching_mm, patching_addr, &ptl);
+		WARN_ON(!ptep);
+		pte_unmap_unlock(ptep, ptl);
+	}
+}
+
 void __init poking_init(void)
 {
+	if (radix_enabled()) {
+		__poking_init_temp_mm();
+		return;
+	}
+
 	WARN_ON(cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
 		"powerpc/text_poke:online", text_area_cpu_up,
 		text_area_cpu_down) < 0);
@@ -167,6 +209,57 @@ static inline int unmap_patch_area(unsigned long addr)
 	return 0;
 }
 
+/*
+ * This can be called for kernel text or a module.
+ */
+static int patch_instruction_mm(u32 *addr, struct ppc_inst instr)
+{
+	struct mm_struct *patching_mm = __this_cpu_read(cpu_patching_mm);
+	unsigned long text_poke_addr;
+	u32 *patch_addr = NULL;
+	struct temp_mm_state prev;
+	unsigned long flags;
+	struct page *page;
+	spinlock_t *ptl;
+	int rc;
+	pte_t *ptep;
+
+	text_poke_addr = __this_cpu_read(cpu_patching_addr);
+
+	local_irq_save(flags);
+
+	if (is_vmalloc_or_module_addr(addr))
+		page = vmalloc_to_page(addr);
+	else
+		page = virt_to_page(addr);
+
+	ptep = get_locked_pte(patching_mm, text_poke_addr, &ptl);
+	if (unlikely(!ptep)) {
+		pr_warn("map patch: failed to allocate pte for patching\n");
+		return -1;
+	}
+
+	set_pte_at(patching_mm, text_poke_addr, ptep, pte_mkdirty(mk_pte(page, PAGE_KERNEL)));
+	asm volatile("ptesync": : :"memory");
+
+	prev = start_using_temp_mm(patching_mm);
+
+	patch_addr = (u32 *)(text_poke_addr | offset_in_page(addr));
+	rc = __patch_instruction(addr, instr, patch_addr);
+
+	pte_clear(patching_mm, text_poke_addr, ptep);
+
+	local_flush_tlb_mm(patching_mm);
+
+	stop_using_temp_mm(prev);
+	pte_unmap_unlock(ptep, ptl);
+
+	local_irq_restore(flags);
+	WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
+
+	return rc;
+}
+
 static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 {
 	int err, rc = 0;
@@ -175,16 +268,19 @@ static int do_patch_instruction(u32 *addr, struct ppc_inst instr)
 	unsigned long text_poke_addr;
 
 	/*
-	 * During early early boot patch_instruction is called
-	 * when text_poke_area is not ready, but we still need
-	 * to allow patching. We just do the plain old patching
+	 * During early boot patch_instruction is called when the
+	 * patching_mm/text_poke_area is not ready, but we still need to allow
+	 * patching. We just do the plain old patching.
 	 */
-	if (!this_cpu_read(text_poke_area))
+	text_poke_addr = __this_cpu_read(cpu_patching_addr);
+	if (!text_poke_addr)
 		return raw_patch_instruction(addr, instr);
 
+	if (radix_enabled())
+		return patch_instruction_mm(addr, instr);
+
 	local_irq_save(flags);
 
-	text_poke_addr = __this_cpu_read(cpu_patching_addr);
 	err = map_patch_area(addr, text_poke_addr);
 	if (err)
 		goto out;
-- 
2.25.1


^ permalink raw reply related

* [PATCH v3 0/4] powerpc: watchdog fixes
From: Nicholas Piggin @ 2021-11-10  2:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin

These are some watchdog fixes and improvements, in particular a
deadlock between the wd_smp_lock and console lock when the watchdog
fires, found by Laurent.

Thanks,
Nick

Since v2:
- Fix a false positive warning in patch 1 found by Laurent.
- Move a comment change hunk to the correct patch.
- Drop the patch that removed the unstuck backtrace which is considered
  useful.

Since v1:
- Fixes noticed by Laurent in v1.
- Correct the description of the ABBA deadlock I wrote incorrectly in
  v1.
- Made several other improvements (patches 2,4,5).

Nicholas Piggin (4):
  powerpc/watchdog: Fix missed watchdog reset due to memory ordering
    race
  powerpc/watchdog: tighten non-atomic read-modify-write access
  powerpc/watchdog: Avoid holding wd_smp_lock over printk and
    smp_send_nmi_ipi
  powerpc/watchdog: read TB close to where it is used

 arch/powerpc/kernel/watchdog.c | 182 ++++++++++++++++++++++++++-------
 1 file changed, 147 insertions(+), 35 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH v3 1/4] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race
From: Nicholas Piggin @ 2021-11-10  2:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin
In-Reply-To: <20211110025056.2084347-1-npiggin@gmail.com>

It is possible for all CPUs to miss the pending cpumask becoming clear,
and then nobody resetting it, which will cause the lockup detector to
stop working. It will eventually expire, but watchdog_smp_panic will
avoid doing anything if the pending mask is clear and it will never be
reset.

Order the cpumask clear vs the subsequent test to close this race.

Add an extra check for an empty pending mask when the watchdog fires and
finds its bit still clear, to try to catch any other possible races or
bugs here and keep the watchdog working. The extra test in
arch_touch_nmi_watchdog is required to prevent the new warning from
firing off.

Debugged-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 41 +++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index f9ea0e5357f9..3c60872b6a2c 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -135,6 +135,10 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 {
 	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
 	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+	/*
+	 * See wd_smp_clear_cpu_pending()
+	 */
+	smp_mb();
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		wd_smp_last_reset_tb = tb;
 		cpumask_andnot(&wd_smp_cpus_pending,
@@ -215,13 +219,44 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 
 			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
 			wd_smp_unlock(&flags);
+		} else {
+			/*
+			 * The last CPU to clear pending should have reset the
+			 * watchdog so we generally should not find it empty
+			 * here if our CPU was clear. However it could happen
+			 * due to a rare race with another CPU taking the
+			 * last CPU out of the mask concurrently.
+			 *
+			 * We can't add a warning for it. But just in case
+			 * there is a problem with the watchdog that is causing
+			 * the mask to not be reset, try to kick it along here.
+			 */
+			if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
+				goto none_pending;
 		}
 		return;
 	}
+
 	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+
+	/*
+	 * Order the store to clear pending with the load(s) to check all
+	 * words in the pending mask to check they are all empty. This orders
+	 * with the same barrier on another CPU. This prevents two CPUs
+	 * clearing the last 2 pending bits, but neither seeing the other's
+	 * store when checking if the mask is empty, and missing an empty
+	 * mask, which ends with a false positive.
+	 */
+	smp_mb();
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		unsigned long flags;
 
+none_pending:
+		/*
+		 * Double check under lock because more than one CPU could see
+		 * a clear mask with the lockless check after clearing their
+		 * pending bits.
+		 */
 		wd_smp_lock(&flags);
 		if (cpumask_empty(&wd_smp_cpus_pending)) {
 			wd_smp_last_reset_tb = tb;
@@ -312,8 +347,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
-	u64 tb = get_tb();
+	u64 tb;
 
+	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+		return;
+
+	tb = get_tb();
 	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
 		per_cpu(wd_timer_tb, cpu) = tb;
 		wd_smp_clear_cpu_pending(cpu, tb);
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 2/4] powerpc/watchdog: tighten non-atomic read-modify-write access
From: Nicholas Piggin @ 2021-11-10  2:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin
In-Reply-To: <20211110025056.2084347-1-npiggin@gmail.com>

Most updates to wd_smp_cpus_pending are under lock except the watchdog
interrupt bit clear.

This can race with non-atomic RMW updates to the mask under lock, which
can happen in two instances:

Firstly, if another CPU detects this one is stuck, removes it from the
mask, mask becomes empty and is re-filled with non-atomic stores. This
is okay because it would re-fill the mask with this CPU's bit clear
anyway (because this CPU is now stuck), so it doesn't matter that the
bit clear update got "lost". Add a comment for this.

Secondly, if another CPU detects a different CPU is stuck and removes it
from the pending mask with a non-atomic store to bytes which also
include the bit of this CPU. This case can result in the bit clear being
lost and the end result being the bit is set. This should be so rare it
hardly matters, but to make things simpler to reason about just avoid
the non-atomic access for that case.

Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 36 ++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 3c60872b6a2c..668ea1c13bef 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -131,10 +131,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
-static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
+static bool set_cpu_stuck(int cpu, u64 tb)
 {
-	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
-	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
+	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
 	/*
 	 * See wd_smp_clear_cpu_pending()
 	 */
@@ -144,11 +144,9 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 		cpumask_andnot(&wd_smp_cpus_pending,
 				&wd_cpus_enabled,
 				&wd_smp_cpus_stuck);
+		return true;
 	}
-}
-static void set_cpu_stuck(int cpu, u64 tb)
-{
-	set_cpumask_stuck(cpumask_of(cpu), tb);
+	return false;
 }
 
 static void watchdog_smp_panic(int cpu, u64 tb)
@@ -177,15 +175,17 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 		 * get a backtrace on all of them anyway.
 		 */
 		for_each_cpu(c, &wd_smp_cpus_pending) {
+			bool empty;
 			if (c == cpu)
 				continue;
+			/* Take the stuck CPUs out of the watch group */
+			empty = set_cpu_stuck(c, tb);
 			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+			if (empty)
+				break;
 		}
 	}
 
-	/* Take the stuck CPUs out of the watch group */
-	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
-
 	wd_smp_unlock(&flags);
 
 	if (sysctl_hardlockup_all_cpu_backtrace)
@@ -237,6 +237,22 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 		return;
 	}
 
+	/*
+	 * All other updates to wd_smp_cpus_pending are performed under
+	 * wd_smp_lock. All of them are atomic except the case where the
+	 * mask becomes empty and is reset. This will not happen here because
+	 * cpu was tested to be in the bitmap (above), and a CPU only clears
+	 * its own bit. _Except_ in the case where another CPU has detected a
+	 * hard lockup on our CPU and takes us out of the pending mask. So in
+	 * normal operation there will be no race here, no problem.
+	 *
+	 * In the lockup case, this atomic clear-bit vs a store that refills
+	 * other bits in the accessed word wll not be a problem. The bit clear
+	 * is atomic so it will not cause the store to get lost, and the store
+	 * will never set this bit so it will not overwrite the bit clear. The
+	 * only way for a stuck CPU to return to the pending bitmap is to
+	 * become unstuck itself.
+	 */
 	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
 
 	/*
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 3/4] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi
From: Nicholas Piggin @ 2021-11-10  2:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin
In-Reply-To: <20211110025056.2084347-1-npiggin@gmail.com>

There is a deadlock with the console_owner lock and the wd_smp_lock:

CPU x takes the console_owner lock
CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
CPU x takes a soft-NMI interrupt, detects deadlock, spins on __wd_smp_lock
CPU y detects deadlock, tries to print something and spins on console_owner
-> deadlock

Change the watchdog locking scheme so wd_smp_lock protects the watchdog
internal data, but "reporting" (printing, issuing NMI IPIs, taking any
action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
a problem but can not take the reporting lock, it just returns because
something else is already reporting. It will try again at some point.

Typically hard lockup watchdog report usefulness is not impacted due to
failure to spewing a large enough amount of data in as short a time as
possible, but by messages getting garbled.

Laurent debugged this and found the deadlock, and this patch is based on
his general approach to avoid expensive operations while holding the lock.
With the addition of the reporting exclusion.

Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
[np: rework to add reporting exclusion update changelog]
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 93 +++++++++++++++++++++++++++-------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 668ea1c13bef..1b11c4b1c79e 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -85,10 +85,36 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
 
 /* SMP checker bits */
 static unsigned long __wd_smp_lock;
+static unsigned long __wd_reporting;
 static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
+/*
+ * Try to take the exclusive watchdog action / NMI IPI / printing lock.
+ * wd_smp_lock must be held. If this fails, we should return and wait
+ * for the watchdog to kick in again (or another CPU to trigger it).
+ *
+ * Importantly, if hardlockup_panic is set, wd_try_report failure should
+ * not delay the panic, because whichever other CPU is reporting will
+ * call panic.
+ */
+static bool wd_try_report(void)
+{
+	if (__wd_reporting)
+		return false;
+	__wd_reporting = 1;
+	return true;
+}
+
+/* End printing after successful wd_try_report. wd_smp_lock not required. */
+static void wd_end_reporting(void)
+{
+	smp_mb(); /* End printing "critical section" */
+	WARN_ON_ONCE(__wd_reporting == 0);
+	WRITE_ONCE(__wd_reporting, 0);
+}
+
 static inline void wd_smp_lock(unsigned long *flags)
 {
 	/*
@@ -151,6 +177,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
 
 static void watchdog_smp_panic(int cpu, u64 tb)
 {
+	static cpumask_t wd_smp_cpus_ipi; // protected by reporting
 	unsigned long flags;
 	int c;
 
@@ -160,11 +187,26 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 		goto out;
 	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
 		goto out;
-	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
+	if (!wd_try_report())
 		goto out;
+	for_each_online_cpu(c) {
+		if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
+			continue;
+		if (c == cpu)
+			continue; // should not happen
+
+		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
+		if (set_cpu_stuck(c, tb))
+			break;
+	}
+	if (cpumask_empty(&wd_smp_cpus_ipi)) {
+		wd_end_reporting();
+		goto out;
+	}
+	wd_smp_unlock(&flags);
 
 	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
-		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
+		 cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
 	pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
 		 cpu, tb, wd_smp_last_reset_tb,
 		 tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);
@@ -174,26 +216,20 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 		 * Try to trigger the stuck CPUs, unless we are going to
 		 * get a backtrace on all of them anyway.
 		 */
-		for_each_cpu(c, &wd_smp_cpus_pending) {
-			bool empty;
-			if (c == cpu)
-				continue;
-			/* Take the stuck CPUs out of the watch group */
-			empty = set_cpu_stuck(c, tb);
+		for_each_cpu(c, &wd_smp_cpus_ipi) {
 			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
-			if (empty)
-				break;
+			__cpumask_clear_cpu(c, &wd_smp_cpus_ipi);
 		}
-	}
-
-	wd_smp_unlock(&flags);
-
-	if (sysctl_hardlockup_all_cpu_backtrace)
+	} else {
 		trigger_allbutself_cpu_backtrace();
+		cpumask_clear(&wd_smp_cpus_ipi);
+	}
 
 	if (hardlockup_panic)
 		nmi_panic(NULL, "Hard LOCKUP");
 
+	wd_end_reporting();
+
 	return;
 
 out:
@@ -207,8 +243,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 			struct pt_regs *regs = get_irq_regs();
 			unsigned long flags;
 
-			wd_smp_lock(&flags);
-
 			pr_emerg("CPU %d became unstuck TB:%lld\n",
 				 cpu, tb);
 			print_irqtrace_events(current);
@@ -217,6 +251,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 			else
 				dump_stack();
 
+			wd_smp_lock(&flags);
 			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
 			wd_smp_unlock(&flags);
 		} else {
@@ -312,13 +347,28 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 
 	tb = get_tb();
 	if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
+		/*
+		 * Taking wd_smp_lock here means it is a soft-NMI lock, which
+		 * means we can't take any regular or irqsafe spin locks while
+		 * holding this lock. This is why timers can't printk while
+		 * holding the lock.
+		 */
 		wd_smp_lock(&flags);
 		if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
 			wd_smp_unlock(&flags);
 			return 0;
 		}
+		if (!wd_try_report()) {
+			wd_smp_unlock(&flags);
+			/* Couldn't report, try again in 100ms */
+			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
+			return 0;
+		}
+
 		set_cpu_stuck(cpu, tb);
 
+		wd_smp_unlock(&flags);
+
 		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
 			 cpu, (void *)regs->nip);
 		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
@@ -328,14 +378,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 		print_irqtrace_events(current);
 		show_regs(regs);
 
-		wd_smp_unlock(&flags);
-
 		if (sysctl_hardlockup_all_cpu_backtrace)
 			trigger_allbutself_cpu_backtrace();
 
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");
+
+		wd_end_reporting();
 	}
+	/*
+	 * We are okay to change DEC in soft_nmi_interrupt because the masked
+	 * handler has marked a DEC as pending, so the timer interrupt will be
+	 * replayed as soon as local irqs are enabled again.
+	 */
 	if (wd_panic_timeout_tb < 0x7fffffff)
 		mtspr(SPRN_DEC, wd_panic_timeout_tb);
 
-- 
2.23.0


^ permalink raw reply related

* [PATCH v3 4/4] powerpc/watchdog: read TB close to where it is used
From: Nicholas Piggin @ 2021-11-10  2:50 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin
In-Reply-To: <20211110025056.2084347-1-npiggin@gmail.com>

When taking watchdog actions, printing messages, comparing and
re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use
and under wd_smp_lock or printing lock (if applicable).

This should keep timebase mostly monotonic with kernel log messages, and
could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to
something a long way in the past, and causing other CPUs to appear to be
stuck.

These additional TB reads are all slowpath (lockup has been detected),
so performance does not matter.

Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/kernel/watchdog.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 1b11c4b1c79e..936f889995d3 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -157,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
-static bool set_cpu_stuck(int cpu, u64 tb)
+static bool set_cpu_stuck(int cpu)
 {
 	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
 	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
@@ -166,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
 	 */
 	smp_mb();
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
-		wd_smp_last_reset_tb = tb;
+		wd_smp_last_reset_tb = get_tb();
 		cpumask_andnot(&wd_smp_cpus_pending,
 				&wd_cpus_enabled,
 				&wd_smp_cpus_stuck);
@@ -175,14 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
 	return false;
 }
 
-static void watchdog_smp_panic(int cpu, u64 tb)
+static void watchdog_smp_panic(int cpu)
 {
 	static cpumask_t wd_smp_cpus_ipi; // protected by reporting
 	unsigned long flags;
+	u64 tb;
 	int c;
 
 	wd_smp_lock(&flags);
 	/* Double check some things under lock */
+	tb = get_tb();
 	if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
 		goto out;
 	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
@@ -196,7 +198,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 			continue; // should not happen
 
 		__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
-		if (set_cpu_stuck(c, tb))
+		if (set_cpu_stuck(c))
 			break;
 	}
 	if (cpumask_empty(&wd_smp_cpus_ipi)) {
@@ -236,7 +238,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	wd_smp_unlock(&flags);
 }
 
-static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
+static void wd_smp_clear_cpu_pending(int cpu)
 {
 	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
 		if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
@@ -244,7 +246,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 			unsigned long flags;
 
 			pr_emerg("CPU %d became unstuck TB:%lld\n",
-				 cpu, tb);
+				 cpu, get_tb());
 			print_irqtrace_events(current);
 			if (regs)
 				show_regs(regs);
@@ -310,7 +312,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 		 */
 		wd_smp_lock(&flags);
 		if (cpumask_empty(&wd_smp_cpus_pending)) {
-			wd_smp_last_reset_tb = tb;
+			wd_smp_last_reset_tb = get_tb();
 			cpumask_andnot(&wd_smp_cpus_pending,
 					&wd_cpus_enabled,
 					&wd_smp_cpus_stuck);
@@ -325,10 +327,10 @@ static void watchdog_timer_interrupt(int cpu)
 
 	per_cpu(wd_timer_tb, cpu) = tb;
 
-	wd_smp_clear_cpu_pending(cpu, tb);
+	wd_smp_clear_cpu_pending(cpu);
 
 	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
-		watchdog_smp_panic(cpu, tb);
+		watchdog_smp_panic(cpu);
 }
 
 DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
@@ -365,7 +367,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 			return 0;
 		}
 
-		set_cpu_stuck(cpu, tb);
+		set_cpu_stuck(cpu);
 
 		wd_smp_unlock(&flags);
 
@@ -426,7 +428,7 @@ void arch_touch_nmi_watchdog(void)
 	tb = get_tb();
 	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
 		per_cpu(wd_timer_tb, cpu) = tb;
-		wd_smp_clear_cpu_pending(cpu, tb);
+		wd_smp_clear_cpu_pending(cpu);
 	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
@@ -484,7 +486,7 @@ static void stop_watchdog(void *arg)
 	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
 	wd_smp_unlock(&flags);
 
-	wd_smp_clear_cpu_pending(cpu, get_tb());
+	wd_smp_clear_cpu_pending(cpu);
 }
 
 static int stop_watchdog_on_cpu(unsigned int cpu)
-- 
2.23.0


^ permalink raw reply related

* [PATCH v1] powerpc/watchdog: help remote CPUs to flush NMI printk output
From: Nicholas Piggin @ 2021-11-10  3:04 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Laurent Dufour, Nicholas Piggin

The printk layer at the moment does not seem to have a good way to force
flush printk messages that are created in NMI context, except in the
panic path.

NMI-context printk messages normally get to the console with irq_work,
but that won't help if the CPU is stuck with irqs disabled, as can be
the case for hard lockup watchdog messages.

The watchdog currently flushes the printk buffers after detecting a
lockup on remote CPUs, but they may not have processed their NMI IPI
yet by that stage, or they may have self-detected a lockup in which
case they won't go via this NMI IPI path.

Improve the situation by having NMI-context mark a flag if it called
printk, and have watchdog timer interrupts check if that flag was set
and try to flush if it was. Latency is not a big problem because we
were already stuck for a while, just need to try to make sure the
messages eventually make it out.

Cc: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
This patch is actually based on top of this one which is planned to go
upstream in rc1/2. https://marc.info/?l=linux-kernel&m=163626070312052&w=2

Prior to commit 93d102f094be that is fixed by the above, we had a printk
flush function with a different name but basically does the same job, so
this patch can be backported, just needs some care. I'm posting it for
review now for feedback so it's ready to go when the printk patch is
upstream.

Thanks,
Nick

 arch/powerpc/kernel/watchdog.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index b6533539386b..a7b6b0691203 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -86,6 +86,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
 /* SMP checker bits */
 static unsigned long __wd_smp_lock;
 static unsigned long __wd_reporting;
+static unsigned long __wd_nmi_output;
 static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
@@ -154,6 +155,18 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
+	/*
+	 * We printk()ed from NMI context, the contents may not get flushed
+	 * if we return to a context with interrupts disabled because
+	 * printk uses irq_work to schedule flushes of NMI output.
+	 * __wd_nmi_output says there has been output from NMI context, so
+	 * other CPUs are recruited to help flush it.
+	 *
+	 * xchg is not needed here (it could be a simple atomic store), but
+	 * it gives the memory ordering and atomicity required.
+	 */
+	xchg(&__wd_nmi_output, 1);
+
 	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
@@ -227,12 +240,6 @@ static void watchdog_smp_panic(int cpu)
 		cpumask_clear(&wd_smp_cpus_ipi);
 	}
 
-	/*
-	 * Force flush any remote buffers that might be stuck in IRQ context
-	 * and therefore could not run their irq_work.
-	 */
-	printk_trigger_flush();
-
 	if (hardlockup_panic)
 		nmi_panic(NULL, "Hard LOCKUP");
 
@@ -337,6 +344,14 @@ static void watchdog_timer_interrupt(int cpu)
 
 	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
 		watchdog_smp_panic(cpu);
+
+	if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
+		/*
+		 * This triggers a flush to try get stuck NMI printk output
+		 * from remote CPUs out to the console. See wd_lockup_ipi.
+		 */
+		printk_trigger_flush();
+	}
 }
 
 DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
@@ -386,6 +401,8 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
 		print_irqtrace_events(current);
 		show_regs(regs);
 
+		xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi
+
 		if (sysctl_hardlockup_all_cpu_backtrace)
 			trigger_allbutself_cpu_backtrace();
 
-- 
2.23.0


^ permalink raw reply related

* Re: [PASEMI] Nemo board doesn't recognize any ATA disks with the pci-v5.16 updates
From: Damien Le Moal @ 2021-11-10  3:52 UTC (permalink / raw)
  To: Krzysztof Wilczyński, Bjorn Helgaas
  Cc: Jens Axboe, Darren Stevens, mad skateman,
	linux-pci@vger.kernel.org, Olof Johansson, R.T.Dickinson,
	Christian Zigotzky, bhelgaas@google.com >> Bjorn Helgaas,
	Matthew Leaman, linuxppc-dev, Christian Zigotzky
In-Reply-To: <YYr4x1xWfptXRmqt@rocinante>

On 2021/11/10 7:40, Krzysztof Wilczyński wrote:
> [+CC Adding Jens and Damien to get their opinion about the problem at hand]
> 
> Hello Jens and Damien,
> 
> Sorry to bother both of you, but we are having a problem that most
> definitely requires someone with an extensive expertise in storage,
> as per the quoted message from Christian below:
> 
>>>> The Nemo board [1] doesn't recognize any ATA disks with the pci-v5.16
>>>> updates [2].
>>>>
>>>> Error messages:
>>>>
>>>> ata4.00: gc timeout cmd 0xec
>>>> ata4.00: failed to IDENTIFY (I/O error, error_mask=0x4)
>>>> ata1.00: gc timeout cmd 0xec
>>>> ata1.00: failed to IDENTIFY (I/O error, error_mask=0x4)
>>>> ata3.00: gc timeout cmd 0xec
>>>> ata3.00: failed to IDENTIFY (I/O error, error_mask=0x4)

IDENTIFY is the first command sent to a device when it is being probed. This
means that at least the AHCI (is it AHCI ?) adapter found the ports and drives
connected. But the qc timeout indicates that there is no response from the
drive. This could be due to interrupts not being received for the command
completion. One thing to try would be to increase the identify command timeout
to see things simply got slow (for whatever reason) or if indeed there is no
response at all. Note that after the first timeout, normally the port is reset
and the command retried. That does not seem to be the case here. Weird...

Maybe try something like this:

diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 1d4a6f1e88cd..16e105bcb899 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -79,7 +79,7 @@ enum {
  * take an exceptionally long time to recover from reset.
  */
 static const unsigned long ata_eh_reset_timeouts[] = {
-       10000,  /* most drives spin up by 10sec */
+       30000,  /* most drives spin up by 10sec */
        10000,  /* > 99% working drives spin up before 20sec */
        35000,  /* give > 30 secs of idleness for outlier devices */
         5000,  /* and sweet one last chance */

Also note that I posted a patch a couple of days ago fixing a qc timeout for
read log commands during device probe. This is not what you are hitting here
though. I have not yet sent this to Linus.

https://lore.kernel.org/linux-ide/20211105073106.422623-1-damien.lemoal@opensource.wdc.com/



> 
> The error message is also not very detailed and we aren't really sure what
> the issue coming from the PCI sub-system might be causing or leading to
> this.
> 
>>>>
>>>> I was able to revert the new pci-v5.16 updates [2]. After a new compiling,
>>>> the kernel recognize all ATA disks correctly.
>>>>
>>>> Could you please check the pci-v5.16 updates [2]?
>>>>
>>>> Please find attached the kernel config.
>>>>
>>>> Thanks,
>>>> Christian
>>>>
>>>> [1] https://en.wikipedia.org/wiki/AmigaOne_X1000
>>>> [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0c5c62ddf88c34bc83b66e4ac9beb2bb0e1887d4
>>
>> Sorry for the breakage, and thank you very much for the report.  Can
>> you please collect the complete dmesg logs before and after the
>> pci-v5.16 changes and the "sudo lspci -vv" output from before the
>> changes?
>>
>> You can attach them at https://bugzilla.kernel.org if you don't have
>> a better place to put them.
>>
>> You could attach the kernel config there, too, since it didn't make it
>> to the mailing list (vger may discard them -- see
>> http://vger.kernel.org/majordomo-info.html).
> 
> Bjorn and I looked at which commits that went with a recent Pull Request
> from us might be causing this, but we are a little bit at loss, and were
> hoping that you could give us a hand in troubleshooting this.
> 
> Thank you in advance!
> 
> 	Krzysztof
> 
> 


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply related

* Re: [PATCH v1] powerpc/64s: Get LPID bit width from device tree
From: Nicholas Piggin @ 2021-11-10  7:02 UTC (permalink / raw)
  To: Fabiano Rosas, linuxppc-dev
In-Reply-To: <87lf1xowmu.fsf@linux.ibm.com>

Excerpts from Fabiano Rosas's message of November 10, 2021 7:19 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> Allow the LPID bit width and partition table size to be set at runtime
>> from the device tree.
>>
>> Move the PID bit width detection into the same place.
>>
>> KVM does not support using the extra bits yet, this is mainly required
>> to get the PTCR register values correct.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu.h  |  9 ++---
>>  arch/powerpc/include/asm/kvm_book3s_asm.h |  6 +++-
>>  arch/powerpc/include/asm/reg.h            |  2 --
>>  arch/powerpc/mm/book3s64/pgtable.c        |  5 ---
>>  arch/powerpc/mm/book3s64/radix_pgtable.c  | 13 +------
>>  arch/powerpc/mm/init_64.c                 | 44 +++++++++++++++++++++++
>>  6 files changed, 55 insertions(+), 24 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>> index c02f42d1031e..8c500dd6fee4 100644
>> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>> @@ -62,6 +62,9 @@ extern struct patb_entry *partition_tb;
>>  #define PRTS_MASK	0x1f		/* process table size field */
>>  #define PRTB_MASK	0x0ffffffffffff000UL
>>
>> +/* Number of supported LPID bits */
>> +extern unsigned int mmu_lpid_bits;
>> +
>>  /* Number of supported PID bits */
>>  extern unsigned int mmu_pid_bits;
>>
>> @@ -76,10 +79,8 @@ extern unsigned long __ro_after_init radix_mem_block_size;
>>  #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
>>  #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
>>
>> -/*
>> - * Power9 currently only support 64K partition table size.
>> - */
>> -#define PATB_SIZE_SHIFT	16
>> +#define PATB_SIZE_SHIFT	(mmu_lpid_bits + 4)
>> +#define PATB_ENTRIES	(1ul << mmu_lpid_bits)
>>
>>  typedef unsigned long mm_context_id_t;
>>  struct spinlock;
>> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
>> index b6d31bff5209..4d93e09a1ab9 100644
>> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
>> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
>> @@ -15,7 +15,11 @@
>>  #define XICS_IPI		2	/* interrupt source # for IPIs */
>>
>>  /* LPIDs we support with this build -- runtime limit may be lower */
>> -#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
>> +#define KVMPPC_NR_LPIDS		(1ul << 12)
>> +
>> +/* Reserved LPID for partn switching */
>> +#define   LPID_RSVD_POWER7	((1ul << 10) - 1)
>> +#define   LPID_RSVD		(KVMPPC_NR_LPIDS - 1)
>>
>>  /* Maximum number of threads per physical core */
>>  #define MAX_SMT_THREADS		8
>> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
>> index e9d27265253b..30983f2fd6a4 100644
>> --- a/arch/powerpc/include/asm/reg.h
>> +++ b/arch/powerpc/include/asm/reg.h
>> @@ -474,8 +474,6 @@
>>  #ifndef SPRN_LPID
>>  #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
>>  #endif
>> -#define   LPID_RSVD_POWER7	0x3ff	/* Reserved LPID for partn switching */
>> -#define   LPID_RSVD		0xfff	/* Reserved LPID for partn switching */
>>  #define	SPRN_HMER	0x150	/* Hypervisor maintenance exception reg */
>>  #define   HMER_DEBUG_TRIG	(1ul << (63 - 17)) /* Debug trigger */
>>  #define	SPRN_HMEER	0x151	/* Hyp maintenance exception enable reg */
>> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
>> index 9e16c7b1a6c5..13d1fbddecb9 100644
>> --- a/arch/powerpc/mm/book3s64/pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/pgtable.c
>> @@ -207,17 +207,12 @@ void __init mmu_partition_table_init(void)
>>  	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
>>  	unsigned long ptcr;
>>
>> -	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
>>  	/* Initialize the Partition Table with no entries */
>>  	partition_tb = memblock_alloc(patb_size, patb_size);
>>  	if (!partition_tb)
>>  		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
>>  		      __func__, patb_size, patb_size);
>>
>> -	/*
>> -	 * update partition table control register,
>> -	 * 64 K size.
>> -	 */
>>  	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
>>  	set_ptcr_when_no_uv(ptcr);
>>  	powernv_set_nmmu_ptcr(ptcr);
>> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> index ae20add7954a..1c855434f8dc 100644
>> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
>> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
>> @@ -33,7 +33,6 @@
>>
>>  #include <trace/events/thp.h>
>>
>> -unsigned int mmu_pid_bits;
>>  unsigned int mmu_base_pid;
>>  unsigned long radix_mem_block_size __ro_after_init;
>>
>> @@ -357,18 +356,13 @@ static void __init radix_init_pgtable(void)
>>  						-1, PAGE_KERNEL));
>>  	}
>>
>> -	/* Find out how many PID bits are supported */
>>  	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
>>  			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
>>  		/*
>>  		 * Older versions of KVM on these machines perfer if the
>>  		 * guest only uses the low 19 PID bits.
>>  		 */
>> -		if (!mmu_pid_bits)
>> -			mmu_pid_bits = 19;
>> -	} else {
>> -		if (!mmu_pid_bits)
>> -			mmu_pid_bits = 20;
>> +		mmu_pid_bits = 19;
>>  	}
>>  	mmu_base_pid = 1;
>>
>> @@ -449,11 +443,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
>>  	if (type == NULL || strcmp(type, "cpu") != 0)
>>  		return 0;
>>
>> -	/* Find MMU PID size */
>> -	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
>> -	if (prop && size == 4)
>> -		mmu_pid_bits = be32_to_cpup(prop);
>> -
>>  	/* Grab page size encodings */
>>  	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
>>  	if (!prop)
>> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
>> index 386be136026e..7d5a6cb7c76d 100644
>> --- a/arch/powerpc/mm/init_64.c
>> +++ b/arch/powerpc/mm/init_64.c
>> @@ -370,6 +370,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
>>  #endif /* CONFIG_SPARSEMEM_VMEMMAP */
>>
>>  #ifdef CONFIG_PPC_BOOK3S_64
>> +unsigned int mmu_lpid_bits;
>> +unsigned int mmu_pid_bits;
>> +
>>  static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
>>
>>  static int __init parse_disable_radix(char *p)
>> @@ -437,12 +440,53 @@ static void __init early_check_vec5(void)
>>  	}
>>  }
>>
>> +static int __init dt_scan_mmu_pid_width(unsigned long node,
>> +					   const char *uname, int depth,
>> +					   void *data)
>> +{
>> +	int size = 0;
>> +	const __be32 *prop;
>> +	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
>> +
>> +	/* We are scanning "cpu" nodes only */
>> +	if (type == NULL || strcmp(type, "cpu") != 0)
>> +		return 0;
>> +
>> +	/* Find MMU LPID, PID register size */
>> +	prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
>> +	if (prop && size == 4)
>> +		mmu_lpid_bits = be32_to_cpup(prop);
>> +
>> +	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
>> +	if (prop && size == 4)
>> +		mmu_pid_bits = be32_to_cpup(prop);
>> +
>> +	if (!mmu_pid_bits && !mmu_lpid_bits)
>> +		return 0;
> 
> What if only one of the properties is present?

Ah that may be a problem because the below fallback would not set the 
other one that was not found (e.g., some firmware might only give us
ibm,mmu-pid-bits because that was defined earlier).

I will respin.

Thanks,
Nick

^ permalink raw reply

* [PATCH v2] powerpc/64s: Get LPID bit width from device tree
From: Nicholas Piggin @ 2021-11-10  8:00 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Nicholas Piggin, Fabiano Rosas

Allow the LPID bit width and partition table size to be set at runtime
from the device tree.

Move the PID bit width detection into the same place.

KVM does not support using different sizes yet, this is mainly required
to get the PTCR register values correct.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  9 +++--
 arch/powerpc/mm/book3s64/pgtable.c       |  5 ---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 13 +------
 arch/powerpc/mm/init_64.c                | 47 +++++++++++++++++++++++-
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index c02f42d1031e..8c500dd6fee4 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -62,6 +62,9 @@ extern struct patb_entry *partition_tb;
 #define PRTS_MASK	0x1f		/* process table size field */
 #define PRTB_MASK	0x0ffffffffffff000UL
 
+/* Number of supported LPID bits */
+extern unsigned int mmu_lpid_bits;
+
 /* Number of supported PID bits */
 extern unsigned int mmu_pid_bits;
 
@@ -76,10 +79,8 @@ extern unsigned long __ro_after_init radix_mem_block_size;
 #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
 #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
 
-/*
- * Power9 currently only support 64K partition table size.
- */
-#define PATB_SIZE_SHIFT	16
+#define PATB_SIZE_SHIFT	(mmu_lpid_bits + 4)
+#define PATB_ENTRIES	(1ul << mmu_lpid_bits)
 
 typedef unsigned long mm_context_id_t;
 struct spinlock;
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 9e16c7b1a6c5..13d1fbddecb9 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -207,17 +207,12 @@ void __init mmu_partition_table_init(void)
 	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
 	unsigned long ptcr;
 
-	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
 	/* Initialize the Partition Table with no entries */
 	partition_tb = memblock_alloc(patb_size, patb_size);
 	if (!partition_tb)
 		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
 		      __func__, patb_size, patb_size);
 
-	/*
-	 * update partition table control register,
-	 * 64 K size.
-	 */
 	ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
 	set_ptcr_when_no_uv(ptcr);
 	powernv_set_nmmu_ptcr(ptcr);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index ae20add7954a..1c855434f8dc 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -33,7 +33,6 @@
 
 #include <trace/events/thp.h>
 
-unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
 unsigned long radix_mem_block_size __ro_after_init;
 
@@ -357,18 +356,13 @@ static void __init radix_init_pgtable(void)
 						-1, PAGE_KERNEL));
 	}
 
-	/* Find out how many PID bits are supported */
 	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
 			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
 		/*
 		 * Older versions of KVM on these machines perfer if the
 		 * guest only uses the low 19 PID bits.
 		 */
-		if (!mmu_pid_bits)
-			mmu_pid_bits = 19;
-	} else {
-		if (!mmu_pid_bits)
-			mmu_pid_bits = 20;
+		mmu_pid_bits = 19;
 	}
 	mmu_base_pid = 1;
 
@@ -449,11 +443,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	if (type == NULL || strcmp(type, "cpu") != 0)
 		return 0;
 
-	/* Find MMU PID size */
-	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
-	if (prop && size == 4)
-		mmu_pid_bits = be32_to_cpup(prop);
-
 	/* Grab page size encodings */
 	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
 	if (!prop)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 386be136026e..04f45fcb1222 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -370,6 +370,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 #ifdef CONFIG_PPC_BOOK3S_64
+unsigned int mmu_lpid_bits;
+unsigned int mmu_pid_bits;
+
 static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
 
 static int __init parse_disable_radix(char *p)
@@ -437,19 +440,61 @@ static void __init early_check_vec5(void)
 	}
 }
 
+static int __init dt_scan_mmu_pid_width(unsigned long node,
+					   const char *uname, int depth,
+					   void *data)
+{
+	int size = 0;
+	const __be32 *prop;
+	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+
+	/* We are scanning "cpu" nodes only */
+	if (type == NULL || strcmp(type, "cpu") != 0)
+		return 0;
+
+	/* Find MMU LPID, PID register size */
+	prop = of_get_flat_dt_prop(node, "ibm,mmu-lpid-bits", &size);
+	if (prop && size == 4)
+		mmu_lpid_bits = be32_to_cpup(prop);
+
+	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
+	if (prop && size == 4)
+		mmu_pid_bits = be32_to_cpup(prop);
+
+	if (!mmu_pid_bits && !mmu_lpid_bits)
+		return 0;
+
+	return 1;
+}
+
 void __init mmu_early_init_devtree(void)
 {
+	int rc;
+	bool hvmode = !!(mfmsr() & MSR_HV);
+
 	/* Disable radix mode based on kernel command line. */
 	if (disable_radix)
 		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
 
+	rc = of_scan_flat_dt(dt_scan_mmu_pid_width, NULL);
+	if (hvmode && !mmu_lpid_bits) {
+		if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
+			mmu_lpid_bits = 12; /* POWER8-10 */
+		else
+			mmu_lpid_bits = 10; /* POWER7 */
+	}
+	if (!mmu_pid_bits) {
+		if (early_cpu_has_feature(CPU_FTR_ARCH_300))
+			mmu_pid_bits = 20; /* POWER9-10 */
+	}
+
 	/*
 	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
 	 * When running bare-metal, we can use radix if we like
 	 * even though the ibm,architecture-vec-5 property created by
 	 * skiboot doesn't have the necessary bits set.
 	 */
-	if (!(mfmsr() & MSR_HV))
+	if (!hvmode)
 		early_check_vec5();
 
 	if (early_radix_enabled()) {
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v12 1/2] tty: hvc: pass DMA capable memory to put_chars()
From: Jiri Slaby @ 2021-11-10  9:50 UTC (permalink / raw)
  To: Xianting Tian, gregkh, amit, arnd, osandov
  Cc: sfr, shile.zhang, linuxppc-dev, linux-kernel, virtualization
In-Reply-To: <7dde342a-c2b7-32fe-7410-e372c82a4a68@linux.alibaba.com>

On 04. 11. 21, 14:06, Xianting Tian wrote:
>> OTOH, you need c[N_OUTBUF] in the console case (hvc_console_print), but
>> not whole hvc_struct. So cons_hvcs should be an array of structs
>> composed of only the lock and the buffer.
> It is ok for me.
>> =============
>>
>> And I would do it even simpler now. One c[N_OUTBUF] buffer for all 
>> consoles and a single lock.
>>
>> And "char c" in struct hvc_struct.
>>
>> No need for the complex logic in hvc_console_print.
> 
> So you will implement this?

No, there is a slight difference between "I would" and "I will" :). I 
don't have anything to test this hvc change on…

thanks,
-- 
js
suse labs

^ permalink raw reply

* [PATCH v2 3/3] soc: fsl: Replace kernel.h with the necessary inclusions
From: Andy Shevchenko @ 2021-11-10 10:59 UTC (permalink / raw)
  To: Andy Shevchenko, linux-kernel, linuxppc-dev, linux-arm-kernel
  Cc: Qiang Zhao, Li Yang
In-Reply-To: <20211110105952.62013-1-andriy.shevchenko@linux.intel.com>

When kernel.h is used in the headers it adds a lot into dependency hell,
especially when there are circular dependencies are involved.

Replace kernel.h inclusion with the list of what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
v2: updated Cc list based on previous changes to MAINTAINERS
 include/soc/fsl/dpaa2-fd.h    | 3 ++-
 include/soc/fsl/qe/immap_qe.h | 3 ++-
 include/soc/fsl/qe/qe_tdm.h   | 4 +++-
 include/soc/fsl/qe/ucc_fast.h | 2 +-
 include/soc/fsl/qe/ucc_slow.h | 2 +-
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/soc/fsl/dpaa2-fd.h b/include/soc/fsl/dpaa2-fd.h
index 90ae8d191f1a..bae490cac0aa 100644
--- a/include/soc/fsl/dpaa2-fd.h
+++ b/include/soc/fsl/dpaa2-fd.h
@@ -7,7 +7,8 @@
 #ifndef __FSL_DPAA2_FD_H
 #define __FSL_DPAA2_FD_H
 
-#include <linux/kernel.h>
+#include <linux/byteorder/generic.h>
+#include <linux/types.h>
 
 /**
  * DOC: DPAA2 FD - Frame Descriptor APIs for DPAA2
diff --git a/include/soc/fsl/qe/immap_qe.h b/include/soc/fsl/qe/immap_qe.h
index 7614fee532f1..edd601f53f5d 100644
--- a/include/soc/fsl/qe/immap_qe.h
+++ b/include/soc/fsl/qe/immap_qe.h
@@ -13,7 +13,8 @@
 #define _ASM_POWERPC_IMMAP_QE_H
 #ifdef __KERNEL__
 
-#include <linux/kernel.h>
+#include <linux/types.h>
+
 #include <asm/io.h>
 
 #define QE_IMMAP_SIZE	(1024 * 1024)	/* 1MB from 1MB+IMMR */
diff --git a/include/soc/fsl/qe/qe_tdm.h b/include/soc/fsl/qe/qe_tdm.h
index b6febe225071..43ea830cfe1f 100644
--- a/include/soc/fsl/qe/qe_tdm.h
+++ b/include/soc/fsl/qe/qe_tdm.h
@@ -10,8 +10,8 @@
 #ifndef _QE_TDM_H_
 #define _QE_TDM_H_
 
-#include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/types.h>
 
 #include <soc/fsl/qe/immap_qe.h>
 #include <soc/fsl/qe/qe.h>
@@ -19,6 +19,8 @@
 #include <soc/fsl/qe/ucc.h>
 #include <soc/fsl/qe/ucc_fast.h>
 
+struct device_node;
+
 /* SI RAM entries */
 #define SIR_LAST	0x0001
 #define SIR_BYTE	0x0002
diff --git a/include/soc/fsl/qe/ucc_fast.h b/include/soc/fsl/qe/ucc_fast.h
index 9696a5b9b5d1..ad60b87a3c69 100644
--- a/include/soc/fsl/qe/ucc_fast.h
+++ b/include/soc/fsl/qe/ucc_fast.h
@@ -10,7 +10,7 @@
 #ifndef __UCC_FAST_H__
 #define __UCC_FAST_H__
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 #include <soc/fsl/qe/immap_qe.h>
 #include <soc/fsl/qe/qe.h>
diff --git a/include/soc/fsl/qe/ucc_slow.h b/include/soc/fsl/qe/ucc_slow.h
index 11a216e4e919..7548ce8a202d 100644
--- a/include/soc/fsl/qe/ucc_slow.h
+++ b/include/soc/fsl/qe/ucc_slow.h
@@ -11,7 +11,7 @@
 #ifndef __UCC_SLOW_H__
 #define __UCC_SLOW_H__
 
-#include <linux/kernel.h>
+#include <linux/types.h>
 
 #include <soc/fsl/qe/immap_qe.h>
 #include <soc/fsl/qe/qe.h>
-- 
2.33.0


^ permalink raw reply related

* [PATCH v2 2/3] soc: fsl: Correct MAINTAINERS database (SOC)
From: Andy Shevchenko @ 2021-11-10 10:59 UTC (permalink / raw)
  To: Andy Shevchenko, linux-kernel, linuxppc-dev, linux-arm-kernel
  Cc: Qiang Zhao, Li Yang
In-Reply-To: <20211110105952.62013-1-andriy.shevchenko@linux.intel.com>

MAINTAINERS lacks of proper coverage for FSL headers. Fix it accordingly.

Fixes: 1b48706f027c ("MAINTAINERS: add entry for Freescale SoC drivers")
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
v2: new patch
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index a35e4d9e52b8..8748328fdc63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7689,6 +7689,7 @@ F:	Documentation/devicetree/bindings/misc/fsl,dpaa2-console.yaml
 F:	Documentation/devicetree/bindings/soc/fsl/
 F:	drivers/soc/fsl/
 F:	include/linux/fsl/
+F:	include/soc/fsl/
 
 FREESCALE SOC FS_ENET DRIVER
 M:	Pantelis Antoniou <pantelis.antoniou@gmail.com>
-- 
2.33.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox