* RE: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
From: Y.b. Lu @ 2018-08-01 10:10 UTC (permalink / raw)
To: Richard Cochran
Cc: netdev@vger.kernel.org, Madalin-cristian Bucur, Rob Herring,
Shawn Guo, David S . Miller, devicetree@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org,
linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org
In-Reply-To: <20180801061515.krn52iaq2kqdi3wo@localhost>
Hi Richard,
> -----Original Message-----
> From: Richard Cochran [mailto:richardcochran@gmail.com]
> Sent: Wednesday, August 1, 2018 2:15 PM
> To: Y.b. Lu <yangbo.lu@nxp.com>
> Cc: netdev@vger.kernel.org; Madalin-cristian Bucur
> <madalin.bucur@nxp.com>; Rob Herring <robh+dt@kernel.org>; Shawn Guo
> <shawnguo@kernel.org>; David S . Miller <davem@davemloft.net>;
> devicetree@vger.kernel.org; linuxppc-dev@lists.ozlabs.org;
> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for
> initialization
>
> On Wed, Aug 01, 2018 at 04:36:40AM +0000, Y.b. Lu wrote:
>
> > Could I add a function to calculate a set of default register values
> > to initialize ptp timer when dts method failed to get required
> > properties in driver?
>
> Yes, it would be ideal if the driver can pick correct values automatically.
>
> However, the frequency on the FIPER outputs can't be configured
> automatically, and we don't have an API for the user to choose this.
[Y.b. Lu] Thanks a lot. Just let me send out the v2 patch for your review.
>
> > I think this will be useful. The ptp timer on new platforms (you may
> > see two dts patches in this patchset. Many platforms will be
> > affected.) will work without these dts properties. If user want
> > specific setting, they can set dts properties.
>
> Sure.
>
> Thanks,
> Richard
^ permalink raw reply
* [v2, 3/3] ptp_qoriq: support automatic configuration for ptp timer
From: Yangbo Lu @ 2018-08-01 10:05 UTC (permalink / raw)
To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
David S . Miller
Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
Yangbo Lu
In-Reply-To: <20180801100554.36634-1-yangbo.lu@nxp.com>
This patch is to support automatic configuration for ptp timer.
If required ptp dts properties are not provided, driver could
try to calculate a set of default configurations to initialize
the ptp timer. This makes the driver work for many boards which
don't have the required ptp dts properties in current kernel.
Also the users could set dts properties by themselves according
to their requirement.
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
Changes for v2:
- Dropped module_param.
---
drivers/ptp/ptp_qoriq.c | 111 +++++++++++++++++++++++++++++++++++++++-
include/linux/fsl/ptp_qoriq.h | 6 ++-
2 files changed, 113 insertions(+), 4 deletions(-)
diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index a14c317..095c185 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -29,6 +29,7 @@
#include <linux/of_platform.h>
#include <linux/timex.h>
#include <linux/slab.h>
+#include <linux/clk.h>
#include <linux/fsl/ptp_qoriq.h>
@@ -317,6 +318,105 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
.enable = ptp_qoriq_enable,
};
+/**
+ * qoriq_ptp_nominal_freq - calculate nominal frequency according to
+ * reference clock frequency
+ *
+ * @clk_src: reference clock frequency
+ *
+ * The nominal frequency is the desired clock frequency.
+ * It should be less than the reference clock frequency.
+ * It should be a factor of 1000MHz.
+ *
+ * Return the nominal frequency
+ */
+static u32 qoriq_ptp_nominal_freq(u32 clk_src)
+{
+ u32 remainder = 0;
+
+ clk_src /= 1000000;
+ remainder = clk_src % 100;
+ if (remainder) {
+ clk_src -= remainder;
+ clk_src += 100;
+ }
+
+ do {
+ clk_src -= 100;
+
+ } while (1000 % clk_src);
+
+ return clk_src * 1000000;
+}
+
+/**
+ * qoriq_ptp_auto_config - calculate a set of default configurations
+ *
+ * @qoriq_ptp: pointer to qoriq_ptp
+ * @node: pointer to device_node
+ *
+ * If below dts properties are not provided, this function will be
+ * called to calculate a set of default configurations for them.
+ * "fsl,tclk-period"
+ * "fsl,tmr-prsc"
+ * "fsl,tmr-add"
+ * "fsl,tmr-fiper1"
+ * "fsl,tmr-fiper2"
+ * "fsl,max-adj"
+ *
+ * Return 0 if success
+ */
+static int qoriq_ptp_auto_config(struct qoriq_ptp *qoriq_ptp,
+ struct device_node *node)
+{
+ struct clk *clk;
+ u64 freq_comp;
+ u64 max_adj;
+ u32 nominal_freq;
+ u32 clk_src = 0;
+
+ qoriq_ptp->cksel = DEFAULT_CKSEL;
+
+ clk = of_clk_get(node, 0);
+ if (!IS_ERR(clk)) {
+ clk_src = clk_get_rate(clk);
+ clk_put(clk);
+ }
+
+ if (clk_src <= 100000000UL) {
+ pr_err("error reference clock value, or lower than 100MHz\n");
+ return -EINVAL;
+ }
+
+ nominal_freq = qoriq_ptp_nominal_freq(clk_src);
+ if (!nominal_freq)
+ return -EINVAL;
+
+ qoriq_ptp->tclk_period = 1000000000UL / nominal_freq;
+ qoriq_ptp->tmr_prsc = DEFAULT_TMR_PRSC;
+
+ /* Calculate initial frequency compensation value for TMR_ADD register.
+ * freq_comp = ceil(2^32 / freq_ratio)
+ * freq_ratio = reference_clock_freq / nominal_freq
+ */
+ freq_comp = ((u64)1 << 32) * nominal_freq;
+ if (do_div(freq_comp, clk_src))
+ freq_comp++;
+
+ qoriq_ptp->tmr_add = freq_comp;
+ qoriq_ptp->tmr_fiper1 = DEFAULT_FIPER1_PERIOD - qoriq_ptp->tclk_period;
+ qoriq_ptp->tmr_fiper2 = DEFAULT_FIPER2_PERIOD - qoriq_ptp->tclk_period;
+
+ /* max_adj = 1000000000 * (freq_ratio - 1.0) - 1
+ * freq_ratio = reference_clock_freq / nominal_freq
+ */
+ max_adj = 1000000000ULL * (clk_src - nominal_freq);
+ max_adj = max_adj / nominal_freq - 1;
+ qoriq_ptp->caps.max_adj = max_adj;
+
+ return 0;
+}
+
static int qoriq_ptp_probe(struct platform_device *dev)
{
struct device_node *node = dev->dev.of_node;
@@ -332,7 +432,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
if (!qoriq_ptp)
goto no_memory;
- err = -ENODEV;
+ err = -EINVAL;
qoriq_ptp->caps = ptp_qoriq_caps;
@@ -351,10 +451,14 @@ static int qoriq_ptp_probe(struct platform_device *dev)
"fsl,tmr-fiper2", &qoriq_ptp->tmr_fiper2) ||
of_property_read_u32(node,
"fsl,max-adj", &qoriq_ptp->caps.max_adj)) {
- pr_err("device tree node missing required elements\n");
- goto no_node;
+ pr_warn("device tree node missing required elements, try automatic configuration\n");
+
+ if (qoriq_ptp_auto_config(qoriq_ptp, node))
+ goto no_config;
}
+ err = -ENODEV;
+
qoriq_ptp->irq = platform_get_irq(dev, 0);
if (qoriq_ptp->irq < 0) {
@@ -436,6 +540,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
release_resource(qoriq_ptp->rsrc);
no_resource:
free_irq(qoriq_ptp->irq, qoriq_ptp);
+no_config:
no_node:
kfree(qoriq_ptp);
no_memory:
diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h
index dc3dac4..c1f003a 100644
--- a/include/linux/fsl/ptp_qoriq.h
+++ b/include/linux/fsl/ptp_qoriq.h
@@ -127,9 +127,13 @@ struct qoriq_ptp_registers {
#define DRIVER "ptp_qoriq"
-#define DEFAULT_CKSEL 1
#define N_EXT_TS 2
+#define DEFAULT_CKSEL 1
+#define DEFAULT_TMR_PRSC 2
+#define DEFAULT_FIPER1_PERIOD 1000000000
+#define DEFAULT_FIPER2_PERIOD 100000
+
struct qoriq_ptp {
void __iomem *base;
struct qoriq_ptp_registers regs;
--
1.7.1
^ permalink raw reply related
* [v2, 1/3] arm64: dts: fsl: add clocks property for fman ptp timer node
From: Yangbo Lu @ 2018-08-01 10:05 UTC (permalink / raw)
To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
David S . Miller
Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
Yangbo Lu
This patch is to add clocks property for fman ptp timer node.
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
Changes for v2:
- None.
---
arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
index a56a408..4664c33 100644
--- a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
+++ b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
@@ -80,4 +80,5 @@ ptp_timer0: ptp-timer@1afe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x0 0x1afe000 0x0 0x1000>;
interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&clockgen 3 0>;
};
--
1.7.1
^ permalink raw reply related
* [v2, 2/3] powerpc/mpc85xx: add clocks property for fman ptp timer node
From: Yangbo Lu @ 2018-08-01 10:05 UTC (permalink / raw)
To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
David S . Miller
Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
Yangbo Lu
In-Reply-To: <20180801100554.36634-1-yangbo.lu@nxp.com>
This patch is to add clocks property for fman ptp timer node.
Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
---
Changes for v2:
- None.
---
arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi | 1 +
arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi | 1 +
arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi | 1 +
arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi | 1 +
arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi | 1 +
5 files changed, 5 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
index 6b124f7..9b6cf91 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
@@ -100,4 +100,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+ clocks = <&clockgen 3 0>;
};
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
index b80aaf5..e95c11f 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
@@ -100,4 +100,5 @@ ptp_timer1: ptp-timer@5fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x5fe000 0x1000>;
interrupts = <97 2 0 0>;
+ clocks = <&clockgen 3 1>;
};
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
index d3720fd..d62b36c 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
@@ -105,4 +105,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+ clocks = <&clockgen 3 0>;
};
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
index ae34c20..3102324 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
@@ -105,4 +105,5 @@ ptp_timer1: ptp-timer@5fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x5fe000 0x1000>;
interrupts = <97 2 0 0>;
+ clocks = <&clockgen 3 1>;
};
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
index 02f2755..c90702b 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
@@ -93,4 +93,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+ clocks = <&clockgen 3 0>;
};
--
1.7.1
^ permalink raw reply related
* Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
From: Christoph Hellwig @ 2018-08-01 9:54 UTC (permalink / raw)
To: Alexei Colin
Cc: Alexandre Bounine, Catalin Marinas, Will Deacon, Andrew Morton,
Russell King, John Paul Walters, x86, linuxppc-dev, linux-mips,
linux-arm-kernel, linux-kernel
In-Reply-To: <20180731142954.30345-7-acolin@isi.edu>
On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote:
> Platforms with a PCI bus will be offered the RapidIO menu since they may
> want support for a RapidIO PCI device. Platforms without a PCI bus
> that might include a RapidIO IP block will need to "select HAS_RAPIDIO"
> in the platform-/machine-specific "config ARCH_*" Kconfig entry.
>
> Tested that kernel builds for arm64 with RapidIO subsystem and
> switch drivers enabled, also that the modules load successfully
> on a custom Aarch64 Qemu model.
As said before, please include it from drivers/Kconfig so that _all_
architectures supporting PCI (or other RapidIO attachments) get it
and not some arbitrary selection of architectures.
^ permalink raw reply
* Re: [PATCH] perf tools: allow overriding MAX_NR_CPUS at compile time
From: Christophe LEROY @ 2018-08-01 9:37 UTC (permalink / raw)
To: Arnaldo Carvalho de Melo
Cc: Peter Zijlstra, Ingo Molnar, Alexander Shishkin, linux-kernel,
linuxppc-dev
In-Reply-To: <20180503134037.GB8442@kernel.org>
Le 03/05/2018 à 15:40, Arnaldo Carvalho de Melo a écrit :
> Em Fri, Sep 22, 2017 at 01:20:43PM +0200, Christophe Leroy escreveu:
>> After update of kernel, perf tool doesn't run anymore on my
>> 32MB RAM powerpc board, but still runs on a 128MB RAM board:
>
> Cleaning up my inbox, found this one, simple enough, still applies,
> applied.
Did you finally apply it ? I can't see it in linux-next. Will it be
merged into 4.19 ?
Thanks
Christophe
>
> These all need to be dynamically allocated, but still, with this one can
> get a functioning tool, apply it.
>
> - Arnaldo
>
>> ~# strace perf
>> execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot allocate memory)
>> --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} ---
>> +++ killed by SIGSEGV +++
>> Segmentation fault
>>
>> objdump -x shows that .bss section has a huge size of 24Mbytes:
>>
>> 27 .bss 016baca8 101cebb8 101cebb8 001cd988 2**3
>>
>> With especially the following objects having quite big size
>>
>> 10205f80 l O .bss 00140000 runtime_cycles_stats
>> 10345f80 l O .bss 00140000 runtime_stalled_cycles_front_stats
>> 10485f80 l O .bss 00140000 runtime_stalled_cycles_back_stats
>> 105c5f80 l O .bss 00140000 runtime_branches_stats
>> 10705f80 l O .bss 00140000 runtime_cacherefs_stats
>> 10845f80 l O .bss 00140000 runtime_l1_dcache_stats
>> 10985f80 l O .bss 00140000 runtime_l1_icache_stats
>> 10ac5f80 l O .bss 00140000 runtime_ll_cache_stats
>> 10c05f80 l O .bss 00140000 runtime_itlb_cache_stats
>> 10d45f80 l O .bss 00140000 runtime_dtlb_cache_stats
>> 10e85f80 l O .bss 00140000 runtime_cycles_in_tx_stats
>> 10fc5f80 l O .bss 00140000 runtime_transaction_stats
>> 11105f80 l O .bss 00140000 runtime_elision_stats
>> 11245f80 l O .bss 00140000 runtime_topdown_total_slots
>> 11385f80 l O .bss 00140000 runtime_topdown_slots_retired
>> 114c5f80 l O .bss 00140000 runtime_topdown_slots_issued
>> 11605f80 l O .bss 00140000 runtime_topdown_fetch_bubbles
>> 11745f80 l O .bss 00140000 runtime_topdown_recovery_bubbles
>>
>> This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus
>> to 1024"), because many tables are sized with MAX_NR_CPUS
>>
>> This patch gives the opportunity to redefine MAX_NR_CPUS via
>>
>> make EXTRA_CFLAGS=-DMAX_NR_CPUS=1
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>> tools/perf/perf.h | 2 ++
>> 1 file changed, 2 insertions(+)
>>
>> diff --git a/tools/perf/perf.h b/tools/perf/perf.h
>> index dc442ba21bf6..a9db563da0a9 100644
>> --- a/tools/perf/perf.h
>> +++ b/tools/perf/perf.h
>> @@ -23,7 +23,9 @@ static inline unsigned long long rdclock(void)
>> return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
>> }
>>
>> +#ifndef MAX_NR_CPUS
>> #define MAX_NR_CPUS 1024
>> +#endif
>>
>> extern const char *input_name;
>> extern bool perf_host, perf_guest;
>> --
>> 2.13.3
^ permalink raw reply
* Re: [PATCH] powerpc/64s/radix: Fix missing global invalidations when removing copro
From: Vaibhav Jain @ 2018-08-01 9:28 UTC (permalink / raw)
To: Frederic Barrat, linuxppc-dev, npiggin; +Cc: felix, clombard
In-Reply-To: <20180731132452.15994-1-fbarrat@linux.ibm.com>
Frederic Barrat <fbarrat@linux.ibm.com> writes:
> With the optimizations for TLB invalidation from commit 0cef77c7798a
> ("powerpc/64s/radix: flush remote CPUs out of single-threaded
> mm_cpumask"), the scope of a TLBI (global vs. local) can now be
> influenced by the value of the 'copros' counter of the memory context.
>
> When calling mm_context_remove_copro(), the 'copros' counter is
> decremented first before flushing. It may have the unintended side
> effect of sending local TLBIs when we explicitly need global
> invalidations in this case. Thus breaking any nMMU user in a bad and
> unpredictable way.
>
> Fix it by flushing first, before updating the 'copros' counter, so
> that invalidations will be global.
>
> Fixes: 0cef77c7798a ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask")
> Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
> ---
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
--
Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Linux Technology Center, IBM India Pvt. Ltd.
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Will Deacon @ 2018-08-01 9:05 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Benjamin Herrenschmidt, Michael S. Tsirkin, Anshuman Khandual,
virtualization, linux-kernel, linuxppc-dev, aik, robh, joe,
elfring, david, jasowang, mpe, linuxram, haren, paulus, srikar,
robin.murphy, jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180801083639.GF26378@infradead.org>
Hi Christoph,
On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote:
> On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
> > transport (so definitely not PCI) have historically been advertised by qemu
> > as not being cache coherent, but because the virtio core has bypassed DMA
> > ops then everything has happened to work. If we blindly enable the arch DMA
> > ops,
>
> No one is suggesting that as far as I can tell.
Apologies: it's me that wants the DMA ops enabled to handle legacy devices
behind an IOMMU, but see below.
> > we'll plumb in the non-coherent ops and start getting data corruption,
> > so we do need a way to quirk virtio as being "always coherent" if we want to
> > use the DMA ops (which we do, because our emulation platforms have an IOMMU
> > for all virtio devices).
>
> From all that I've gathered so far: no you do not want that. We really
> need to figure out how virtio "dma" interacts with the host / device.
>
> If you look at the current iommu spec it does talk of physical address
> with a little carve-out for VIRTIO_F_IOMMU_PLATFORM.
That's true, although that doesn't exist in the legacy virtio spec, and we
have an existing emulation platform which puts legacy virtio devices behind
an IOMMU. Currently, Linux is unable to boot on this platform unless the
IOMMU is configured as bypass. If we can use the coherent IOMMU DMA ops,
then it works perfectly.
> So between that and our discussion in this thread and its previous
> iterations I think we need to stick to the current always physical,
> bypass system dma ops mode of virtio operation as the default.
As above -- that means we hang during boot because we get stuck trying to
bring up a virtio-block device whose DMA is aborted by the IOMMU. The easy
answer is "just upgrade to latest virtio and advertise the presence of the
IOMMU". I'm pushing for that in future platforms, but it seems a shame not
to support the current platform, especially given that other systems do have
hacks in mainline to get virtio working.
> We just need to figure out how to deal with devices that deviate
> from the default. One thing is that VIRTIO_F_IOMMU_PLATFORM really
> should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> dma tweaks (offsets, cache flushing), which seems well in spirit of
> the original design. The other issue is VIRTIO_F_IO_BARRIER
> which is very vaguely defined, and which needs a better definition.
> And last but not least we'll need some text explaining the challenges
> of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> is what would basically cover them, but a good description including
> an explanation of why these matter.
I agree that this makes sense for future revisions of virtio (or perhaps
it can just be a clarification to virtio 1.0), but we're still left in the
dark with legacy devices and it would be nice to have them work on the
systems which currently exist, even if it's a legacy-only hack in the arch
code.
Will
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Christoph Hellwig @ 2018-08-01 8:36 UTC (permalink / raw)
To: Will Deacon
Cc: Benjamin Herrenschmidt, Christoph Hellwig, Michael S. Tsirkin,
Anshuman Khandual, virtualization, linux-kernel, linuxppc-dev,
aik, robh, joe, elfring, david, jasowang, mpe, linuxram, haren,
paulus, srikar, robin.murphy, jean-philippe.brucker, marc.zyngier
In-Reply-To: <20180801081637.GA14438@arm.com>
On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
> transport (so definitely not PCI) have historically been advertised by qemu
> as not being cache coherent, but because the virtio core has bypassed DMA
> ops then everything has happened to work. If we blindly enable the arch DMA
> ops,
No one is suggesting that as far as I can tell.
> we'll plumb in the non-coherent ops and start getting data corruption,
> so we do need a way to quirk virtio as being "always coherent" if we want to
> use the DMA ops (which we do, because our emulation platforms have an IOMMU
> for all virtio devices).
From all that I've gathered so far: no you do not want that. We really
need to figure out how virtio "dma" interacts with the host / device.
If you look at the current iommu spec it does talk of physical address
with a little carve-out for VIRTIO_F_IOMMU_PLATFORM.
So between that and our discussion in this thread and its previous
iterations I think we need to stick to the current always physical,
bypass system dma ops mode of virtio operation as the default.
We just need to figure out how to deal with devices that deviate
from the default. One thing is that VIRTIO_F_IOMMU_PLATFORM really
should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
dma tweaks (offsets, cache flushing), which seems well in spirit of
the original design. The other issue is VIRTIO_F_IO_BARRIER
which is very vaguely defined, and which needs a better definition.
And last but not least we'll need some text explaining the challenges
of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
is what would basically cover them, but a good description including
an explanation of why these matter.
^ permalink raw reply
* [PATCH v8 4/4] selftests/powerpc: update strlen() test to test the new assembly function for PPC32
From: Christophe Leroy @ 2018-08-01 9:01 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
wei.guo.simon, segher
Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1531511429.git.christophe.leroy@c-s.fr>
This patch adds a test for testing the new assembly strlen() for PPC32
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
v8: removed defines in ppc_asm.h that were added in v6 (not used anymore since v7) ; added missing link to strlen_32.S
v7: reduced the scope to PPC32
v6: added additional necessary defines in ppc_asm.h
v5: no change
v4: new
tools/testing/selftests/powerpc/stringloops/Makefile | 5 ++++-
tools/testing/selftests/powerpc/stringloops/asm/cache.h | 1 +
tools/testing/selftests/powerpc/stringloops/strlen_32.S | 1 +
3 files changed, 6 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/cache.h
create mode 120000 tools/testing/selftests/powerpc/stringloops/strlen_32.S
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
index 779b644461c4..9e510de2c07d 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -13,9 +13,12 @@ $(OUTPUT)/memcmp_32: CFLAGS += -m32
$(OUTPUT)/strlen: strlen.c string.o
$(OUTPUT)/string.o: string.c
+$(OUTPUT)/strlen_32: strlen.c
+$(OUTPUT)/strlen_32: CFLAGS += -m32
+
ASFLAGS = $(CFLAGS)
-TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen strlen_32
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/cache.h b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
new file mode 100644
index 000000000000..8a2840831122
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
@@ -0,0 +1 @@
+#define IFETCH_ALIGN_BYTES 4
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen_32.S b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
new file mode 120000
index 000000000000..72b13731b24c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/strlen_32.S
\ No newline at end of file
--
2.13.3
^ permalink raw reply related
* [PATCH v8 3/4] powerpc/lib: implement strlen() in assembly for PPC32
From: Christophe Leroy @ 2018-08-01 9:01 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
wei.guo.simon, segher
Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1531511429.git.christophe.leroy@c-s.fr>
The generic implementation of strlen() reads strings byte per byte.
This patch implements strlen() in assembly based on a read of entire
words, in the same spirit as what some other arches and glibc do.
On a 8xx the time spent in strlen is reduced by 3/4 for long strings.
strlen() selftest on an 8xx provides the following values:
Before the patch (ie with the generic strlen() in lib/string.c):
len 256 : time = 1.195055
len 016 : time = 0.083745
len 008 : time = 0.046828
len 004 : time = 0.028390
After the patch:
len 256 : time = 0.272185 ==> 78% improvement
len 016 : time = 0.040632 ==> 51% improvement
len 008 : time = 0.033060 ==> 29% improvement
len 004 : time = 0.029149 ==> 2% degradation
On a 832x:
Before the patch:
len 256 : time = 0.236125
len 016 : time = 0.018136
len 008 : time = 0.011000
len 004 : time = 0.007229
After the patch:
len 256 : time = 0.094950 ==> 60% improvement
len 016 : time = 0.013357 ==> 26% improvement
len 008 : time = 0.010586 ==> 4% improvement
len 004 : time = 0.008784
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Changes in v8:
- No change
Changes in v7:
- Reduced the scope to PPC32
- Modified the misalignment handling to be branchless and loopless
Changes in v6:
- Reworked for having branchless conclusion
Changes in v5:
- Fixed for PPC64 LITTLE ENDIAN
Changes in v4:
- Added alignment of the loop
- doing the andc only if still not 0 as it happens only for bytes above 0x7f which is pretty rare in a string
Changes in v3:
- Made it common to PPC32 and PPC64
Changes in v2:
- Moved handling of unaligned strings outside of the main path as it is very unlikely.
- Removed the verification of the fourth byte in case none of the three first ones are NUL.
arch/powerpc/include/asm/string.h | 2 +
arch/powerpc/lib/Makefile | 2 +-
arch/powerpc/lib/strlen_32.S | 78 +++++++++++++++++++++++++++++++++++++++
3 files changed, 81 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/lib/strlen_32.S
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..1647de15a31e 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
return __memset64(p, v, n * 8);
}
#else
+#define __HAVE_ARCH_STRLEN
+
extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
#endif
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
obj-y += string.o alloc.o code-patching.o feature-fixups.o
-obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o strlen_32.o
# See corresponding test in arch/powerpc/Makefile
# 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index 000000000000..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+#include <asm/cache.h>
+
+ .text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ * by subtracting 0x01010101, and seeing if any of the high bits of each
+ * byte changed from 0 to 1. This works because the least significant
+ * 0 byte must have had no incoming carry (otherwise it's not the least
+ * significant), so it is 0x00 - 0x01 == 0xff. For all other
+ * byte values, either they have the high bit set initially, or when
+ * 1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ * have their high bit set. The expression here is
+ * (x - 0x01010101) & ~x & 0x80808080), which gives 0x00000000 when
+ * there were no 0x00 bytes in the word. You get 0x80 in bytes that
+ * match, but possibly false 0x80 matches in the next more significant
+ * byte to a true match due to carries. For little-endian this is
+ * of no consequence since the least significant match is the one
+ * we're interested in, but big-endian needs method 2 to find which
+ * byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ * calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ * This produces 0x80 in each byte that was zero, and 0x00 in all
+ * the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ * byte, and the '| x' part ensures that bytes with the high bit set
+ * produce 0x00. The addition will carry into the high bit of each byte
+ * iff that byte had one of its low 7 bits set. We can then just see
+ * which was the most significant bit set and divide by 8 to find how
+ * many to add to the index.
+ * This is from the book 'The PowerPC Compiler Writer's Guide',
+ * by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren.
+ */
+
+_GLOBAL(strlen)
+ andi. r0, r3, 3
+ lis r7, 0x0101
+ addi r10, r3, -4
+ addic r7, r7, 0x0101 /* r7 = 0x01010101 (lomagic) & clear XER[CA] */
+ rotlwi r6, r7, 31 /* r6 = 0x80808080 (himagic) */
+ bne- 3f
+ .balign IFETCH_ALIGN_BYTES
+1: lwzu r9, 4(r10)
+2: subf r8, r7, r9
+ and. r8, r8, r6
+ beq+ 1b
+ andc. r8, r8, r9
+ beq+ 1b
+ andc r8, r9, r6
+ orc r9, r9, r6
+ subfe r8, r6, r8
+ nor r8, r8, r9
+ cntlzw r8, r8
+ subf r3, r3, r10
+ srwi r8, r8, 3
+ add r3, r3, r8
+ blr
+
+ /* Missaligned string: make sure bytes before string are seen not 0 */
+3: xor r10, r10, r0
+ orc r8, r8, r8
+ lwzu r9, 4(r10)
+ slwi r0, r0, 3
+ srw r8, r8, r0
+ orc r9, r9, r8
+ b 2b
+EXPORT_SYMBOL(strlen)
--
2.13.3
^ permalink raw reply related
* [PATCH v8 2/4] selftests/powerpc: Add test for strlen()
From: Christophe Leroy @ 2018-08-01 9:01 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
wei.guo.simon, segher
Cc: linux-kernel, linuxppc-dev
In-Reply-To: <b3d7b6cdb89a48be06a2630bf0d762d9d17d931f.1531511429.git.christophe.leroy@c-s.fr>
This patch adds a test for strlen()
string.c contains a copy of strlen() from lib/string.c
The test first tests the correctness of strlen() by comparing
the result with libc strlen(). It tests all cases of alignment.
It then tests the duration of an aligned strlen() on a 4 bytes string,
on a 16 bytes string and on a 256 bytes string.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
v8: no change
v7: no change
v6: refactorised the benchmark test
v5: no change
v4: new
.../testing/selftests/powerpc/stringloops/Makefile | 5 +-
.../testing/selftests/powerpc/stringloops/string.c | 36 ++++++
.../testing/selftests/powerpc/stringloops/strlen.c | 127 +++++++++++++++++++++
3 files changed, 167 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/powerpc/stringloops/string.c
create mode 100644 tools/testing/selftests/powerpc/stringloops/strlen.c
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
index 3862256c2b7d..779b644461c4 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -10,9 +10,12 @@ $(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
$(OUTPUT)/memcmp_32: memcmp.c
$(OUTPUT)/memcmp_32: CFLAGS += -m32
+$(OUTPUT)/strlen: strlen.c string.o
+$(OUTPUT)/string.o: string.c
+
ASFLAGS = $(CFLAGS)
-TEST_GEN_PROGS := memcmp_32 memcmp_64
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/stringloops/string.c b/tools/testing/selftests/powerpc/stringloops/string.c
new file mode 100644
index 000000000000..d05200481017
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/string.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * linux/lib/string.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+/*
+ * stupid library routines.. The optimized versions should generally be found
+ * as inline code in <asm-xx/string.h>
+ *
+ * These are buggy as well..
+ *
+ * * Fri Jun 25 1999, Ingo Oeser <ioe@informatik.tu-chemnitz.de>
+ * - Added strsep() which will replace strtok() soon (because strsep() is
+ * reentrant and should be faster). Use only strsep() in new code, please.
+ *
+ * * Sat Feb 09 2002, Jason Thomas <jason@topic.com.au>,
+ * Matthew Hawkins <matt@mh.dropbear.id.au>
+ * - Kissed strtok() goodbye
+ */
+
+#include <stddef.h>
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t test_strlen(const char *s)
+{
+ const char *sc;
+
+ for (sc = s; *sc != '\0'; ++sc)
+ /* nothing */;
+ return sc - s;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen.c b/tools/testing/selftests/powerpc/stringloops/strlen.c
new file mode 100644
index 000000000000..9055ebc484d0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "utils.h"
+
+#define SIZE 256
+#define ITERATIONS 1000
+#define ITERATIONS_BENCH 100000
+
+int test_strlen(const void *s);
+
+/* test all offsets and lengths */
+static void test_one(char *s)
+{
+ unsigned long offset;
+
+ for (offset = 0; offset < SIZE; offset++) {
+ int x, y;
+ unsigned long i;
+
+ y = strlen(s + offset);
+ x = test_strlen(s + offset);
+
+ if (x != y) {
+ printf("strlen() returned %d, should have returned %d (%p offset %ld)\n", x, y, s, offset);
+
+ for (i = offset; i < SIZE; i++)
+ printf("%02x ", s[i]);
+ printf("\n");
+ }
+ }
+}
+
+static void bench_test(char *s)
+{
+ struct timespec ts_start, ts_end;
+ int i;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+ for (i = 0; i < ITERATIONS_BENCH; i++)
+ test_strlen(s);
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
+
+ printf("len %3.3d : time = %.6f\n", test_strlen(s), ts_end.tv_sec - ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9);
+}
+
+static int testcase(void)
+{
+ char *s;
+ unsigned long i;
+
+ s = memalign(128, SIZE);
+ if (!s) {
+ perror("memalign");
+ exit(1);
+ }
+
+ srandom(1);
+
+ memset(s, 0, SIZE);
+ for (i = 0; i < SIZE; i++) {
+ char c;
+
+ do {
+ c = random() & 0x7f;
+ } while (!c);
+ s[i] = c;
+ test_one(s);
+ }
+
+ for (i = 0; i < ITERATIONS; i++) {
+ unsigned long j;
+
+ for (j = 0; j < SIZE; j++) {
+ char c;
+
+ do {
+ c = random() & 0x7f;
+ } while (!c);
+ s[j] = c;
+ }
+ for (j = 0; j < sizeof(long); j++) {
+ s[SIZE - 1 - j] = 0;
+ test_one(s);
+ }
+ }
+
+ for (i = 0; i < SIZE; i++) {
+ char c;
+
+ do {
+ c = random() & 0x7f;
+ } while (!c);
+ s[i] = c;
+ }
+
+ bench_test(s);
+
+ s[16] = 0;
+ bench_test(s);
+
+ s[8] = 0;
+ bench_test(s);
+
+ s[4] = 0;
+ bench_test(s);
+
+ s[3] = 0;
+ bench_test(s);
+
+ s[2] = 0;
+ bench_test(s);
+
+ s[1] = 0;
+ bench_test(s);
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(testcase, "strlen");
+}
--
2.13.3
^ permalink raw reply related
* [PATCH v8 1/4] selftests/powerpc: add test for 32 bits memcmp
From: Christophe Leroy @ 2018-08-01 9:01 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
wei.guo.simon, segher
Cc: linux-kernel, linuxppc-dev
This patch renames memcmp test to memcmp_64 and adds
a memcmp_32 test for testing the 32 bits version of memcmp()
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
v8: rebased on latest powerpc/merge
v7: no change
v6: no change
v5: no change
v4: new
tools/testing/selftests/powerpc/stringloops/Makefile | 14 +++++++++++---
tools/testing/selftests/powerpc/stringloops/memcmp_32.S | 1 +
2 files changed, 12 insertions(+), 3 deletions(-)
create mode 120000 tools/testing/selftests/powerpc/stringloops/memcmp_32.S
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
index c60c6172dd3c..3862256c2b7d 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -1,10 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
# The loops are all 64-bit code
-CFLAGS += -m64 -maltivec
CFLAGS += -I$(CURDIR)
-TEST_GEN_PROGS := memcmp
-EXTRA_SOURCES := memcmp_64.S ../harness.c
+EXTRA_SOURCES := ../harness.c
+
+$(OUTPUT)/memcmp_64: memcmp.c
+$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
+
+$(OUTPUT)/memcmp_32: memcmp.c
+$(OUTPUT)/memcmp_32: CFLAGS += -m32
+
+ASFLAGS = $(CFLAGS)
+
+TEST_GEN_PROGS := memcmp_32 memcmp_64
include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_32.S b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
new file mode 120000
index 000000000000..056f2b3af789
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_32.S
\ No newline at end of file
--
2.13.3
^ permalink raw reply related
* Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100
From: Alexey Kardashevskiy @ 2018-08-01 8:37 UTC (permalink / raw)
To: Alex Williamson
Cc: Benjamin Herrenschmidt, linuxppc-dev, David Gibson, kvm-ppc,
Ram Pai, kvm, Alistair Popple
In-Reply-To: <20180731082905.5c7cdeee@t450s.home>
On 01/08/2018 00:29, Alex Williamson wrote:
> On Tue, 31 Jul 2018 14:03:35 +1000
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
>> On 31/07/2018 02:29, Alex Williamson wrote:
>>> On Mon, 30 Jul 2018 18:58:49 +1000
>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>> After some local discussions, it was pointed out that force disabling
>>>> nvlinks won't bring us much as for an nvlink to work, both sides need to
>>>> enable it so malicious guests cannot penetrate good ones (or a host)
>>>> unless a good guest enabled the link but won't happen with a well
>>>> behaving guest. And if two guests became malicious, then can still only
>>>> harm each other, and so can they via other ways such network. This is
>>>> different from PCIe as once PCIe link is unavoidably enabled, a well
>>>> behaving device cannot firewall itself from peers as it is up to the
>>>> upstream bridge(s) now to decide the routing; with nvlink2, a GPU still
>>>> has means to protect itself, just like a guest can run "firewalld" for
>>>> network.
>>>>
>>>> Although it would be a nice feature to have an extra barrier between
>>>> GPUs, is inability to block the links in hypervisor still a blocker for
>>>> V100 pass through?
>>>
>>> How is the NVLink configured by the guest, is it 'on'/'off' or are
>>> specific routes configured?
>>
>> The GPU-GPU links need not to be blocked and need to be enabled
>> (==trained) by a driver in the guest. There are no routes between GPUs
>> in NVLink fabric, these are direct links, it is just a switch on each
>> side, both switches need to be on for a link to work.
>
> Ok, but there is at least the possibility of multiple direct links per
> GPU, the very first diagram I find of NVlink shows 8 interconnected
> GPUs:
>
> https://www.nvidia.com/en-us/data-center/nvlink/
Our design is like the left part of the picture but it is just a detail.
> So if each switch enables one direct, point to point link, how does the
> guest know which links to open for which peer device?
It uses PCI config space on GPUs to discover the topology.
> And of course
> since we can't see the spec, a security audit is at best hearsay :-\
Yup, the exact discovery protocol is hidden.
>> The GPU-CPU links - the GPU bit is the same switch, the CPU NVlink state
>> is controlled via the emulated PCI bridges which I pass through together
>> with the GPU.
>
> So there's a special emulated switch, is that how the guest knows which
> GPUs it can enable NVLinks to?
Since it only has PCI config space (there is nothing relevant in the
device tree at all), I assume (double checking with the NVIDIA folks
now) the guest driver enables them all, tests which pair works and
disables the ones which do not. This gives a malicious guest a tiny
window of opportunity to break into a good guest. Hm :-/
>>> If the former, then isn't a non-malicious
>>> guest still susceptible to a malicious guest?
>>
>> A non-malicious guest needs to turn its switch on for a link to a GPU
>> which belongs to a malicious guest.
>
> Actual security, or obfuscation, will we ever know...
>>>> If the latter, how is
>>> routing configured by the guest given that the guest view of the
>>> topology doesn't match physical hardware? Are these routes
>>> deconfigured by device reset? Are they part of the save/restore
>>> state? Thanks,
>
> Still curious what happens to these routes on reset. Can a later user
> of a GPU inherit a device where the links are already enabled? Thanks,
I am told that the GPU reset disables links. As a side effect, we get an
HMI (a hardware fault which resets the host machine) when trying to
access the GPU RAM, which indicates that the link is down as the
memory is only accessible via the nvlink. We have special fencing code
in our host firmware (skiboot) to fence this memory on PCI reset so
reading from it returns zeroes instead of HMIs.
--
Alexey
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Will Deacon @ 2018-08-01 8:16 UTC (permalink / raw)
To: Benjamin Herrenschmidt
Cc: Christoph Hellwig, Michael S. Tsirkin, Anshuman Khandual,
virtualization, linux-kernel, linuxppc-dev, aik, robh, joe,
elfring, david, jasowang, mpe, linuxram, haren, paulus, srikar,
robin.murphy, jean-philippe.brucker, marc.zyngier
In-Reply-To: <3d6e81511571260de1c8047aaffa8ac4df093d2e.camel@kernel.crashing.org>
On Tue, Jul 31, 2018 at 03:36:22PM -0500, Benjamin Herrenschmidt wrote:
> On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote:
> > > However the question people raise is that DMA API is already full of
> > > arch-specific tricks the likes of which are outlined in your post linked
> > > above. How is this one much worse?
> >
> > None of these warts is visible to the driver, they are all handled in
> > the architecture (possibly on a per-bus basis).
> >
> > So for virtio we really need to decide if it has one set of behavior
> > as specified in the virtio spec, or if it behaves exactly as if it
> > was on a PCI bus, or in fact probably both as you lined up. But no
> > magic arch specific behavior inbetween.
>
> The only arch specific behaviour is needed in the case where it doesn't
> behave like PCI. In this case, the PCI DMA ops are not suitable, but in
> our secure VMs, we still need to make it use swiotlb in order to bounce
> through non-secure pages.
On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
transport (so definitely not PCI) have historically been advertised by qemu
as not being cache coherent, but because the virtio core has bypassed DMA
ops then everything has happened to work. If we blindly enable the arch DMA
ops, we'll plumb in the non-coherent ops and start getting data corruption,
so we do need a way to quirk virtio as being "always coherent" if we want to
use the DMA ops (which we do, because our emulation platforms have an IOMMU
for all virtio devices).
Will
^ permalink raw reply
* Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
From: Segher Boessenkool @ 2018-08-01 7:49 UTC (permalink / raw)
To: Joe Perches
Cc: Christophe LEROY, Murilo Opsfelder Araujo, linux-kernel,
Michael Neuling, Simon Guo, Nicholas Piggin, Paul Mackerras,
Eric W . Biederman, Andrew Donnellan, Alastair D'Silva,
Sukadev Bhattiprolu, linuxppc-dev, Cyril Bur, Tobin C . Harding
In-Reply-To: <afa25468e62ae3259878d8a95e9ac96f6be9f88a.camel@perches.com>
On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote:
> On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > I would suggest to instead use a function like this:
> >
> > static const char *signame(int signr)
> > {
> > if (signr == SIGBUS)
> > return "bus error";
> > if (signr == SIGFPE)
> > return "floating point exception";
> > if (signr == SIGILL)
> > return "illegal instruction";
> > if (signr == SIGILL)
> > return "segfault";
> > if (signr == SIGTRAP)
> > return "unhandled trap";
> > return "unknown signal";
> > }
>
> trivia:
>
> Unless the if tests are ordered most to least likely,
> perhaps it would be better to use a switch/case and
> let the compiler decide.
That would also show there are two entries for SIGILL (here and in the
original patch), one of them very wrong.
Check the table with psignal or something?
Segher
^ permalink raw reply
* Re: [next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
From: jianchao.wang @ 2018-08-01 7:19 UTC (permalink / raw)
To: Abdul Haleem, linuxppc-dev, Madhani, Himanshu
Cc: linux-block, linux-fsdevel, linux-ext4, linux-scsi, linux-next,
Stephen Rothwell, linux-kernel, jejb, Jens Axboe, dgilbert,
bart.vanassche, rosattig, kyle.mahlkuch
In-Reply-To: <1533105183.23332.15.camel@abdul>
Hi Abdul
On 08/01/2018 02:33 PM, Abdul Haleem wrote:
> # mkfs -t ext4 /dev/mapper/mpatha
> mke2fs 1.43.1 (08-Jun-2016)
> Found a dos partition table in /dev/mapper/mpatha
> Proceed anyway? (y,n) y
> Discarding device blocks:
> qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002.
> qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002.
> qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002.
> qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002.
> WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
...
> NIP [c000000000690080] scsi_end_request+0x250/0x280
> LR [c00000000068fe80] scsi_end_request+0x50/0x280
> Call Trace:
> [c00000027d39b600] [c00000000068fe80] scsi_end_request+0x50/0x280 (unreliable)
> [c00000027d39b660] [c0000000006904ac] scsi_io_completion+0x29c/0x7d0
> [c00000027d39b710] [c0000000006848e4] scsi_finish_command+0x104/0x1c0
> [c00000027d39b790] [c00000000068f148] scsi_softirq_done+0x198/0x1f0
> [c00000027d39b820] [c0000000004f2b80] blk_mq_complete_request+0x130/0x1d0
> [c00000027d39b860] [c00000000068d27c] scsi_mq_done+0x2c/0xe0
> [c00000027d39b890] [d000000004291080] qla2xxx_qpair_sp_compl+0xa8/0x140 [qla2xxx]
> [c00000027d39b900] [d0000000042cc9d0] qla2x00_process_completed_request+0x68/0x140 [qla2xxx]
> ------------[ cut here ]------------
> kernel BUG at block/blk-core.c:3196!
blk_finish_request
BUG_ON(blk_queued_rq(req))
We are also seeing a similar issue on qla2xxx:
the BUG_ON in blk_finish_request is triggered when lots of commands are being aborted.
The root cause appears to be that the qla2xxx driver still invokes scsi_done for an aborted command,
which causes a race between the requeue path and the normal completion path.
Add Himanshu Madhani from qlogic team.
It seems that they are working on this.
Thanks
Jianchao
^ permalink raw reply
* Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
From: Joe Perches @ 2018-08-01 7:03 UTC (permalink / raw)
To: Christophe LEROY, Murilo Opsfelder Araujo, linux-kernel
Cc: Alastair D'Silva, Andrew Donnellan, Balbir Singh,
Benjamin Herrenschmidt, Cyril Bur, Eric W . Biederman,
Michael Ellerman, Michael Neuling, Nicholas Piggin,
Paul Mackerras, Simon Guo, Sukadev Bhattiprolu, Tobin C . Harding,
linuxppc-dev
In-Reply-To: <631e9a9b-dbbe-ede7-eb81-81520cc36ad5@c-s.fr>
On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > This adds a human-readable name in the unhandled signal message.
> > Before this patch, a page fault looked like:
> > pandafault[6303]: unhandled signal 11 at 100007d0 nip 1000061c lr 7fff93c55100 code 2 in pandafault[10000000+10000]
> > After this patch, a page fault looks like:
> > pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 7fffb63e5100 code 2 in pandafault[13a2a0000+10000]
]]
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
[]
> > @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler);
> > #define TM_DEBUG(x...) do { } while(0)
> > #endif
> >
> > +static const char *signames[SIGRTMIN + 1] = {
> > + "UNKNOWN",
> > + "SIGHUP", // 1
> > + "SIGINT", // 2
[]
> I don't think is is worth having that full table when we only use a few
> of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/)
>
> I would suggest to instead use a function like this:
>
> static const char *signame(int signr)
> {
> if (signr == SIGBUS)
> return "bus error";
> if (signr == SIGFPE)
> return "floating point exception";
> if (signr == SIGILL)
> return "illegal instruction";
> if (signr == SIGILL)
> return "segfault";
> if (signr == SIGTRAP)
> return "unhandled trap";
> return "unknown signal";
> }
trivia:
Unless the if tests are ordered most to least likely,
perhaps it would be better to use a switch/case and
let the compiler decide.
switch (signr) {
case SIGBUS: return "bus error";
case SIGFPE: return "floating point exception";
case SIGILL: return "illegal instruction";
case SIGSEGV: return "segfault";
case SIGTRAP: return "unhandled trap";
}
return "unknown signal";
}
^ permalink raw reply
* Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()
From: Christophe LEROY @ 2018-08-01 6:41 UTC (permalink / raw)
To: Murilo Opsfelder Araujo, linux-kernel
Cc: Alastair D'Silva, Andrew Donnellan, Balbir Singh,
Benjamin Herrenschmidt, Cyril Bur, Eric W . Biederman,
Joe Perches, Michael Ellerman, Michael Neuling, Nicholas Piggin,
Paul Mackerras, Simon Guo, Sukadev Bhattiprolu, Tobin C . Harding,
linuxppc-dev
In-Reply-To: <20180731145020.14009-10-muriloo@linux.ibm.com>
Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> Remove "Instruction dump:" line by adding a prefix to display current->comm
> and current->pid, along with the instructions dump.
>
> The prefix can serve as a glue that links the instructions dump to its
> originator, allowing messages to be interleaved in the logs.
>
> Before this patch, a page fault looked like:
>
> pandafault[10524]: segfault (11) at 100007d0 nip 1000061c lr 7fffbd295100 code 2 in pandafault[10000000+10000]
> Instruction dump:
> 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe
> 392988d0 f93f0020 e93f0020 39400048 <99490000> 39200000 7d234b78 383f0040
>
> After this patch, it looks like:
>
> pandafault[10850]: segfault (11) at 100007d0 nip 1000061c lr 7fff9f3e5100 code 2 in pandafault[10000000+10000]
> pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe
> pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <99490000> 39200000 7d234b78 383f0040
>
> Signed-off-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
Does the script scripts/decode_stacktrace.sh also work with this format
change?
> ---
> arch/powerpc/kernel/process.c | 7 +++++--
> 1 file changed, 5 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index e78799a8855a..d12143e7d8f9 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1265,16 +1265,19 @@ static int instructions_to_print = 16;
> void show_instructions(struct pt_regs *regs)
> {
> int i;
> + const char *prefix = KERN_INFO "%s[%d]: code: ";
> unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
> sizeof(int));
>
> - printk("Instruction dump:");
> + printk(prefix, current->comm, current->pid);
>
> for (i = 0; i < instructions_to_print; i++) {
> int instr;
>
> - if (!(i % 8))
> + if (!(i % 8) && (i > 0)) {
> pr_cont("\n");
> + printk(prefix, current->comm, current->pid);
> + }
>
> #if !defined(CONFIG_BOOKE)
> /* If executing with the IMMU off, adjust pc rather
>
^ permalink raw reply
* Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
From: Christophe LEROY @ 2018-08-01 6:37 UTC (permalink / raw)
To: Murilo Opsfelder Araujo, linux-kernel
Cc: Alastair D'Silva, Andrew Donnellan, Balbir Singh,
Benjamin Herrenschmidt, Cyril Bur, Eric W . Biederman,
Joe Perches, Michael Ellerman, Michael Neuling, Nicholas Piggin,
Paul Mackerras, Simon Guo, Sukadev Bhattiprolu, Tobin C . Harding,
linuxppc-dev
In-Reply-To: <20180731145020.14009-6-muriloo@linux.ibm.com>
Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> This adds a human-readable name in the unhandled signal message.
>
> Before this patch, a page fault looked like:
>
> pandafault[6303]: unhandled signal 11 at 100007d0 nip 1000061c lr 7fff93c55100 code 2 in pandafault[10000000+10000]
>
> After this patch, a page fault looks like:
>
> pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 7fffb63e5100 code 2 in pandafault[13a2a0000+10000]
>
> Signed-off-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
> ---
> arch/powerpc/kernel/traps.c | 39 +++++++++++++++++++++++++++++++++++--
> 1 file changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 1c4f06fca370..e71f12bca146 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler);
> #define TM_DEBUG(x...) do { } while(0)
> #endif
>
> +static const char *signames[SIGRTMIN + 1] = {
> + "UNKNOWN",
> + "SIGHUP", // 1
> + "SIGINT", // 2
> + "SIGQUIT", // 3
> + "SIGILL", // 4
> + "unhandled trap", // 5 = SIGTRAP
> + "SIGABRT", // 6 = SIGIOT
> + "bus error", // 7 = SIGBUS
> + "floating point exception", // 8 = SIGFPE
> + "illegal instruction", // 9 = SIGILL
> + "SIGUSR1", // 10
> + "segfault", // 11 = SIGSEGV
> + "SIGUSR2", // 12
> + "SIGPIPE", // 13
> + "SIGALRM", // 14
> + "SIGTERM", // 15
> + "SIGSTKFLT", // 16
> + "SIGCHLD", // 17
> + "SIGCONT", // 18
> + "SIGSTOP", // 19
> + "SIGTSTP", // 20
> + "SIGTTIN", // 21
> + "SIGTTOU", // 22
> + "SIGURG", // 23
> + "SIGXCPU", // 24
> + "SIGXFSZ", // 25
> + "SIGVTALRM", // 26
> + "SIGPROF", // 27
> + "SIGWINCH", // 28
> + "SIGIO", // 29 = SIGPOLL = SIGLOST
> + "SIGPWR", // 30
> + "SIGSYS", // 31 = SIGUNUSED
> +};
I don't think it is worth having that full table when we only use a few
of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/)
I would suggest to instead use a function like this:
/*
 * signame - describe a signal for the unhandled-signal log message.
 * @signr: signal number to describe
 *
 * Returns a constant, human-readable string for the handful of signals
 * that get a dedicated description; every other signal number yields
 * "unknown signal".
 *
 * A switch is used rather than an if-chain so that a duplicated case
 * label (e.g. testing SIGILL twice where SIGSEGV was meant) is rejected
 * at compile time instead of silently making one entry unreachable.
 */
static const char *signame(int signr)
{
	switch (signr) {
	case SIGBUS:
		return "bus error";
	case SIGFPE:
		return "floating point exception";
	case SIGILL:
		return "illegal instruction";
	case SIGSEGV:
		return "segfault";
	case SIGTRAP:
		return "unhandled trap";
	default:
		return "unknown signal";
	}
}
Christophe
> +
> /*
> * Trap & Exception support
> */
> @@ -314,8 +349,8 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code,
> if (!unhandled_signal(current, signr))
> return;
>
> - pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x",
> - current->comm, current->pid, signr,
> + pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
> + current->comm, current->pid, signames[signr], signr,
> addr, regs->nip, regs->link, code);
>
> print_vma_addr(KERN_CONT " in ", regs->nip);
>
^ permalink raw reply
* [next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
From: Abdul Haleem @ 2018-08-01 6:33 UTC (permalink / raw)
To: linuxppc-dev
Cc: linux-block, linux-fsdevel, linux-ext4, linux-scsi, linux-next,
Stephen Rothwell, linux-kernel, jejb, Jens Axboe, dgilbert,
bart.vanassche, rosattig, kyle.mahlkuch
Greetings
linux-next kernel crashes when creating file system on scsi disks using
mkfs.ext4 (Discarding device blocks)
Machine: Power 8 Power VM LPAR
kernel : 4.18.0-rc6-next-20180727
adapter : Fibre Channel [0c04]: QLogic Corp. ISP2532-based 8Gb Fibre Channel to PCI Express HBA [1077:2532]
Test: mkfs.ext4 /dev/mapper/mpatha
Above command triggers WARN_ON_ONCE at line:
0xc000000000690080 is in scsi_end_request (drivers/scsi/scsi_lib.c:691)
686
687 if (blk_queue_add_random(q))
688 add_disk_randomness(req->rq_disk);
689
690 if (!blk_rq_is_scsi(req)) {
691 WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
692 cmd->flags &= ~SCMD_INITIALIZED;
693 destroy_rcu_head(&cmd->rcu);
694 }
695
followed by kernel crash
# mkfs -t ext4 /dev/mapper/mpatha
mke2fs 1.43.1 (08-Jun-2016)
Found a dos partition table in /dev/mapper/mpatha
Proceed anyway? (y,n) y
Discarding device blocks:
qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002.
qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002.
qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002.
qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002.
WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) kvm_pr(E)
kvm(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) ipt_REJECT(E)
nf_reject_ipv4(E) xt_conntrack(E) ip_set(E) nfnetlink(E) ebtable_nat(E)
ebtable_broute(E) bridge(E) stp(E) llc(E) ip6table_raw(E)
ip6table_nat(E) nf_nat_ipv6(E) ip6table_security(E) ip6table_mangle(E)
iptable_raw(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E)
nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_security(E)
iptable_mangle(E) ebtable_filter(E) ebtables(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) ip_tables(E) dm_service_time(E) xts(E)
vmx_crypto(E) pseries_rng(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E)
grace(E) sunrpc(E) dm_multipath(E) dm_mod(E) sch_fq_codel(E) ext4(E)
mbcache(E) jbd2(E) fscrypto(E) sr_mod(E)
cdrom(E) sd_mod(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E)
qla2xxx(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) scsi_transport_fc(E)
tg3(E)
CPU: 12 PID: 511 Comm: kworker/12:2 Tainted: G E 4.18.0-rc6-next-20180727-autotest-autotest #1
Workqueue: qla2xxx_wq qla25xx_free_rsp_que [qla2xxx]
NIP: c000000000690080 LR: c00000000068fe80 CTR: d000000005050788
REGS: c00000027d39b380 TRAP: 0700 Tainted: G E (4.18.0-rc6-next-20180727-autotest-autotest)
MSR: 800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 24002824 XER: 20000000
CFAR: c00000000068feb8 IRQMASK: 1
GPR00: c00000000068fe80 c00000027d39b600 c00000000113de00 0000000000000000
GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000001
GPR08: c00000027dce3000 0000000000000001 0000000000000000 d000000004330708
GPR12: 0000000000002200 c00000000ec57600 c00000000012cf78 c000000289561b40
GPR16: 0000000000000000 0000000000000000 0000000000044000 0000000000004007
GPR20: c00000027dce3860 0000000000000000 0000000000000000 c000000280db9c60
GPR24: 0000000000000002 c00000000153567c c0000002712a1cc8 0000000000000000
GPR28: 0000000000000000 c0000002712a1cc8 c00000027e217000 c0000002708efe00
NIP [c000000000690080] scsi_end_request+0x250/0x280
LR [c00000000068fe80] scsi_end_request+0x50/0x280
Call Trace:
[c00000027d39b600] [c00000000068fe80] scsi_end_request+0x50/0x280 (unreliable)
[c00000027d39b660] [c0000000006904ac] scsi_io_completion+0x29c/0x7d0
[c00000027d39b710] [c0000000006848e4] scsi_finish_command+0x104/0x1c0
[c00000027d39b790] [c00000000068f148] scsi_softirq_done+0x198/0x1f0
[c00000027d39b820] [c0000000004f2b80] blk_mq_complete_request+0x130/0x1d0
[c00000027d39b860] [c00000000068d27c] scsi_mq_done+0x2c/0xe0
[c00000027d39b890] [d000000004291080] qla2xxx_qpair_sp_compl+0xa8/0x140 [qla2xxx]
[c00000027d39b900] [d0000000042cc9d0] qla2x00_process_completed_request+0x68/0x140 [qla2xxx]
------------[ cut here ]------------
kernel BUG at block/blk-core.c:3196!
[c00000027d39b970] [d0000000042cd4ac] qla2x00_status_entry.isra.7+0x4f4/0x1750 [qla2xxx]
Oops: Exception in kernel mode, sig: 5 [#1]
[c00000027d39bb00] [d0000000042cfab0] qla24xx_process_response_queue+0x7d8/0xba0 [qla2xxx]
LE SMP NR_CPUS=2048
[c00000027d39bc40] [d0000000042f8c48] qla25xx_free_rsp_que+0x1f0/0x270 [qla2xxx]
NUMA pSeries
[c00000027d39bc80] [c00000000012478c] process_one_work+0x24c/0x500
Modules linked in: xt_CHECKSUM(E)
[c00000027d39bd20] [c000000000124acc] worker_thread+0x8c/0x590
ipt_MASQUERADE(E)
[c00000027d39bdc0] [c00000000012d0c8] kthread+0x158/0x1a0
tun(E)
[c00000027d39be30] [c00000000000b65c] ret_from_kernel_thread+0x5c/0x80
kvm_pr(E)
Instruction dump:
kvm(E)
7f23cb78
ip6t_rpfilter(E)
4bed1d15
ip6t_REJECT(E)
60000000 3c62003f
nf_reject_ipv6(E)
e8637888
ipt_REJECT(E)
7f24cb78
nf_reject_ipv4(E)
4bc9ade1
xt_conntrack(E)
60000000
ip_set(E)
e93f0258
nfnetlink(E)
e9290120
ebtable_nat(E)
fb4900c0
ebtable_broute(E)
4bffff48
bridge(E)
<0fe00000>
stp(E)
4bfffe38
llc(E)
60000000 60000000
ip6table_raw(E)
---[ end trace ee1b33dccf2a8b8a ]---
ip6table_nat(E) nf_nat_ipv6(E) ip6table_security(E) ip6table_mangle(E)
iptable_raw(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E)
nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_security(E)
iptable_mangle(E) ebtable_filter(E) ebtables(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) ip_tables(E) dm_service_time(E) xts(E)
vmx_crypto(E) pseries_rng(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E)
grace(E) sunrpc(E) dm_multipath(E) dm_mod(E) sch_fq_codel(E) ext4(E)
mbcache(E) jbd2(E) fscrypto(E) sr_mod(E) cdrom(E) sd_mod(E) ibmvscsi(E)
scsi_transport_srp(E) ibmveth(E) qla2xxx(E) nvme_fc(E) nvme_fabrics(E)
nvme_core(E) scsi_transport_fc(E) tg3(E)
CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W E 4.18.0-rc6-next-20180727-autotest-autotest #1
NIP: c0000000004e5160 LR: c0000000004e529c CTR: c000000000022a30
REGS: c00000028ffb39f0 TRAP: 0700 Tainted: G W E (4.18.0-rc6-next-20180727-autotest-autotest)
MSR: 8000000000029033 <SF,EE,ME,IR,DR,RI,LE> CR: 28244482 XER: 2000000f
CFAR: c0000000004e52a4 IRQMASK: 1
GPR00: c0000000004e529c c00000028ffb3c70 c00000000113de00 c00000028911cc00
GPR04: 0000002461ee9abb ffffffffffffffff 0000000001f401d3 0010d9418d5afe73
GPR08: 0000000000000018 0000000000000001 00000000000005ff d000000005b89418
GPR12: c000000000022a30 c00000000eca8a00 c00000028bcc3f90 0000000000000000
GPR16: 0000000000000001 c000000000db7200 c000000001163b00 00000000ffffc7db
GPR20: 000000000000000a c000000000db7200 c000000001162200 c00000028ffb0000
GPR24: c00000028ffb0000 0000000000000100 0000000000000028 c000000001162200
GPR28: 0000000000000000 0000002461ee9abb c00000027de90998 c00000028911cc00
NIP [c0000000004e5160] blk_finish_request+0x70/0x200
LR [c0000000004e529c] blk_finish_request+0x1ac/0x200
Call Trace:
[c00000028ffb3c70] [c0000000004e529c] blk_finish_request+0x1ac/0x200 (unreliable)
[c00000028ffb3cb0] [c0000000004e5360] blk_end_bidi_request+0x70/0xc0
[c00000028ffb3cf0] [c0000000004e540c] blk_end_request_all+0x2c/0x60
[c00000028ffb3d10] [d000000005b86ae4] map_request+0x8dc/0x920 [dm_mod]
[c00000028ffb3db0] [c0000000004f0470] blk_done_softirq+0xc0/0xf0
[c00000028ffb3df0] [c0000000009e5ed0] __do_softirq+0x170/0x3ac
[c00000028ffb3ef0] [c000000000108690] irq_exit+0x1b0/0x1d0
[c00000028ffb3f20] [c000000000016ac8] __do_irq+0x98/0x210
[c00000028ffb3f90] [c000000000028830] call_do_irq+0x14/0x24
[c00000028bcc39e0] [c000000000016cd4] do_IRQ+0x94/0x140
[c00000028bcc3a30] [c000000000008c24] hardware_interrupt_common+0x114/0x120
--- interrupt: 501 at plpar_hcall_norets+0x1c/0x28
LR = check_and_cede_processor+0x2c/0x40
[c00000028bcc3d20] [c00000028bcc0000] 0xc00000028bcc0000 (unreliable)
[c00000028bcc3d80] [c0000000007b8cb8] shared_cede_loop+0x48/0x130
[c00000028bcc3db0] [c0000000007b5fd0] cpuidle_enter_state+0xa0/0x3f0
[c00000028bcc3e10] [c000000000144ae4] call_cpuidle+0x44/0x90
[c00000028bcc3e30] [c0000000001450d8] do_idle+0x328/0x3a0
[c00000028bcc3ec0] [c000000000145354] cpu_startup_entry+0x34/0x40
[c00000028bcc3ef0] [c0000000000495e0] start_secondary+0x4d0/0x510
[c00000028bcc3f90] [c00000000000ab70] start_secondary_prolog+0x10/0x14
Instruction dump:
2fa90000 409e01ac 813f0018 752a0002 40820150 712a0004 40820128 7fe9fb78
e9490041 7d295278 3149ffff 7d2a4910 <0b090000> 3d220037 39290e84 81290000
---[ end trace ee1b33dccf2a8b8b ]---
Kernel panic - not syncing: Fatal exception in interrupt
WARNING: CPU: 5 PID: 0 at drivers/tty/vt/vt.c:4220 do_unblank_screen+0x1f0/0x280
Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) kvm_pr(E)
kvm(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) ipt_REJECT(E)
nf_reject_ipv4(E) xt_conntrack(E) ip_set(E) nfnetlink(E) ebtable_nat(E)
ebtable_broute(E) bridge(E) stp(E) llc(E) ip6table_raw(E)
ip6table_nat(E) nf_nat_ipv6(E) ip6table_security(E) ip6table_mangle(E)
iptable_raw(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E)
nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_security(E)
iptable_mangle(E) ebtable_filter(E) ebtables(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) ip_tables(E) dm_service_time(E) xts(E)
vmx_crypto(E) pseries_rng(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E)
grace(E) sunrpc(E) dm_multipath(E) dm_mod(E) sch_fq_codel(E) ext4(E)
mbcache(E) jbd2(E) fscrypto(E) sr_mod(E)
cdrom(E) sd_mod(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E)
qla2xxx(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) scsi_transport_fc(E)
tg3(E)
CPU: 5 PID: 0 Comm: swapper/5 Tainted: G D W E 4.18.0-rc6-next-20180727-autotest-autotest #1
NIP: c000000000605360 LR: c0000000006051ac CTR: c00000000002c0f0
REGS: c00000028ffb3320 TRAP: 0700 Tainted: G D W E (4.18.0-rc6-next-20180727-autotest-autotest)
MSR: 8000000000021033 <SF,ME,IR,DR,RI,LE> CR: 88242222 XER: 20000009
CFAR: c0000000006051c0 IRQMASK: 3
GPR00: c0000000006053a4 c00000028ffb35a0 c00000000113de00 0000000000000000
GPR04: 0000000000000003 000000000000000a 0000000030656665 6320303030303030
GPR08: 000000028eeb0000 0000000000000000 0000000000000000 3030303731326537
GPR12: c00000000002c0f0 c00000000eca8a00 c00000028bcc3f90 0000000000000000
GPR16: 0000000000000001 c000000000db7200 c000000001163b00 00000000ffffc7db
GPR20: 000000000000000a c000000000db7200 c000000001162200 c00000028ffb0000
GPR24: c00000028ffb0000 0000000000000100 c000000000feb688 c0000000012f6658
GPR28: c0000000004e5160 0000000000000000 c0000000014050c0 0000000000000000
NIP [c000000000605360] do_unblank_screen+0x1f0/0x280
LR [c0000000006051ac] do_unblank_screen+0x3c/0x280
Call Trace:
[c00000028ffb35a0] [c0000000006053a4] do_unblank_screen+0x234/0x280 (unreliable)
[c00000028ffb3620] [c00000000052bbf0] bust_spinlocks+0x40/0x80
[c00000028ffb3640] [c000000000100200] panic+0x1b4/0x314
[c00000028ffb36e0] [c000000000023d48] oops_end+0x1d8/0x200
[c00000028ffb3760] [c000000000024254] _exception_pkey+0x1c4/0x1f0
[c00000028ffb3900] [c000000000026330] program_check_exception+0x2c0/0x3e0
[c00000028ffb3980] [c000000000008e94] program_check_common+0x134/0x140
--- interrupt: 700 at blk_finish_request+0x70/0x200
LR = blk_finish_request+0x1ac/0x200
[c00000028ffb3cb0] [c0000000004e5360] blk_end_bidi_request+0x70/0xc0
[c00000028ffb3cf0] [c0000000004e540c] blk_end_request_all+0x2c/0x60
[c00000028ffb3d10] [d000000005b86ae4] map_request+0x8dc/0x920 [dm_mod]
[c00000028ffb3db0] [c0000000004f0470] blk_done_softirq+0xc0/0xf0
[c00000028ffb3df0] [c0000000009e5ed0] __do_softirq+0x170/0x3ac
[c00000028ffb3ef0] [c000000000108690] irq_exit+0x1b0/0x1d0
[c00000028ffb3f20] [c000000000016ac8] __do_irq+0x98/0x210
[c00000028ffb3f90] [c000000000028830] call_do_irq+0x14/0x24
[c00000028bcc39e0] [c000000000016cd4] do_IRQ+0x94/0x140
[c00000028bcc3a30] [c000000000008c24] hardware_interrupt_common+0x114/0x120
--- interrupt: 501 at plpar_hcall_norets+0x1c/0x28
LR = check_and_cede_processor+0x2c/0x40
[c00000028bcc3d20] [c00000028bcc0000] 0xc00000028bcc0000 (unreliable)
[c00000028bcc3d80] [c0000000007b8cb8] shared_cede_loop+0x48/0x130
[c00000028bcc3db0] [c0000000007b5fd0] cpuidle_enter_state+0xa0/0x3f0
[c00000028bcc3e10] [c000000000144ae4] call_cpuidle+0x44/0x90
[c00000028bcc3e30] [c0000000001450d8] do_idle+0x328/0x3a0
[c00000028bcc3ec0] [c000000000145354] cpu_startup_entry+0x34/0x40
[c00000028bcc3ef0] [c0000000000495e0] start_secondary+0x4d0/0x510
[c00000028bcc3f90] [c00000000000ab70] start_secondary_prolog+0x10/0x14
Instruction dump:
4bb79dc5 60000000 38210080 e8010010 eba1ffe8 ebc1fff0 ebe1fff8 7c0803a6
4e800020 60000000 60000000 60000000 <0fe00000> 4bfffe60 60000000 60000000
---[ end trace ee1b33dccf2a8b8c ]---
Rebooting in 10 seconds..
# lsblk
NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
sdd 8:48 0 30G 0 disk
└─mpatha 253:1 0 30G 0 mpath
sdb 8:16 0 30G 0 disk
└─mpatha 253:1 0 30G 0 mpath
# lsscsi
[0:0:1:0] disk AIX VDASD 0001 /dev/sda
[1:0:0:0] disk IBM 2145 0000 /dev/sdb
[1:0:1:0] disk IBM 2145 0000 /dev/sdd
# lsscsi -H
[0] ibmvscsi
[1] qla2xxx
--
Regards,
Abdul Haleem
IBM Linux Technology Centre
^ permalink raw reply
* Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
From: Richard Cochran @ 2018-08-01 6:15 UTC (permalink / raw)
To: Y.b. Lu
Cc: netdev@vger.kernel.org, Madalin-cristian Bucur, Rob Herring,
Shawn Guo, David S . Miller, devicetree@vger.kernel.org,
linuxppc-dev@lists.ozlabs.org,
linux-arm-kernel@lists.infradead.org,
linux-kernel@vger.kernel.org
In-Reply-To: <DB6PR0401MB253643010802739AD55820F3F82D0@DB6PR0401MB2536.eurprd04.prod.outlook.com>
On Wed, Aug 01, 2018 at 04:36:40AM +0000, Y.b. Lu wrote:
> Could I add a function to calculate a set of default register values
> to initialize ptp timer when dts method failed to get required
> properties in driver?
Yes, it would be ideal if the driver can pick correct values
automatically.
However, the frequency on the FIPER outputs can't be configured
automatically, and we don't have an API for the user to choose this.
> I think this will be useful. The ptp timer on new platforms (you may
> see two dts patches in this patchset. Many platforms will be
> affected.) will work without these dts properties. If user want
> specific setting, they can set dts properties.
Sure.
Thanks,
Richard
^ permalink raw reply
* Re: [PATCH v6 8/8] powernv/pseries: consolidate code for mce early handling.
From: Nicholas Piggin @ 2018-08-01 6:10 UTC (permalink / raw)
To: Michal Suchánek
Cc: Mahesh J Salgaonkar, linuxppc-dev, Laurent Dufour,
Michal Suchanek, Aneesh Kumar K.V
In-Reply-To: <20180709180239.3646bf39@kitsune.suse.cz>
On Mon, 9 Jul 2018 18:02:39 +0200
Michal Suchánek <msuchanek@suse.de> wrote:
> On Fri, 6 Jul 2018 19:40:24 +1000
> Nicholas Piggin <npiggin@gmail.com> wrote:
>=20
> > On Wed, 04 Jul 2018 23:30:12 +0530
> > Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> > =20
> > > From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> > >=20
> > > Now that other platforms also implements real mode mce handler,
> > > lets consolidate the code by sharing existing powernv machine check
> > > early code. Rename machine_check_powernv_early to
> > > machine_check_common_early and reuse the code.
> > >=20
> > > Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> > > ---
> > > arch/powerpc/kernel/exceptions-64s.S | 56
> > > +++++++--------------------------- 1 file changed, 11
> > > insertions(+), 45 deletions(-)
> > >=20
> > > diff --git a/arch/powerpc/kernel/exceptions-64s.S
> > > b/arch/powerpc/kernel/exceptions-64s.S index
> > > 0038596b7906..3e877ec55d50 100644 ---
> > > a/arch/powerpc/kernel/exceptions-64s.S +++
> > > b/arch/powerpc/kernel/exceptions-64s.S @@ -243,14 +243,13 @@
> > > EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
> > > SET_SCRATCH0(r13) /* save r13 */
> > > EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION
> > > - b machine_check_powernv_early
> > > + b machine_check_common_early
> > > FTR_SECTION_ELSE
> > > b machine_check_pSeries_0
> > > ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
> > > EXC_REAL_END(machine_check, 0x200, 0x100)
> > > EXC_VIRT_NONE(0x4200, 0x100)
> > > -TRAMP_REAL_BEGIN(machine_check_powernv_early)
> > > -BEGIN_FTR_SECTION
> > > +TRAMP_REAL_BEGIN(machine_check_common_early)
> > > EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > > /*
> > > * Register contents:
> > > @@ -306,7 +305,9 @@ BEGIN_FTR_SECTION
> > > /* Save r9 through r13 from EXMC save area to stack frame.
> > > */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > > mfmsr r11 /* get MSR value */
> > > +BEGIN_FTR_SECTION
> > > ori r11,r11,MSR_ME /* turn on ME bit
> > > */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> > > ori r11,r11,MSR_RI /* turn on RI bit
> > > */ LOAD_HANDLER(r12, machine_check_handle_early)
> > > 1: mtspr SPRN_SRR0,r12
> > > @@ -325,7 +326,6 @@ BEGIN_FTR_SECTION
> > > andc r11,r11,r10 /* Turn off MSR_ME
> > > */ b 1b
> > > b . /* prevent speculative execution */
> > > -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> > > =20
> > > TRAMP_REAL_BEGIN(machine_check_pSeries)
> > > .globl machine_check_fwnmi
> > > @@ -333,7 +333,7 @@ machine_check_fwnmi:
> > > SET_SCRATCH0(r13) /* save r13 */
> > > EXCEPTION_PROLOG_0(PACA_EXMC)
> > > BEGIN_FTR_SECTION
> > > - b machine_check_pSeries_early
> > > + b machine_check_common_early
> > > END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> > > machine_check_pSeries_0:
> > > EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
> > > @@ -346,45 +346,6 @@ machine_check_pSeries_0:
> > > =20
> > > TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
> > > =20
> > > -TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> > > -BEGIN_FTR_SECTION
> > > - EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > > - mr r10,r1 /* Save r1 */
> > > - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency
> > > stack */
> > > - subi r1,r1,INT_FRAME_SIZE /* alloc stack
> > > frame */
> > > - mfspr r11,SPRN_SRR0 /* Save SRR0 */
> > > - mfspr r12,SPRN_SRR1 /* Save SRR1 */
> > > - EXCEPTION_PROLOG_COMMON_1()
> > > - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > > - EXCEPTION_PROLOG_COMMON_3(0x200)
> > > - addi r3,r1,STACK_FRAME_OVERHEAD
> > > - BRANCH_LINK_TO_FAR(machine_check_early) /* Function call
> > > ABI */ -
> > > - /* Move original SRR0 and SRR1 into the respective regs */
> > > - ld r9,_MSR(r1)
> > > - mtspr SPRN_SRR1,r9
> > > - ld r3,_NIP(r1)
> > > - mtspr SPRN_SRR0,r3
> > > - ld r9,_CTR(r1)
> > > - mtctr r9
> > > - ld r9,_XER(r1)
> > > - mtxer r9
> > > - ld r9,_LINK(r1)
> > > - mtlr r9
> > > - REST_GPR(0, r1)
> > > - REST_8GPRS(2, r1)
> > > - REST_GPR(10, r1)
> > > - ld r11,_CCR(r1)
> > > - mtcr r11
> > > - REST_GPR(11, r1)
> > > - REST_2GPRS(12, r1)
> > > - /* restore original r1. */
> > > - ld r1,GPR1(r1)
> > > - SET_SCRATCH0(r13) /* save r13 */
> > > - EXCEPTION_PROLOG_0(PACA_EXMC)
> > > - b machine_check_pSeries_0
> > > -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> > > -
> > > EXC_COMMON_BEGIN(machine_check_common)
> > > /*
> > > * Machine check is different because we use a different
> > > @@ -483,6 +444,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
> > > bl machine_check_early
> > > std r3,RESULT(r1) /* Save result */
> > > ld r12,_MSR(r1)
> > > +BEGIN_FTR_SECTION
> > > + bne 9f /* pSeries: continue
> > > to V mode. */ +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) =20
> >=20
> > Should this be "b 9f" ? Although...
> > =20
> > > =20
> > > #ifdef CONFIG_PPC_P7_NAP
> > > /*
> > > @@ -564,7 +528,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
> > > 9:
> > > /* Deliver the machine check to host kernel in V mode. */
> > > MACHINE_CHECK_HANDLER_WINDUP
> > > - b machine_check_pSeries
> > > + SET_SCRATCH0(r13) /* save r13 */
> > > + EXCEPTION_PROLOG_0(PACA_EXMC)
> > > + b machine_check_pSeries_0 =20
> >=20
> > I'm not sure that's quite right. You're missing out testing the result
> > of the early handler call? Is this buggy in existing code too? We
> > should be testing that result in all cases, shouldn't we? But it
> > doesn't seem like we are. =20
>=20
> At least for the pSeries case the result of realmode handler is stored
> in the MCE log data. Both the real and virtual part of the handler
> should be called in any case. They do different and independent things.
Hmm, well the return code of the function in the powernv case is put
into regs->result which is missing here, but I guess it gets picked
up again when the pseries code is consolidated back with powernv.
But... maybe that result is not actually used anywhere though. I
guess that's okay, maybe we should just make it return void though.
We want to avoid the case of going to virtual mode if we have errors
in the SLB or TLB that causes a recursive MCE, but the recursion
limit will eventually catch that and checkstop us which might be
the sanest option.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH v6 5/8] powerpc/pseries: flush SLB contents on SLB MCE errors.
From: Nicholas Piggin @ 2018-08-01 5:58 UTC (permalink / raw)
To: Mahesh J Salgaonkar
Cc: linuxppc-dev, Aneesh Kumar K.V, Laurent Dufour, Michal Suchanek,
Michael Ellerman
In-Reply-To: <153072708065.29016.482194584457257883.stgit@jupiter.in.ibm.com>
On Wed, 04 Jul 2018 23:28:21 +0530
Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
>
> On pseries, as of today system crashes if we get a machine check
> exceptions due to SLB errors. These are soft errors and can be fixed by
> flushing the SLBs so the kernel can continue to function instead of
> system crash. We do this in real mode before turning on MMU. Otherwise
> we would run into nested machine checks. This patch now fetches the
> rtas error log in real mode and flushes the SLBs on SLB errors.
>
> Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1
> arch/powerpc/include/asm/machdep.h | 1
> arch/powerpc/kernel/exceptions-64s.S | 42 +++++++++++++++++++++
> arch/powerpc/kernel/mce.c | 16 +++++++-
> arch/powerpc/mm/slb.c | 6 +++
> arch/powerpc/platforms/pseries/pseries.h | 1
> arch/powerpc/platforms/pseries/ras.c | 51 +++++++++++++++++++++++++
> arch/powerpc/platforms/pseries/setup.c | 1
> 8 files changed, 116 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 50ed64fba4ae..cc00a7088cf3 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -487,6 +487,7 @@ extern void hpte_init_native(void);
>
> extern void slb_initialize(void);
> extern void slb_flush_and_rebolt(void);
> +extern void slb_flush_and_rebolt_realmode(void);
>
> extern void slb_vmalloc_update(void);
> extern void slb_set_size(u16 size);
> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> index ffe7c71e1132..fe447e0d4140 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -108,6 +108,7 @@ struct machdep_calls {
>
> /* Early exception handlers called in realmode */
> int (*hmi_exception_early)(struct pt_regs *regs);
> + int (*machine_check_early)(struct pt_regs *regs);
>
> /* Called during machine check exception to retrive fixup address. */
> bool (*mce_check_early_recovery)(struct pt_regs *regs);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index f283958129f2..0038596b7906 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
> machine_check_fwnmi:
> SET_SCRATCH0(r13) /* save r13 */
> EXCEPTION_PROLOG_0(PACA_EXMC)
> +BEGIN_FTR_SECTION
> + b machine_check_pSeries_early
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> machine_check_pSeries_0:
> EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
> /*
> @@ -343,6 +346,45 @@ machine_check_pSeries_0:
>
> TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
>
> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> +BEGIN_FTR_SECTION
> + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> + mr r10,r1 /* Save r1 */
> + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */
> + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
> + mfspr r11,SPRN_SRR0 /* Save SRR0 */
> + mfspr r12,SPRN_SRR1 /* Save SRR1 */
> + EXCEPTION_PROLOG_COMMON_1()
> + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> + EXCEPTION_PROLOG_COMMON_3(0x200)
> + addi r3,r1,STACK_FRAME_OVERHEAD
> + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
> +
> + /* Move original SRR0 and SRR1 into the respective regs */
> + ld r9,_MSR(r1)
> + mtspr SPRN_SRR1,r9
> + ld r3,_NIP(r1)
> + mtspr SPRN_SRR0,r3
> + ld r9,_CTR(r1)
> + mtctr r9
> + ld r9,_XER(r1)
> + mtxer r9
> + ld r9,_LINK(r1)
> + mtlr r9
> + REST_GPR(0, r1)
> + REST_8GPRS(2, r1)
> + REST_GPR(10, r1)
> + ld r11,_CCR(r1)
> + mtcr r11
> + REST_GPR(11, r1)
> + REST_2GPRS(12, r1)
> + /* restore original r1. */
> + ld r1,GPR1(r1)
> + SET_SCRATCH0(r13) /* save r13 */
> + EXCEPTION_PROLOG_0(PACA_EXMC)
> + b machine_check_pSeries_0
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> +
> EXC_COMMON_BEGIN(machine_check_common)
> /*
> * Machine check is different because we use a different
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index efdd16a79075..221271c96a57 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
> {
> long handled = 0;
>
> - __this_cpu_inc(irq_stat.mce_exceptions);
> + /*
> + * For pSeries we count mce when we go into virtual mode machine
> + * check handler. Hence skip it. Also, We can't access per cpu
> + * variables in real mode for LPAR.
> + */
> + if (early_cpu_has_feature(CPU_FTR_HVMODE))
> + __this_cpu_inc(irq_stat.mce_exceptions);
>
> - if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> + /*
> + * See if platform is capable of handling machine check.
> + * Otherwise fallthrough and allow CPU to handle this machine check.
> + */
> + if (ppc_md.machine_check_early)
> + handled = ppc_md.machine_check_early(regs);
> + else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> handled = cur_cpu_spec->machine_check_early(regs);
> return handled;
> }
This looks fine to me after Michal's patch. Not sure if you want to
fold them or add his immediately after this one in your series.
> diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
> index 66577cc66dc9..5b1813b98358 100644
> --- a/arch/powerpc/mm/slb.c
> +++ b/arch/powerpc/mm/slb.c
> @@ -145,6 +145,12 @@ void slb_flush_and_rebolt(void)
> get_paca()->slb_cache_ptr = 0;
> }
>
> +void slb_flush_and_rebolt_realmode(void)
> +{
> + __slb_flush_and_rebolt();
> + get_paca()->slb_cache_ptr = 0;
> +}
I think this should do something more like flush_and_reload_slb from
powernv machine check code. We are real mode so should invalidate all
SLBs.
It happens I also need very similar code (without the initial
invalidate) for implementing idle wakeup code in C, so we should move
that function and variants into mm/slb.c IMO.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.
From: Nicholas Piggin @ 2018-08-01 5:49 UTC (permalink / raw)
To: Michal Suchánek
Cc: Mahesh J Salgaonkar, Aneesh Kumar K.V, Laurent Dufour,
linuxppc-dev
In-Reply-To: <20180712154113.46845936@kitsune.suse.cz>
On Thu, 12 Jul 2018 15:41:13 +0200
Michal Suchánek <msuchanek@suse.de> wrote:
> On Tue, 3 Jul 2018 08:08:14 +1000
> "Nicholas Piggin" <npiggin@gmail.com> wrote:
>=20
> > On Mon, 02 Jul 2018 11:17:06 +0530
> > Mahesh J Salgaonkar <mahesh@linux.vnet.ibm.com> wrote:
> > =20
> > > From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> > >=20
> > > On pseries, as of today system crashes if we get a machine check
> > > exceptions due to SLB errors. These are soft errors and can be
> > > fixed by flushing the SLBs so the kernel can continue to function
> > > instead of system crash. We do this in real mode before turning on
> > > MMU. Otherwise we would run into nested machine checks. This patch
> > > now fetches the rtas error log in real mode and flushes the SLBs on
> > > SLB errors.
> > >=20
> > > Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
> > > ---
> > > arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1=20
> > > arch/powerpc/include/asm/machdep.h | 1=20
> > > arch/powerpc/kernel/exceptions-64s.S | 42
> > > +++++++++++++++++++++ arch/powerpc/kernel/mce.c
> > > | 16 +++++++- arch/powerpc/mm/slb.c |
> > > 6 +++ arch/powerpc/platforms/powernv/opal.c | 1=20
> > > arch/powerpc/platforms/pseries/pseries.h | 1=20
> > > arch/powerpc/platforms/pseries/ras.c | 51
> > > +++++++++++++++++++++++++
> > > arch/powerpc/platforms/pseries/setup.c | 1 9 files
> > > changed, 116 insertions(+), 4 deletions(-) =20
> >=20
> > =20
> > > +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> > > +BEGIN_FTR_SECTION
> > > + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > > + mr r10,r1 /* Save r1 */
> > > + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency
> > > stack */
> > > + subi r1,r1,INT_FRAME_SIZE /* alloc stack
> > > frame */
> > > + mfspr r11,SPRN_SRR0 /* Save SRR0 */
> > > + mfspr r12,SPRN_SRR1 /* Save SRR1 */
> > > + EXCEPTION_PROLOG_COMMON_1()
> > > + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > > + EXCEPTION_PROLOG_COMMON_3(0x200)
> > > + addi r3,r1,STACK_FRAME_OVERHEAD
> > > + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call
> > > ABI */ =20
> >=20
> > Is there any reason you can't use the existing
> > machine_check_powernv_early code to do all this?
> > =20
> > > diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> > > index efdd16a79075..221271c96a57 100644
> > > --- a/arch/powerpc/kernel/mce.c
> > > +++ b/arch/powerpc/kernel/mce.c
> > > @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
> > > {
> > > long handled = 0;
> > > =20
> > > - __this_cpu_inc(irq_stat.mce_exceptions);
> > > + /*
> > > + * For pSeries we count mce when we go into virtual mode
> > > machine
> > > + * check handler. Hence skip it. Also, We can't access per
> > > cpu
> > > + * variables in real mode for LPAR.
> > > + */
> > > + if (early_cpu_has_feature(CPU_FTR_HVMODE))
> > > + __this_cpu_inc(irq_stat.mce_exceptions);
> > > =20
> > > - if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> > > + /*
> > > + * See if platform is capable of handling machine check.
> > > + * Otherwise fallthrough and allow CPU to handle this
> > > machine check.
> > > + */
> > > + if (ppc_md.machine_check_early)
> > > + handled = ppc_md.machine_check_early(regs);
> > > + else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> > > handled =
> > > cur_cpu_spec->machine_check_early(regs);
> >=20
> > Would be good to add a powernv ppc_md handler which does the
> > cur_cpu_spec->machine_check_early() call now that other platforms are
> > calling this code. Because those aren't valid as a fallback call, but
> > specific to powernv.
> > =20
>=20
> Something like this (untested)?
Sorry, some emails fell through the cracks. Yes exactly like this would
be good. If you can add a quick changelog and SOB, and
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Thanks,
Nick
>=20
> Subject: [PATCH] powerpc/powernv: define platform MCE handler.
>=20
> ---
> arch/powerpc/kernel/mce.c | 3 ---
> arch/powerpc/platforms/powernv/setup.c | 11 +++++++++++
> 2 files changed, 11 insertions(+), 3 deletions(-)
>=20
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index 221271c96a57..ae17d8aa60c4 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -498,12 +498,9 @@ long machine_check_early(struct pt_regs *regs)
> =20
> /*
> * See if platform is capable of handling machine check.
> - * Otherwise fallthrough and allow CPU to handle this machine check.
> */
> if (ppc_md.machine_check_early)
> handled = ppc_md.machine_check_early(regs);
> - else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> - handled = cur_cpu_spec->machine_check_early(regs);
> return handled;
> }
> =20
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index f96df0a25d05..b74c93bc2e55 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -431,6 +431,16 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu)
> return ret_freq;
> }
> =20
> +static long pnv_machine_check_early(struct pt_regs *regs)
> +{
> + long handled = 0;
> +
> + if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> + handled = cur_cpu_spec->machine_check_early(regs);
> +
> + return handled;
> +}
> +
> define_machine(powernv) {
> .name = "PowerNV",
> .probe = pnv_probe,
> @@ -442,6 +452,7 @@ define_machine(powernv) {
> .machine_shutdown = pnv_shutdown,
> .power_save = NULL,
> .calibrate_decr = generic_calibrate_decr,
> + .machine_check_early = pnv_machine_check_early,
> #ifdef CONFIG_KEXEC_CORE
> .kexec_cpu_down =3D pnv_kexec_cpu_down,
> #endif
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox