* [PATCH BACKPORTv2 4.4] crypto: vmx - ghash: do nosimd fallback manually
From: Daniel Axtens @ 2019-06-03 2:09 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Herbert Xu, stable, Daniel Axtens
commit 357d065a44cdd77ed5ff35155a989f2a763e96ef upstream.
[backported: the VMX driver did not use crypto_simd_usable() until
after 5.1, CRYPTO_ALG_TYPE_SHASH was still specified in .options
until after 4.14, and the sequence for preparing the kernel to use
vmx changed after 4.4.]
VMX ghash was using a fallback that did not support interleaving simd
and nosimd operations, leading to failures in the extended test suite.
If I understood correctly, Eric's suggestion was to use the same
data format that the generic code uses, allowing us to call into it
with the same contexts. I wasn't able to get that to work - I think
there's a very different key structure and data layout being used.
So instead steal the arm64 approach and perform the fallback
operations directly if required.
Fixes: cc333cd68dfa ("crypto: vmx - Adding GHASH routines for VMX module")
Cc: stable@vger.kernel.org # v4.1+
Reported-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Daniel Axtens <dja@axtens.net>
---
v2: do stable backport form correctly.
---
drivers/crypto/vmx/ghash.c | 218 +++++++++++++++----------------------
1 file changed, 89 insertions(+), 129 deletions(-)
diff --git a/drivers/crypto/vmx/ghash.c b/drivers/crypto/vmx/ghash.c
index 84b9389bf1ed..d6b68cf7bba7 100644
--- a/drivers/crypto/vmx/ghash.c
+++ b/drivers/crypto/vmx/ghash.c
@@ -1,22 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
/**
* GHASH routines supporting VMX instructions on the Power 8
*
- * Copyright (C) 2015 International Business Machines Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; version 2 only.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Copyright (C) 2015, 2019 International Business Machines Inc.
*
* Author: Marcelo Henrique Cerri <mhcerri@br.ibm.com>
+ *
+ * Extended by Daniel Axtens <dja@axtens.net> to replace the fallback
+ * mechanism. The new approach is based on arm64 code, which is:
+ * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <linux/types.h>
@@ -39,71 +31,25 @@ void gcm_ghash_p8(u64 Xi[2], const u128 htable[16],
const u8 *in, size_t len);
struct p8_ghash_ctx {
+ /* key used by vector asm */
u128 htable[16];
- struct crypto_shash *fallback;
+ /* key used by software fallback */
+ be128 key;
};
struct p8_ghash_desc_ctx {
u64 shash[2];
u8 buffer[GHASH_DIGEST_SIZE];
int bytes;
- struct shash_desc fallback_desc;
};
-static int p8_ghash_init_tfm(struct crypto_tfm *tfm)
-{
- const char *alg = "ghash-generic";
- struct crypto_shash *fallback;
- struct crypto_shash *shash_tfm = __crypto_shash_cast(tfm);
- struct p8_ghash_ctx *ctx = crypto_tfm_ctx(tfm);
-
- fallback = crypto_alloc_shash(alg, 0, CRYPTO_ALG_NEED_FALLBACK);
- if (IS_ERR(fallback)) {
- printk(KERN_ERR
- "Failed to allocate transformation for '%s': %ld\n",
- alg, PTR_ERR(fallback));
- return PTR_ERR(fallback);
- }
-
- crypto_shash_set_flags(fallback,
- crypto_shash_get_flags((struct crypto_shash
- *) tfm));
-
- /* Check if the descsize defined in the algorithm is still enough. */
- if (shash_tfm->descsize < sizeof(struct p8_ghash_desc_ctx)
- + crypto_shash_descsize(fallback)) {
- printk(KERN_ERR
- "Desc size of the fallback implementation (%s) does not match the expected value: %lu vs %u\n",
- alg,
- shash_tfm->descsize - sizeof(struct p8_ghash_desc_ctx),
- crypto_shash_descsize(fallback));
- return -EINVAL;
- }
- ctx->fallback = fallback;
-
- return 0;
-}
-
-static void p8_ghash_exit_tfm(struct crypto_tfm *tfm)
-{
- struct p8_ghash_ctx *ctx = crypto_tfm_ctx(tfm);
-
- if (ctx->fallback) {
- crypto_free_shash(ctx->fallback);
- ctx->fallback = NULL;
- }
-}
-
static int p8_ghash_init(struct shash_desc *desc)
{
- struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->bytes = 0;
memset(dctx->shash, 0, GHASH_DIGEST_SIZE);
- dctx->fallback_desc.tfm = ctx->fallback;
- dctx->fallback_desc.flags = desc->flags;
- return crypto_shash_init(&dctx->fallback_desc);
+ return 0;
}
static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
@@ -122,7 +68,53 @@ static int p8_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
gcm_init_p8(ctx->htable, (const u64 *) key);
pagefault_enable();
preempt_enable();
- return crypto_shash_setkey(ctx->fallback, key, keylen);
+
+ memcpy(&ctx->key, key, GHASH_BLOCK_SIZE);
+
+ return 0;
+}
+
+static inline void __ghash_block(struct p8_ghash_ctx *ctx,
+ struct p8_ghash_desc_ctx *dctx)
+{
+ if (!IN_INTERRUPT) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ enable_kernel_vsx();
+ enable_kernel_fp();
+ gcm_ghash_p8(dctx->shash, ctx->htable,
+ dctx->buffer, GHASH_DIGEST_SIZE);
+ pagefault_enable();
+ preempt_enable();
+ } else {
+ crypto_xor((u8 *)dctx->shash, dctx->buffer, GHASH_BLOCK_SIZE);
+ gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+ }
+}
+
+static inline void __ghash_blocks(struct p8_ghash_ctx *ctx,
+ struct p8_ghash_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
+{
+ if (!IN_INTERRUPT) {
+ preempt_disable();
+ pagefault_disable();
+ enable_kernel_altivec();
+ enable_kernel_vsx();
+ enable_kernel_fp();
+ gcm_ghash_p8(dctx->shash, ctx->htable,
+ src, srclen);
+ pagefault_enable();
+ preempt_enable();
+ } else {
+ while (srclen >= GHASH_BLOCK_SIZE) {
+ crypto_xor((u8 *)dctx->shash, src, GHASH_BLOCK_SIZE);
+ gf128mul_lle((be128 *)dctx->shash, &ctx->key);
+ srclen -= GHASH_BLOCK_SIZE;
+ src += GHASH_BLOCK_SIZE;
+ }
+ }
}
static int p8_ghash_update(struct shash_desc *desc,
@@ -132,51 +124,33 @@ static int p8_ghash_update(struct shash_desc *desc,
struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- if (IN_INTERRUPT) {
- return crypto_shash_update(&dctx->fallback_desc, src,
- srclen);
- } else {
- if (dctx->bytes) {
- if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
- memcpy(dctx->buffer + dctx->bytes, src,
- srclen);
- dctx->bytes += srclen;
- return 0;
- }
+ if (dctx->bytes) {
+ if (dctx->bytes + srclen < GHASH_DIGEST_SIZE) {
memcpy(dctx->buffer + dctx->bytes, src,
- GHASH_DIGEST_SIZE - dctx->bytes);
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- enable_kernel_vsx();
- enable_kernel_fp();
- gcm_ghash_p8(dctx->shash, ctx->htable,
- dctx->buffer, GHASH_DIGEST_SIZE);
- pagefault_enable();
- preempt_enable();
- src += GHASH_DIGEST_SIZE - dctx->bytes;
- srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
- dctx->bytes = 0;
- }
- len = srclen & ~(GHASH_DIGEST_SIZE - 1);
- if (len) {
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- enable_kernel_vsx();
- enable_kernel_fp();
- gcm_ghash_p8(dctx->shash, ctx->htable, src, len);
- pagefault_enable();
- preempt_enable();
- src += len;
- srclen -= len;
- }
- if (srclen) {
- memcpy(dctx->buffer, src, srclen);
- dctx->bytes = srclen;
+ srclen);
+ dctx->bytes += srclen;
+ return 0;
}
- return 0;
+ memcpy(dctx->buffer + dctx->bytes, src,
+ GHASH_DIGEST_SIZE - dctx->bytes);
+
+ __ghash_block(ctx, dctx);
+
+ src += GHASH_DIGEST_SIZE - dctx->bytes;
+ srclen -= GHASH_DIGEST_SIZE - dctx->bytes;
+ dctx->bytes = 0;
+ }
+ len = srclen & ~(GHASH_DIGEST_SIZE - 1);
+ if (len) {
+ __ghash_blocks(ctx, dctx, src, len);
+ src += len;
+ srclen -= len;
}
+ if (srclen) {
+ memcpy(dctx->buffer, src, srclen);
+ dctx->bytes = srclen;
+ }
+ return 0;
}
static int p8_ghash_final(struct shash_desc *desc, u8 *out)
@@ -185,26 +159,14 @@ static int p8_ghash_final(struct shash_desc *desc, u8 *out)
struct p8_ghash_ctx *ctx = crypto_tfm_ctx(crypto_shash_tfm(desc->tfm));
struct p8_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- if (IN_INTERRUPT) {
- return crypto_shash_final(&dctx->fallback_desc, out);
- } else {
- if (dctx->bytes) {
- for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
- dctx->buffer[i] = 0;
- preempt_disable();
- pagefault_disable();
- enable_kernel_altivec();
- enable_kernel_vsx();
- enable_kernel_fp();
- gcm_ghash_p8(dctx->shash, ctx->htable,
- dctx->buffer, GHASH_DIGEST_SIZE);
- pagefault_enable();
- preempt_enable();
- dctx->bytes = 0;
- }
- memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
- return 0;
+ if (dctx->bytes) {
+ for (i = dctx->bytes; i < GHASH_DIGEST_SIZE; i++)
+ dctx->buffer[i] = 0;
+ __ghash_block(ctx, dctx);
+ dctx->bytes = 0;
}
+ memcpy(out, dctx->shash, GHASH_DIGEST_SIZE);
+ return 0;
}
struct shash_alg p8_ghash_alg = {
@@ -219,11 +181,9 @@ struct shash_alg p8_ghash_alg = {
.cra_name = "ghash",
.cra_driver_name = "p8_ghash",
.cra_priority = 1000,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_NEED_FALLBACK,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct p8_ghash_ctx),
.cra_module = THIS_MODULE,
- .cra_init = p8_ghash_init_tfm,
- .cra_exit = p8_ghash_exit_tfm,
},
};
--
2.19.1
^ permalink raw reply related
* Re: [PATCH kernel v3 1/3] powerpc/iommu: Allow bypass-only for DMA
From: David Gibson @ 2019-06-03 2:03 UTC (permalink / raw)
To: Alexey Kardashevskiy
Cc: Sam Bobroff, Alistair Popple, Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20190530070355.121802-2-aik@ozlabs.ru>
[-- Attachment #1: Type: text/plain, Size: 1953 bytes --]
On Thu, May 30, 2019 at 05:03:53PM +1000, Alexey Kardashevskiy wrote:
> POWER8 and newer support a bypass mode which maps all host memory to
> PCI buses so an IOMMU table is not always required. However if we fail to
> create such a table, the DMA setup fails and the kernel does not boot.
>
> This skips the 32bit DMA setup check if the bypass is can be selected.
>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>
> This minor thing helped me debugging next 2 patches so it can help
> somebody else too.
> ---
> arch/powerpc/kernel/dma-iommu.c | 11 +++++------
> 1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
> index 09231ef06d01..809c1dc01edf 100644
> --- a/arch/powerpc/kernel/dma-iommu.c
> +++ b/arch/powerpc/kernel/dma-iommu.c
> @@ -118,18 +118,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
> {
> struct iommu_table *tbl = get_iommu_table_base(dev);
>
> - if (!tbl) {
> - dev_info(dev, "Warning: IOMMU dma not supported: mask 0x%08llx"
> - ", table unavailable\n", mask);
> - return 0;
> - }
> -
> if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
> dev->archdata.iommu_bypass = true;
> dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
> return 1;
> }
>
> + if (!tbl) {
> + dev_err(dev, "Warning: IOMMU dma not supported: mask 0x%08llx, table unavailable\n", mask);
> + return 0;
> + }
> +
> if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
> dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
> dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [PATCH v3 1/3] PCI: Introduce pcibios_ignore_alignment_request
From: Shawn Anastasio @ 2019-06-03 2:23 UTC (permalink / raw)
To: Alexey Kardashevskiy, Oliver
Cc: Sam Bobroff, linux-pci, Linux Kernel Mailing List, rppt,
Paul Mackerras, Bjorn Helgaas, xyjxie, linuxppc-dev
In-Reply-To: <81a015ed-2c99-7ca8-c5ad-cede93aeba97@ozlabs.ru>
On 5/30/19 10:56 PM, Alexey Kardashevskiy wrote:
>
>
> On 31/05/2019 08:49, Shawn Anastasio wrote:
>> On 5/29/19 10:39 PM, Alexey Kardashevskiy wrote:
>>>
>>>
>>> On 28/05/2019 17:39, Shawn Anastasio wrote:
>>>>
>>>>
>>>> On 5/28/19 1:27 AM, Alexey Kardashevskiy wrote:
>>>>>
>>>>>
>>>>> On 28/05/2019 15:36, Oliver wrote:
>>>>>> On Tue, May 28, 2019 at 2:03 PM Shawn Anastasio <shawn@anastas.io>
>>>>>> wrote:
>>>>>>>
>>>>>>> Introduce a new pcibios function pcibios_ignore_alignment_request
>>>>>>> which allows the PCI core to defer to platform-specific code to
>>>>>>> determine whether or not to ignore alignment requests for PCI
>>>>>>> resources.
>>>>>>>
>>>>>>> The existing behavior is to simply ignore alignment requests when
>>>>>>> PCI_PROBE_ONLY is set. This is behavior is maintained by the
>>>>>>> default implementation of pcibios_ignore_alignment_request.
>>>>>>>
>>>>>>> Signed-off-by: Shawn Anastasio <shawn@anastas.io>
>>>>>>> ---
>>>>>>> drivers/pci/pci.c | 9 +++++++--
>>>>>>> include/linux/pci.h | 1 +
>>>>>>> 2 files changed, 8 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>>>>>>> index 8abc843b1615..8207a09085d1 100644
>>>>>>> --- a/drivers/pci/pci.c
>>>>>>> +++ b/drivers/pci/pci.c
>>>>>>> @@ -5882,6 +5882,11 @@ resource_size_t __weak
>>>>>>> pcibios_default_alignment(void)
>>>>>>> return 0;
>>>>>>> }
>>>>>>>
>>>>>>> +int __weak pcibios_ignore_alignment_request(void)
>>>>>>> +{
>>>>>>> + return pci_has_flag(PCI_PROBE_ONLY);
>>>>>>> +}
>>>>>>> +
>>>>>>> #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
>>>>>>> static char
>>>>>>> resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
>>>>>>> static DEFINE_SPINLOCK(resource_alignment_lock);
>>>>>>> @@ -5906,9 +5911,9 @@ static resource_size_t
>>>>>>> pci_specified_resource_alignment(struct pci_dev *dev,
>>>>>>> p = resource_alignment_param;
>>>>>>> if (!*p && !align)
>>>>>>> goto out;
>>>>>>> - if (pci_has_flag(PCI_PROBE_ONLY)) {
>>>>>>> + if (pcibios_ignore_alignment_request()) {
>>>>>>> align = 0;
>>>>>>> - pr_info_once("PCI: Ignoring requested alignments
>>>>>>> (PCI_PROBE_ONLY)\n");
>>>>>>> + pr_info_once("PCI: Ignoring requested alignments\n");
>>>>>>> goto out;
>>>>>>> }
>>>>>>
>>>>>> I think the logic here is questionable to begin with. If the user has
>>>>>> explicitly requested re-aligning a resource via the command line then
>>>>>> we should probably do it even if PCI_PROBE_ONLY is set. When it breaks
>>>>>> they get to keep the pieces.
>>>>>>
>>>>>> That said, the real issue here is that PCI_PROBE_ONLY probably
>>>>>> shouldn't be set under qemu/kvm. Under the other hypervisor (PowerVM)
>>>>>> hotplugged devices are configured by firmware before it's passed to
>>>>>> the guest and we need to keep the FW assignments otherwise things
>>>>>> break. QEMU however doesn't do any BAR assignments and relies on that
>>>>>> being handled by the guest. At boot time this is done by SLOF, but
>>>>>> Linux only keeps SLOF around until it's extracted the device-tree.
>>>>>> Once that's done SLOF gets blown away and the kernel needs to do it's
>>>>>> own BAR assignments. I'm guessing there's a hack in there to make it
>>>>>> work today, but it's a little surprising that it works at all...
>>>>>
>>>>>
>>>>> The hack is to run a modified qemu-aware "/usr/sbin/rtas_errd" in the
>>>>> guest which receives an event from qemu (RAS_EPOW from
>>>>> /proc/interrupts), fetches device tree chunks (and as I understand it -
>>>>> they come with BARs from phyp but without from qemu) and writes "1" to
>>>>> "/sys/bus/pci/rescan" which calls pci_assign_resource() eventually:
>>>>
>>>> Interesting. Does this mean that the PHYP hotplug path doesn't
>>>> call pci_assign_resource?
>>>
>>>
>>> I'd expect dlpar_add_slot() to be called under phyp and eventually
>>> pci_device_add() which (I think) may or may not trigger later
>>> reassignment.
>>>
>>>
>>>> If so it means the patch may not
>>>> break that platform after all, though it still may not be
>>>> the correct way of doing things.
>>>
>>>
>>> We should probably stop enforcing the PCI_PROBE_ONLY flag - it seems
>>> that (unless resource_alignment= is used) the pseries guest should just
>>> walk through all allocated resources and leave them unchanged.
>>
>> If we add a pcibios_default_alignment() implementation like was
>> suggested earlier, then it will behave as if the user has
>> specified resource_alignment= by default and SLOF's assignments
>> won't be honored (I think).
>
>
> I removed pci_add_flags(PCI_PROBE_ONLY) from pSeries_setup_arch and
> tried booting with and without pci=resource_alignment= and I can see no
> difference - BARs are still aligned to 64K as programmed in SLOF; if I
> hack SLOF to align to 4K or 32K - BARs get packed and the guest leaves
> them unchanged.
>
>
>> I guess it boils down to one question - is it important that we
>> observe SLOF's initial BAR assignments?
>
> It isn't if it's SLOF but it is if it's phyp. It used to not
> allow/support BAR reassignment and even if it does not, I'd rather avoid
> touching them.
A quick update. I tried removing pci_add_flags(PCI_PROBE_ONLY) which
worked, but if I add an implementation of pcibios_default_alignment
which simply returns PAGE_SIZE, my VM fails to boot and many errors
from the virtio disk driver are printed to the console.
After some investigation, it seems that with pcibios_default_alignment
present, Linux will reallocate all resources provided by SLOF on
boot. I'm still not sure why exactly this causes the virtio driver
to fail, but it does indicate that there is a reason to keep
SLOF's initial assignments.
Anybody have an idea what's causing this?
^ permalink raw reply
* Re: [PATCH kernel] prom_init: Fetch flatten device tree from the system firmware
From: Alexey Kardashevskiy @ 2019-06-03 2:56 UTC (permalink / raw)
To: Segher Boessenkool, Benjamin Herrenschmidt
Cc: linuxppc-dev, Suraj Jitindar Singh, David Gibson
In-Reply-To: <20190602232330.GN31586@gate.crashing.org>
On 03/06/2019 09:23, Segher Boessenkool wrote:
> Hi!
>
> On Fri, May 31, 2019 at 11:03:26AM +1000, Benjamin Herrenschmidt wrote:
>> On Thu, 2019-05-30 at 14:37 -0500, Segher Boessenkool wrote:
>>> On Thu, May 30, 2019 at 05:09:06PM +1000, Alexey Kardashevskiy wrote:
>>>> so, it is sort-of nack from David and sort-of ack from Segher, what
>>>> happens now?
>>>
>>> Maybe what we really need just a CI call to get all properties of a node
>>> at once? Will that speed up things enough?
>>>
>>> That way you need no change at all in lifetime of properties and how they
>>> are used, etc.; just a client getting the properties is a lot faster.
>>
>> Hrm... if we're going to create a new interface, let's go for what we
>> need.
>>
>> What we need is the FDT. It's a rather ubiquitous thing these days, it
>> makes sense to have a way to fetch an FDT directly from FW.
>
> That is all you need if you do not want to use OF at all.
? We also need OF drivers to boot grub and the system, and a default
console for early booting, but yes, we do not want to keep using slof
that much.
> If you *do* want to keep having an Open Firmware, what we want or need
> is a faster way to walk huge device trees.
Why? We do not need to _walk_ the tree at all (we _have_ to now and
while walking we do nothing but pack everything together into the fdt
blob) as slof can easily do this already.
>> There is no use for the "fetch all properties" cases other than
>> building an FDT that any of us can think of, and it would create a more
>> complicated interface than just "fetch an FDT".
>
> It is a simple way to speed up fetching the device tree enormously,
> without needing big changes to either OF or the clients using it -- not
> in the code, but importantly also not conceptually: everything works just
> as before, just a lot faster.
I can safely presume though that this will never ever be used in
practice. And it will be still slower, a tiny bit. And require new code
in both slof and linux.
I can rather see us getting rid of SLOF in the future which in turn will
require the fdt blob pointer in r5 (or whatever it is, forgot) when the
guest starts; so "fetch-all-props" won't be used there either.
>> So I go for the simple one and agree with Alexey's idea.
>
> When dealing with a whole device tree you have to know about the various
> dynamically generated nodes and props, and handle each appropriately.
The code I am changing fetches the device tree and build an fdt. What is
that special knowledge in this context you are talking about?
--
Alexey
^ permalink raw reply
* Re: [next-20190530] Boot failure on PowerPC
From: Sachin Sant @ 2019-06-03 3:11 UTC (permalink / raw)
To: Dexuan-Linux Cui; +Cc: linux-next, linuxppc-dev, v-lide, Dexuan Cui
In-Reply-To: <CAA42JLYcPi4ypvX=Ma8yWzUCF=B=FkDrzLex=bJiLyryuWTE2g@mail.gmail.com>
> On 31-May-2019, at 11:43 PM, Dexuan-Linux Cui <dexuan.linux@gmail.com> wrote:
>
> On Fri, May 31, 2019 at 6:52 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>
>>> Machine boots till login prompt and then panics few seconds later.
>>>
>>> Last known next build was May 24th. Will attempt few builds till May 30 to
>>> narrow down this problem.
>>
>> My CI was fine with next-20190529 (9a15d2e3fd03e3).
>>
>> cheers
>
> Hi Sachin,
> It looks this patch may fix the issue:
> https://lkml.org/lkml/2019/5/30/1630 , but I'm not sure.
It does not help fix the kernel panic issue, but it fixes the get_swap_device warning
messages during the boot.
Thanks
-Sachin
^ permalink raw reply
* Re: [PATCH] scsi: ibmvscsi: Don't use rc uninitialized in ibmvscsi_do_work
From: Nathan Chancellor @ 2019-06-03 3:23 UTC (permalink / raw)
To: Michael Ellerman
Cc: Tyrel Datwyler, linux-scsi, Martin K. Petersen,
James E.J. Bottomley, linux-kernel, clang-built-linux,
linuxppc-dev
In-Reply-To: <87blzgnvhx.fsf@concordia.ellerman.id.au>
Hi Michael,
On Sun, Jun 02, 2019 at 08:15:38PM +1000, Michael Ellerman wrote:
> Hi Nathan,
>
> It's always preferable IMHO to keep any initialisation as localised as
> possible, so that the compiler can continue to warn about uninitialised
> usages elsewhere. In this case that would mean doing the rc = 0 in the
> switch, something like:
I am certainly okay with implementing this in a v2. I mulled over which
would be preferred, I suppose I guessed wrong :) Thank you for the
review and input.
>
> diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
> index 727c31dc11a0..7ee5755cf636 100644
> --- a/drivers/scsi/ibmvscsi/ibmvscsi.c
> +++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
> @@ -2123,9 +2123,6 @@ static void ibmvscsi_do_work(struct ibmvscsi_host_data *hostdata)
>
> spin_lock_irqsave(hostdata->host->host_lock, flags);
> switch (hostdata->action) {
> - case IBMVSCSI_HOST_ACTION_NONE:
> - case IBMVSCSI_HOST_ACTION_UNBLOCK:
> - break;
> case IBMVSCSI_HOST_ACTION_RESET:
> spin_unlock_irqrestore(hostdata->host->host_lock, flags);
> rc = ibmvscsi_reset_crq_queue(&hostdata->queue, hostdata);
> @@ -2142,7 +2139,10 @@ static void ibmvscsi_do_work(struct ibmvscsi_host_data *hostdata)
> if (!rc)
> rc = ibmvscsi_send_crq(hostdata, 0xC001000000000000LL, 0);
> break;
> + case IBMVSCSI_HOST_ACTION_NONE:
> + case IBMVSCSI_HOST_ACTION_UNBLOCK:
> default:
> + rc = 0;
> break;
> }
>
>
> But then that makes me wonder if that's actually correct?
>
> If we get an action that we don't recognise should we just throw it away
> like that? (by doing hostdata->action = IBMVSCSI_HOST_ACTION_NONE). Tyrel?
However, because of this, I will hold off on v2 until Tyrel can give
some feedback.
Thanks,
Nathan
^ permalink raw reply
* RE: [next-20190530] Boot failure on PowerPC
From: Lili Deng (Wicresoft North America Ltd) @ 2019-06-03 3:23 UTC (permalink / raw)
To: Sachin Sant, Dexuan-Linux Cui
Cc: linux-next@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
Dexuan Cui
In-Reply-To: <D8C9598F-4A51-4402-9344-894EC6B6CE47@linux.vnet.ibm.com>
Hi Sachin,
I verified below patch against Ubuntu 18.04, didn't hit the kernel panic any more, could you please let know how did you verify?
Thanks,
Lili
-----Original Message-----
From: Sachin Sant <sachinp@linux.vnet.ibm.com>
Sent: Monday, June 3, 2019 11:12 AM
To: Dexuan-Linux Cui <dexuan.linux@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>; linuxppc-dev@lists.ozlabs.org; linux-next@vger.kernel.org; Dexuan Cui <decui@microsoft.com>; Lili Deng (Wicresoft North America Ltd) <v-lide@microsoft.com>
Subject: Re: [next-20190530] Boot failure on PowerPC
> On 31-May-2019, at 11:43 PM, Dexuan-Linux Cui <dexuan.linux@gmail.com> wrote:
>
> On Fri, May 31, 2019 at 6:52 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>
>>> Machine boots till login prompt and then panics few seconds later.
>>>
>>> Last known next build was May 24th. Will attempt few builds till May
>>> 30 to narrow down this problem.
>>
>> My CI was fine with next-20190529 (9a15d2e3fd03e3).
>>
>> cheers
>
> Hi Sachin,
> It looks this patch may fix the issue:
> https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Flkml.org%2Flkml%2F2019%2F5%2F30%2F1630&data=02%7C01%7Cv-lide%40microsoft.com%7C66e1ef6017aa461703f808d6e7d148cd%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C1%7C636951283393233385&sdata=IJFhtvL2Bd87HCoMZ7oWL%2Bar6NY%2FfPbmdCZMT%2BJz5t4%3D&reserved=0 , but I'm not sure.
It does not help fix the kernel panic issue, but it fixes the get_swap_device warning messages during the boot.
Thanks
-Sachin
^ permalink raw reply
* Re: [RFC] mm: Generalize notify_page_fault()
From: Anshuman Khandual @ 2019-06-03 4:53 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Mark Rutland, Michal Hocko, linux-ia64, linux-sh, Catalin Marinas,
Will Deacon, linux-mm, Paul Mackerras, sparclinux, linux-s390,
Yoshinori Sato, Russell King, Fenghua Yu, Stephen Rothwell,
Andrey Konovalov, linux-arm-kernel, Tony Luck, Heiko Carstens,
linux-kernel, Martin Schwidefsky, Andrew Morton, linuxppc-dev,
David S. Miller
In-Reply-To: <20190531174854.GA31852@bombadil.infradead.org>
On 05/31/2019 11:18 PM, Matthew Wilcox wrote:
> On Fri, May 31, 2019 at 02:17:43PM +0530, Anshuman Khandual wrote:
>> On 05/30/2019 07:09 PM, Matthew Wilcox wrote:
>>> On Thu, May 30, 2019 at 05:31:15PM +0530, Anshuman Khandual wrote:
>>>> On 05/30/2019 04:36 PM, Matthew Wilcox wrote:
>>>>> The two handle preemption differently. Why is x86 wrong and this one
>>>>> correct?
>>>>
>>>> Here it expects context to be already non-preemptible where as the proposed
>>>> generic function makes it non-preemptible with a preempt_[disable|enable]()
>>>> pair for the required code section, irrespective of it's present state. Is
>>>> not this better ?
>>>
>>> git log -p arch/x86/mm/fault.c
>>>
>>> search for 'kprobes'.
>>>
>>> tell me what you think.
>>
>> Are you referring to these following commits
>>
>> a980c0ef9f6d ("x86/kprobes: Refactor kprobes_fault() like kprobe_exceptions_notify()")
>> b506a9d08bae ("x86: code clarification patch to Kprobes arch code")
>>
>> In particular the later one (b506a9d08bae). It explains how the invoking context
>> in itself should be non-preemptible for the kprobes processing context irrespective
>> of whether kprobe_running() or perhaps smp_processor_id() is safe or not. Hence it
>> does not make much sense to continue when original invoking context is preemptible.
>> Instead just bail out earlier. This seems to be making more sense than preempt
>> disable-enable pair. If there are no concerns about this change from other platforms,
>> I will change the preemption behavior in proposed generic function next time around.
>
> Exactly.
>
> So, any of the arch maintainers know of a reason they behave differently
> from x86 in this regard? Or can Anshuman use the x86 implementation
> for all the architectures supporting kprobes?
So the generic notify_page_fault() will be like this.
int __kprobes notify_page_fault(struct pt_regs *regs, unsigned int trap)
{
int ret = 0;
/*
* To be potentially processing a kprobe fault and to be allowed
* to call kprobe_running(), we have to be non-preemptible.
*/
if (kprobes_built_in() && !preemptible() && !user_mode(regs)) {
if (kprobe_running() && kprobe_fault_handler(regs, trap))
ret = 1;
}
return ret;
}
^ permalink raw reply
* Re: [PATCH v3 1/3] PCI: Introduce pcibios_ignore_alignment_request
From: Alexey Kardashevskiy @ 2019-06-03 5:02 UTC (permalink / raw)
To: Shawn Anastasio, Oliver
Cc: Sam Bobroff, linux-pci, Linux Kernel Mailing List, rppt,
Paul Mackerras, Bjorn Helgaas, xyjxie, linuxppc-dev
In-Reply-To: <bdc914aa-9aab-1377-c036-cca4710ef233@anastas.io>
On 03/06/2019 12:23, Shawn Anastasio wrote:
>
>
> On 5/30/19 10:56 PM, Alexey Kardashevskiy wrote:
>>
>>
>> On 31/05/2019 08:49, Shawn Anastasio wrote:
>>> On 5/29/19 10:39 PM, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>> On 28/05/2019 17:39, Shawn Anastasio wrote:
>>>>>
>>>>>
>>>>> On 5/28/19 1:27 AM, Alexey Kardashevskiy wrote:
>>>>>>
>>>>>>
>>>>>> On 28/05/2019 15:36, Oliver wrote:
>>>>>>> On Tue, May 28, 2019 at 2:03 PM Shawn Anastasio <shawn@anastas.io>
>>>>>>> wrote:
>>>>>>>>
>>>>>>>> Introduce a new pcibios function pcibios_ignore_alignment_request
>>>>>>>> which allows the PCI core to defer to platform-specific code to
>>>>>>>> determine whether or not to ignore alignment requests for PCI
>>>>>>>> resources.
>>>>>>>>
>>>>>>>> The existing behavior is to simply ignore alignment requests when
>>>>>>>> PCI_PROBE_ONLY is set. This is behavior is maintained by the
>>>>>>>> default implementation of pcibios_ignore_alignment_request.
>>>>>>>>
>>>>>>>> Signed-off-by: Shawn Anastasio <shawn@anastas.io>
>>>>>>>> ---
>>>>>>>> drivers/pci/pci.c | 9 +++++++--
>>>>>>>> include/linux/pci.h | 1 +
>>>>>>>> 2 files changed, 8 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>>>>>>>> index 8abc843b1615..8207a09085d1 100644
>>>>>>>> --- a/drivers/pci/pci.c
>>>>>>>> +++ b/drivers/pci/pci.c
>>>>>>>> @@ -5882,6 +5882,11 @@ resource_size_t __weak
>>>>>>>> pcibios_default_alignment(void)
>>>>>>>> return 0;
>>>>>>>> }
>>>>>>>>
>>>>>>>> +int __weak pcibios_ignore_alignment_request(void)
>>>>>>>> +{
>>>>>>>> + return pci_has_flag(PCI_PROBE_ONLY);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> #define RESOURCE_ALIGNMENT_PARAM_SIZE COMMAND_LINE_SIZE
>>>>>>>> static char
>>>>>>>> resource_alignment_param[RESOURCE_ALIGNMENT_PARAM_SIZE] = {0};
>>>>>>>> static DEFINE_SPINLOCK(resource_alignment_lock);
>>>>>>>> @@ -5906,9 +5911,9 @@ static resource_size_t
>>>>>>>> pci_specified_resource_alignment(struct pci_dev *dev,
>>>>>>>> p = resource_alignment_param;
>>>>>>>> if (!*p && !align)
>>>>>>>> goto out;
>>>>>>>> - if (pci_has_flag(PCI_PROBE_ONLY)) {
>>>>>>>> + if (pcibios_ignore_alignment_request()) {
>>>>>>>> align = 0;
>>>>>>>> - pr_info_once("PCI: Ignoring requested alignments
>>>>>>>> (PCI_PROBE_ONLY)\n");
>>>>>>>> + pr_info_once("PCI: Ignoring requested
>>>>>>>> alignments\n");
>>>>>>>> goto out;
>>>>>>>> }
>>>>>>>
>>>>>>> I think the logic here is questionable to begin with. If the user
>>>>>>> has
>>>>>>> explicitly requested re-aligning a resource via the command line
>>>>>>> then
>>>>>>> we should probably do it even if PCI_PROBE_ONLY is set. When it
>>>>>>> breaks
>>>>>>> they get to keep the pieces.
>>>>>>>
>>>>>>> That said, the real issue here is that PCI_PROBE_ONLY probably
>>>>>>> shouldn't be set under qemu/kvm. Under the other hypervisor
>>>>>>> (PowerVM)
>>>>>>> hotplugged devices are configured by firmware before it's passed to
>>>>>>> the guest and we need to keep the FW assignments otherwise things
>>>>>>> break. QEMU however doesn't do any BAR assignments and relies on
>>>>>>> that
>>>>>>> being handled by the guest. At boot time this is done by SLOF, but
>>>>>>> Linux only keeps SLOF around until it's extracted the device-tree.
>>>>>>> Once that's done SLOF gets blown away and the kernel needs to do
>>>>>>> it's
>>>>>>> own BAR assignments. I'm guessing there's a hack in there to make it
>>>>>>> work today, but it's a little surprising that it works at all...
>>>>>>
>>>>>>
>>>>>> The hack is to run a modified qemu-aware "/usr/sbin/rtas_errd" in the
>>>>>> guest which receives an event from qemu (RAS_EPOW from
>>>>>> /proc/interrupts), fetches device tree chunks (and as I understand
>>>>>> it -
>>>>>> they come with BARs from phyp but without from qemu) and writes
>>>>>> "1" to
>>>>>> "/sys/bus/pci/rescan" which calls pci_assign_resource() eventually:
>>>>>
>>>>> Interesting. Does this mean that the PHYP hotplug path doesn't
>>>>> call pci_assign_resource?
>>>>
>>>>
>>>> I'd expect dlpar_add_slot() to be called under phyp and eventually
>>>> pci_device_add() which (I think) may or may not trigger later
>>>> reassignment.
>>>>
>>>>
>>>>> If so it means the patch may not
>>>>> break that platform after all, though it still may not be
>>>>> the correct way of doing things.
>>>>
>>>>
>>>> We should probably stop enforcing the PCI_PROBE_ONLY flag - it seems
>>>> that (unless resource_alignment= is used) the pseries guest should just
>>>> walk through all allocated resources and leave them unchanged.
>>>
>>> If we add a pcibios_default_alignment() implementation like was
>>> suggested earlier, then it will behave as if the user has
>>> specified resource_alignment= by default and SLOF's assignments
>>> won't be honored (I think).
>>
>>
>> I removed pci_add_flags(PCI_PROBE_ONLY) from pSeries_setup_arch and
>> tried booting with and without pci=resource_alignment= and I can see no
>> difference - BARs are still aligned to 64K as programmed in SLOF; if I
>> hack SLOF to align to 4K or 32K - BARs get packed and the guest leaves
>> them unchanged.
>>
>>
>>> I guess it boils down to one question - is it important that we
>>> observe SLOF's initial BAR assignments?
>>
>> It isn't if it's SLOF but it is if it's phyp. It used to not
>> allow/support BAR reassignment and even if it does not, I'd rather avoid
>> touching them.
>
> A quick update. I tried removing pci_add_flags(PCI_PROBE_ONLY) which
> worked, but if I add an implementation of pcibios_default_alignment
> which simply returns PAGE_SIZE, my VM fails to boot and many errors
> from the virtio disk driver are printed to the console.
>
> After some investigation, it seems that with pcibios_default_alignment
> present, Linux will reallocate all resources provided by SLOF on
> boot. I'm still not sure why exactly this causes the virtio driver
> to fail, but it does indicate that there is a reason to keep
> SLOF's initial assignments.
>
> Anybody have an idea what's causing this?
With your changes the guest feels the urge to reassign bars (no idea why
but ok), when it does so, it puts both BARs (one is prefetchable) into
the 32bit non-prefetchable window of the PHB (SLOF puts the prefetchable
bar to a 64bit prefetchable window, I have no idea why the guest does it
different either but this must still work) and then qemu does not
emulate something properly - unassigned_mem_accepts() is triggered on
the bar access - no idea why - I am debugging it right now.
--
Alexey
^ permalink raw reply
* Re: [RFC PATCH v2] powerpc/xmon: restrict when kernel is locked down
From: Andrew Donnellan @ 2019-06-03 5:36 UTC (permalink / raw)
To: Christopher M. Riedl, linuxppc-dev, kernel-hardening; +Cc: mjg59, dja
In-Reply-To: <20190524123816.1773-1-cmr@informatik.wtf>
On 24/5/19 10:38 pm, Christopher M. Riedl wrote:
> Xmon should be either fully or partially disabled depending on the
> kernel lockdown state.
>
> Put xmon into read-only mode for lockdown=integrity and completely
> disable xmon when lockdown=confidentiality. Xmon checks the lockdown
> state and takes appropriate action:
>
> (1) during xmon_setup to prevent early xmon'ing
>
> (2) when triggered via sysrq
>
> (3) when toggled via debugfs
>
> (4) when triggered via a previously enabled breakpoint
>
> The following lockdown state transitions are handled:
>
> (1) lockdown=none -> lockdown=integrity
> clear all breakpoints, set xmon read-only mode
>
> (2) lockdown=none -> lockdown=confidentiality
> clear all breakpoints, prevent re-entry into xmon
>
> (3) lockdown=integrity -> lockdown=confidentiality
> prevent re-entry into xmon
>
> Suggested-by: Andrew Donnellan <ajd@linux.ibm.com>
> Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> ---
>
> Applies on top of this series:
> https://patchwork.kernel.org/cover/10884631/
>
> I've done some limited testing of the scenarios mentioned in the commit
> message on a single CPU QEMU config.
>
> v1->v2:
> Fix subject line
> Submit to linuxppc-dev and kernel-hardening
>
> arch/powerpc/xmon/xmon.c | 56 +++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 3e7be19aa208..8c4a5a0c28f0 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -191,6 +191,9 @@ static void dump_tlb_44x(void);
> static void dump_tlb_book3e(void);
> #endif
>
> +static void clear_all_bpt(void);
> +static void xmon_init(int);
> +
> #ifdef CONFIG_PPC64
> #define REG "%.16lx"
> #else
> @@ -291,6 +294,39 @@ Commands:\n\
> zh halt\n"
> ;
>
> +#ifdef CONFIG_LOCK_DOWN_KERNEL
> +static bool xmon_check_lockdown(void)
> +{
> + static bool lockdown = false;
> +
> + if (!lockdown) {
> + lockdown = kernel_is_locked_down("Using xmon",
> + LOCKDOWN_CONFIDENTIALITY);
> + if (lockdown) {
> + printf("xmon: Disabled by strict kernel lockdown\n");
> + xmon_on = 0;
> + xmon_init(0);
> + }
> + }
> +
> + if (!xmon_is_ro) {
> + xmon_is_ro = kernel_is_locked_down("Using xmon write-access",
> + LOCKDOWN_INTEGRITY);
> + if (xmon_is_ro) {
> + printf("xmon: Read-only due to kernel lockdown\n");
> + clear_all_bpt();
Remind me again why we need to clear breakpoints in integrity mode?
Andrew
> + }
> + }
> +
> + return lockdown;
> +}
> +#else
> +inline static bool xmon_check_lockdown(void)
> +{
> + return false;
> +}
> +#endif /* CONFIG_LOCK_DOWN_KERNEL */
> +
> static struct pt_regs *xmon_regs;
>
> static inline void sync(void)
> @@ -708,6 +744,9 @@ static int xmon_bpt(struct pt_regs *regs)
> struct bpt *bp;
> unsigned long offset;
>
> + if (xmon_check_lockdown())
> + return 0;
> +
> if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
> return 0;
>
> @@ -739,6 +778,9 @@ static int xmon_sstep(struct pt_regs *regs)
>
> static int xmon_break_match(struct pt_regs *regs)
> {
> + if (xmon_check_lockdown())
> + return 0;
> +
> if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
> return 0;
> if (dabr.enabled == 0)
> @@ -749,6 +791,9 @@ static int xmon_break_match(struct pt_regs *regs)
>
> static int xmon_iabr_match(struct pt_regs *regs)
> {
> + if (xmon_check_lockdown())
> + return 0;
> +
> if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) != (MSR_IR|MSR_64BIT))
> return 0;
> if (iabr == NULL)
> @@ -3742,6 +3787,9 @@ static void xmon_init(int enable)
> #ifdef CONFIG_MAGIC_SYSRQ
> static void sysrq_handle_xmon(int key)
> {
> + if (xmon_check_lockdown())
> + return;
> +
> /* ensure xmon is enabled */
> xmon_init(1);
> debugger(get_irq_regs());
> @@ -3763,7 +3811,6 @@ static int __init setup_xmon_sysrq(void)
> device_initcall(setup_xmon_sysrq);
> #endif /* CONFIG_MAGIC_SYSRQ */
>
> -#ifdef CONFIG_DEBUG_FS
> static void clear_all_bpt(void)
> {
> int i;
> @@ -3785,8 +3832,12 @@ static void clear_all_bpt(void)
> printf("xmon: All breakpoints cleared\n");
> }
>
> +#ifdef CONFIG_DEBUG_FS
> static int xmon_dbgfs_set(void *data, u64 val)
> {
> + if (xmon_check_lockdown())
> + return 0;
> +
> xmon_on = !!val;
> xmon_init(xmon_on);
>
> @@ -3845,6 +3896,9 @@ early_param("xmon", early_parse_xmon);
>
> void __init xmon_setup(void)
> {
> + if (xmon_check_lockdown())
> + return;
> +
> if (xmon_on)
> xmon_init(1);
> if (xmon_early)
>
--
Andrew Donnellan OzLabs, ADL Canberra
ajd@linux.ibm.com IBM Australia Limited
^ permalink raw reply
* Re: [WIP RFC PATCH 0/6] Generic Firmware Variable Filesystem
From: Daniel Axtens @ 2019-06-03 6:04 UTC (permalink / raw)
To: Nayna, nayna, cclaudio, linux-fsdevel, greg, linuxppc-dev
In-Reply-To: <316a0865-7e14-b36a-7e49-5113f3dfc35f@linux.vnet.ibm.com>
Hi Nayna,
>> As PowerNV moves towards secure boot, we need a place to put secure
>> variables. One option that has been canvassed is to make our secure
>> variables look like EFI variables. This is an early sketch of another
>> approach where we create a generic firmware variable file system,
>> fwvarfs, and an OPAL Secure Variable backend for it.
>
> Is there a need of new filesystem ? I am wondering why can't these be
> exposed via sysfs / securityfs ?
> Probably, something like... /sys/firmware/secureboot or
> /sys/kernel/security/secureboot/ ?
I suppose we could put secure variables in sysfs, but I'm not sure
that's what sysfs was intended for. I understand sysfs as "a
filesystem-based view of kernel objects" (from
Documentation/filesystems/configfs/configfs.txt), and I don't think a
secure variable is really a kernel object in the same way most other
things in sysfs are... but I'm open to being convinced.
securityfs seems to be reserved for LSMs, I don't think we can put
things there.
My hope with fwvarfs is to provide a generic place for firmware
variables so that we don't need to expand the list of firmware-specific
filesystems beyond efivarfs. I am also aiming to make things simple to
use so that people familiar with firmware don't also have to become
familiar with filesystem code in order to expose firmware variables to
userspace.
> Also, it sounds like this is needed only for secure firmware variables
> and does not include
> other firmware variables which are not security relevant ? Is that
> correct understanding ?
The primary use case at the moment - OPAL secure variables - is security
focused because the current OPAL secure variable design stores and
manipulates secure variables separately from the rest of nvram. This
isn't an inherent feature of fwvarfs.
fwvarfs can also be used for variables that are not security relevant as
well. For example, with the EFI backend (patch 3), both secure and
insecure variables can be read.
Regards,
Daniel
^ permalink raw reply
* [PATCH] powerpc/64s: Fix THP PMD collapse serialisation
From: Nicholas Piggin @ 2019-06-03 6:05 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin
Commit 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion
in pte helpers") changed the actual bitwise tests in pte_access_permitted
by using pte_write() and pte_present() helpers rather than raw bitwise
testing _PAGE_WRITE and _PAGE_PRESENT bits.
The pte_present change now returns true for ptes which are !_PAGE_PRESENT
and _PAGE_INVALID, which is the combination used by pmdp_invalidate to
synchronize access from lock-free lookups. pte_access_permitted is used by
pmd_access_permitted, so allowing GUP lock free access to proceed with
such PTEs breaks this synchronisation.
This bug has been observed on HPT host, with random crashes and corruption
in guests, usually together with bad PMD messages in the host.
Fix this by adding an explicit check in pmd_access_permitted, and
documenting the condition explicitly.
The pte_write() change should be okay, and would prevent GUP from falling
back to the slow path when encountering savedwrite ptes, which matches
what x86 (that does not implement savedwrite) does.
Fixes: 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion in pte helpers")
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/powerpc/include/asm/book3s/64/pgtable.h | 19 ++++++++++++++++++-
arch/powerpc/mm/book3s64/pgtable.c | 3 +++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 7dede2e34b70..aaa72aa1b765 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1092,7 +1092,24 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_access_permitted pmd_access_permitted
static inline bool pmd_access_permitted(pmd_t pmd, bool write)
{
- return pte_access_permitted(pmd_pte(pmd), write);
+ pte_t pte = pmd_pte(pmd);
+ unsigned long pteval = pte_val(pte);
+
+ /*
+ * pmdp_invalidate sets this combination (that is not caught by
+ * !pte_present() check in pte_access_permitted), to prevent
+ * lock-free lookups, as part of the serialize_against_pte_lookup()
+ * synchronisation.
+ *
+ * This check inadvertently catches the case where the PTE's hardware
+ * PRESENT bit is cleared while TLB is flushed, to work around
+ * hardware TLB issues. This is suboptimal, but should not be hit
+ * frequently and should be harmless.
+ */
+ if ((pteval & _PAGE_INVALID) && !(pteval & _PAGE_PRESENT))
+ return false;
+
+ return pte_access_permitted(pte, write);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 16bda049187a..ff98b663c83e 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -116,6 +116,9 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
/*
* This ensures that generic code that rely on IRQ disabling
* to prevent a parallel THP split work as expected.
+ *
+ * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
+ * a special case check in pmd_access_permitted.
*/
serialize_against_pte_lookup(vma->vm_mm);
return __pmd(old_pmd);
--
2.20.1
^ permalink raw reply related
* Re: [PATCH] powerpc/64s: Fix THP PMD collapse serialisation
From: Aneesh Kumar K.V @ 2019-06-03 6:43 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev
In-Reply-To: <20190603060531.13088-1-npiggin@gmail.com>
On 6/3/19 11:35 AM, Nicholas Piggin wrote:
> Commit 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion
> in pte helpers") changed the actual bitwise tests in pte_access_permitted
> by using pte_write() and pte_present() helpers rather than raw bitwise
> testing _PAGE_WRITE and _PAGE_PRESENT bits.
>
> The pte_present change now returns true for ptes which are !_PAGE_PRESENT
> and _PAGE_INVALID, which is the combination used by pmdp_invalidate to
> synchronize access from lock-free lookups. pte_access_permitted is used by
> pmd_access_permitted, so allowing GUP lock free access to proceed with
> such PTEs breaks this synchronisation.
>
> This bug has been observed on HPT host, with random crashes and corruption
> in guests, usually together with bad PMD messages in the host.
>
> Fix this by adding an explicit check in pmd_access_permitted, and
> documenting the condition explicitly.
>
> The pte_write() change should be okay, and would prevent GUP from falling
> back to the slow path when encountering savedwrite ptes, which matches
> what x86 (that does not implement savedwrite) does.
>
> Fixes: 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion in pte helpers")
> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> arch/powerpc/include/asm/book3s/64/pgtable.h | 19 ++++++++++++++++++-
> arch/powerpc/mm/book3s64/pgtable.c | 3 +++
> 2 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 7dede2e34b70..aaa72aa1b765 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1092,7 +1092,24 @@ static inline int pmd_protnone(pmd_t pmd)
> #define pmd_access_permitted pmd_access_permitted
> static inline bool pmd_access_permitted(pmd_t pmd, bool write)
> {
> - return pte_access_permitted(pmd_pte(pmd), write);
> + pte_t pte = pmd_pte(pmd);
> + unsigned long pteval = pte_val(pte);
> +
> + /*
> + * pmdp_invalidate sets this combination (that is not caught by
> + * !pte_present() check in pte_access_permitted), to prevent
> + * lock-free lookups, as part of the serialize_against_pte_lookup()
> + * synchronisation.
> + *
> + * This check inadvertently catches the case where the PTE's hardware
> + * PRESENT bit is cleared while TLB is flushed, to work around
> + * hardware TLB issues. This is suboptimal, but should not be hit
> + * frequently and should be harmless.
> + */
> + if ((pteval & _PAGE_INVALID) && !(pteval & _PAGE_PRESENT))
> + return false;
> +
> + return pte_access_permitted(pte, write);
> }
>
you need to do similar for other lockless page table walk like
find_linux_pte
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index 16bda049187a..ff98b663c83e 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -116,6 +116,9 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> /*
> * This ensures that generic code that rely on IRQ disabling
> * to prevent a parallel THP split work as expected.
> + *
> + * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
> + * a special case check in pmd_access_permitted.
> */
> serialize_against_pte_lookup(vma->vm_mm);
> return __pmd(old_pmd);
>
-anesh
^ permalink raw reply
* [RFC PATCH] powerpc/64s: __find_linux_pte synchronization vs pmdp_invalidate
From: Nicholas Piggin @ 2019-06-03 6:44 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Aneesh Kumar K . V, Nicholas Piggin
The pmd_none check does not catch hugepage collapse, nor does the
pmd_present check in pmd_trans_huge, because hugepage collapse sets
!_PAGE_PRESENT && _PAGE_INVALID (which results in !pmd_none and
pmd_present).
Aneesh noticed we might need this check as well.
---
arch/powerpc/mm/pgtable.c | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index db4a6253df92..7a702d21400a 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -372,13 +372,20 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
pdshift = PMD_SHIFT;
pmdp = pmd_offset(&pud, ea);
pmd = READ_ONCE(*pmdp);
- /*
- * A hugepage collapse is captured by pmd_none, because
- * it mark the pmd none and do a hpte invalidate.
- */
+
if (pmd_none(pmd))
return NULL;
+#ifdef CONFIG_PPC_BOOK3S_64
+ if (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_INVALID) == _PAGE_INVALID) {
+ /*
+ * A hugepage collapse is captured by this condition, see
+ * pmdp_invalidate.
+ */
+ return NULL;
+ }
+#endif
+
if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
if (is_thp)
*is_thp = true;
--
2.20.1
^ permalink raw reply related
* [PATCH v2] powerpc: pseries/hvconsole: fix stack overread via udbg
From: Daniel Axtens @ 2019-06-03 6:56 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Dmitry Vyukov, Daniel Axtens
While developing kasan for 64-bit book3s, I hit the following stack
over-read.
It occurs because the hypercall to put characters onto the terminal
takes 2 longs (128 bits/16 bytes) of characters at a time, and so
hvc_put_chars would unconditionally copy 16 bytes from the argument
buffer, regardless of supplied length. However, udbg_hvc_putc can
call hvc_put_chars with a single-byte buffer, leading to the error.
[ 0.001931] ================================================================== [150/819]
[ 0.001933] BUG: KASAN: stack-out-of-bounds in hvc_put_chars+0xdc/0x110
[ 0.001934] Read of size 8 at addr c0000000023e7a90 by task swapper/0
[ 0.001934]
[ 0.001935] CPU: 0 PID: 0 Comm: swapper Not tainted 5.2.0-rc2-next-20190528-02824-g048a6ab4835b #113
[ 0.001935] Call Trace:
[ 0.001936] [c0000000023e7790] [c000000001b4a450] dump_stack+0x104/0x154 (unreliable)
[ 0.001937] [c0000000023e77f0] [c0000000006d3524] print_address_description+0xa0/0x30c
[ 0.001938] [c0000000023e7880] [c0000000006d318c] __kasan_report+0x20c/0x224
[ 0.001939] [c0000000023e7950] [c0000000006d19d8] kasan_report+0x18/0x30
[ 0.001940] [c0000000023e7970] [c0000000006d4854] __asan_report_load8_noabort+0x24/0x40
[ 0.001941] [c0000000023e7990] [c0000000001511ac] hvc_put_chars+0xdc/0x110
[ 0.001942] [c0000000023e7a10] [c000000000f81cfc] hvterm_raw_put_chars+0x9c/0x110
[ 0.001943] [c0000000023e7a50] [c000000000f82634] udbg_hvc_putc+0x154/0x200
[ 0.001944] [c0000000023e7b10] [c000000000049c90] udbg_write+0xf0/0x240
[ 0.001945] [c0000000023e7b70] [c0000000002e5d88] console_unlock+0x868/0xd30
[ 0.001946] [c0000000023e7ca0] [c0000000002e6e00] register_console+0x970/0xe90
[ 0.001947] [c0000000023e7d80] [c000000001ff1928] register_early_udbg_console+0xf8/0x114
[ 0.001948] [c0000000023e7df0] [c000000001ff1174] setup_arch+0x108/0x790
[ 0.001948] [c0000000023e7e90] [c000000001fe41c8] start_kernel+0x104/0x784
[ 0.001949] [c0000000023e7f90] [c00000000000b368] start_here_common+0x1c/0x534
[ 0.001950]
[ 0.001950]
[ 0.001951] Memory state around the buggy address:
[ 0.001952] c0000000023e7980: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 0.001952] c0000000023e7a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1
[ 0.001953] >c0000000023e7a80: f1 f1 01 f2 f2 f2 00 00 00 00 00 00 00 00 00 00
[ 0.001953] ^
[ 0.001954] c0000000023e7b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 0.001954] c0000000023e7b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 0.001955] ==================================================================
Document that a 16-byte buffer is requred, and provide it in udbg.
CC: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Daniel Axtens <dja@axtens.net>
---
v2: avoid memcpy, push responsibility to caller.
Solution suggested by mpe.
---
arch/powerpc/platforms/pseries/hvconsole.c | 2 +-
drivers/tty/hvc/hvc_vio.c | 16 +++++++++++++++-
2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
index 74da18de853a..73ec15cd2708 100644
--- a/arch/powerpc/platforms/pseries/hvconsole.c
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -62,7 +62,7 @@ EXPORT_SYMBOL(hvc_get_chars);
* @vtermno: The vtermno or unit_address of the adapter from which the data
* originated.
* @buf: The character buffer that contains the character data to send to
- * firmware.
+ * firmware. Must be at least 16 bytes, even if count is less than 16.
* @count: Send this number of characters.
*/
int hvc_put_chars(uint32_t vtermno, const char *buf, int count)
diff --git a/drivers/tty/hvc/hvc_vio.c b/drivers/tty/hvc/hvc_vio.c
index 6de6d4a1a221..7af54d6ed5b8 100644
--- a/drivers/tty/hvc/hvc_vio.c
+++ b/drivers/tty/hvc/hvc_vio.c
@@ -107,6 +107,14 @@ static int hvterm_raw_get_chars(uint32_t vtermno, char *buf, int count)
return got;
}
+/**
+ * hvterm_raw_put_chars: send characters to firmware for given vterm adapter
+ * @vtermno: The virtual terminal number.
+ * @buf: The characters to send. Because of the underlying hypercall in
+ * hvc_put_chars(), this buffer must be at least 16 bytes long, even if
+ * you are sending fewer chars.
+ * @count: number of chars to send.
+ */
static int hvterm_raw_put_chars(uint32_t vtermno, const char *buf, int count)
{
struct hvterm_priv *pv = hvterm_privs[vtermno];
@@ -219,6 +227,7 @@ static const struct hv_ops hvterm_hvsi_ops = {
static void udbg_hvc_putc(char c)
{
int count = -1;
+ unsigned char bounce_buffer[16];
if (!hvterm_privs[0])
return;
@@ -229,7 +238,12 @@ static void udbg_hvc_putc(char c)
do {
switch(hvterm_privs[0]->proto) {
case HV_PROTOCOL_RAW:
- count = hvterm_raw_put_chars(0, &c, 1);
+ /*
+ * hvterm_raw_put_chars requires at least a 16-byte
+ * buffer, so go via the bounce buffer
+ */
+ bounce_buffer[0] = c;
+ count = hvterm_raw_put_chars(0, bounce_buffer, 1);
break;
case HV_PROTOCOL_HVSI:
count = hvterm_hvsi_put_chars(0, &c, 1);
--
2.19.1
^ permalink raw reply related
* Re: [RFC PATCH] powerpc/book3e: KASAN Full support for 64bit
From: Christophe Leroy @ 2019-06-03 7:06 UTC (permalink / raw)
To: Daniel Axtens, Benjamin Herrenschmidt, Paul Mackerras,
Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <877ea7za12.fsf@dja-thinkpad.axtens.net>
Hi,
Ok, can you share your .config ?
Christophe
Le 31/05/2019 à 03:29, Daniel Axtens a écrit :
> Hi Christophe,
>
> I tried this on the t4240rdb and it fails to boot if KASAN is
> enabled. It does boot with the patch applied but KASAN disabled, so that
> narrows it down a little bit.
>
> I need to focus on 3s first so I'll just drop 3e from my patch set for
> now.
>
> Regards,
> Daniel
>
>> The KASAN shadow area is mapped into vmemmap space:
>> 0x8000 0400 0000 0000 to 0x8000 0600 0000 0000.
>> For this vmemmap has to be disabled.
>>
>> Cc: Daniel Axtens <dja@axtens.net>
>> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
>> ---
>> arch/powerpc/Kconfig | 1 +
>> arch/powerpc/Kconfig.debug | 3 +-
>> arch/powerpc/include/asm/kasan.h | 11 +++
>> arch/powerpc/kernel/Makefile | 2 +
>> arch/powerpc/kernel/head_64.S | 3 +
>> arch/powerpc/kernel/setup_64.c | 20 +++---
>> arch/powerpc/mm/kasan/Makefile | 1 +
>> arch/powerpc/mm/kasan/kasan_init_64.c | 129 ++++++++++++++++++++++++++++++++++
>> 8 files changed, 159 insertions(+), 11 deletions(-)
>> create mode 100644 arch/powerpc/mm/kasan/kasan_init_64.c
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 1a2fb50126b2..e0b7c45e4dc7 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -174,6 +174,7 @@ config PPC
>> select HAVE_ARCH_AUDITSYSCALL
>> select HAVE_ARCH_JUMP_LABEL
>> select HAVE_ARCH_KASAN if PPC32
>> + select HAVE_ARCH_KASAN if PPC_BOOK3E_64 && !SPARSEMEM_VMEMMAP
>> select HAVE_ARCH_KGDB
>> select HAVE_ARCH_MMAP_RND_BITS
>> select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
>> diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
>> index 61febbbdd02b..b4140dd6b4e4 100644
>> --- a/arch/powerpc/Kconfig.debug
>> +++ b/arch/powerpc/Kconfig.debug
>> @@ -370,4 +370,5 @@ config PPC_FAST_ENDIAN_SWITCH
>> config KASAN_SHADOW_OFFSET
>> hex
>> depends on KASAN
>> - default 0xe0000000
>> + default 0xe0000000 if PPC32
>> + default 0x6800040000000000 if PPC64
>> diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h
>> index 296e51c2f066..756b3d58f921 100644
>> --- a/arch/powerpc/include/asm/kasan.h
>> +++ b/arch/powerpc/include/asm/kasan.h
>> @@ -23,10 +23,21 @@
>>
>> #define KASAN_SHADOW_OFFSET ASM_CONST(CONFIG_KASAN_SHADOW_OFFSET)
>>
>> +#ifdef CONFIG_PPC32
>> #define KASAN_SHADOW_END 0UL
>>
>> #define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START)
>>
>> +#else
>> +
>> +#include <asm/pgtable.h>
>> +
>> +#define KASAN_SHADOW_SIZE (KERN_VIRT_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
>> +
>> +#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
>> +
>> +#endif /* CONFIG_PPC32 */
>> +
>> #ifdef CONFIG_KASAN
>> void kasan_early_init(void);
>> void kasan_mmu_init(void);
>> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
>> index 0ea6c4aa3a20..7f232c06f11d 100644
>> --- a/arch/powerpc/kernel/Makefile
>> +++ b/arch/powerpc/kernel/Makefile
>> @@ -35,6 +35,8 @@ KASAN_SANITIZE_early_32.o := n
>> KASAN_SANITIZE_cputable.o := n
>> KASAN_SANITIZE_prom_init.o := n
>> KASAN_SANITIZE_btext.o := n
>> +KASAN_SANITIZE_paca.o := n
>> +KASAN_SANITIZE_setup_64.o := n
>>
>> ifdef CONFIG_KASAN
>> CFLAGS_early_32.o += -DDISABLE_BRANCH_PROFILING
>> diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
>> index 3fad8d499767..80fbd8024fb2 100644
>> --- a/arch/powerpc/kernel/head_64.S
>> +++ b/arch/powerpc/kernel/head_64.S
>> @@ -966,6 +966,9 @@ start_here_multiplatform:
>> * and SLB setup before we turn on relocation.
>> */
>>
>> +#ifdef CONFIG_KASAN
>> + bl kasan_early_init
>> +#endif
>> /* Restore parameters passed from prom_init/kexec */
>> mr r3,r31
>> bl early_setup /* also sets r13 and SPRG_PACA */
>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>> index ba404dd9ce1d..d2bf860dd966 100644
>> --- a/arch/powerpc/kernel/setup_64.c
>> +++ b/arch/powerpc/kernel/setup_64.c
>> @@ -311,6 +311,16 @@ void __init early_setup(unsigned long dt_ptr)
>> DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr);
>>
>> /*
>> + * Configure exception handlers. This include setting up trampolines
>> + * if needed, setting exception endian mode, etc...
>> + */
>> + configure_exceptions();
>> +
>> + /* Apply all the dynamic patching */
>> + apply_feature_fixups();
>> + setup_feature_keys();
>> +
>> + /*
>> * Do early initialization using the flattened device
>> * tree, such as retrieving the physical memory map or
>> * calculating/retrieving the hash table size.
>> @@ -325,16 +335,6 @@ void __init early_setup(unsigned long dt_ptr)
>> setup_paca(paca_ptrs[boot_cpuid]);
>> fixup_boot_paca();
>>
>> - /*
>> - * Configure exception handlers. This include setting up trampolines
>> - * if needed, setting exception endian mode, etc...
>> - */
>> - configure_exceptions();
>> -
>> - /* Apply all the dynamic patching */
>> - apply_feature_fixups();
>> - setup_feature_keys();
>> -
>> /* Initialize the hash table or TLB handling */
>> early_init_mmu();
>>
>> diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
>> index 6577897673dd..0bfbe3892808 100644
>> --- a/arch/powerpc/mm/kasan/Makefile
>> +++ b/arch/powerpc/mm/kasan/Makefile
>> @@ -3,3 +3,4 @@
>> KASAN_SANITIZE := n
>>
>> obj-$(CONFIG_PPC32) += kasan_init_32.o
>> +obj-$(CONFIG_PPC64) += kasan_init_64.o
>> diff --git a/arch/powerpc/mm/kasan/kasan_init_64.c b/arch/powerpc/mm/kasan/kasan_init_64.c
>> new file mode 100644
>> index 000000000000..7fd71b8e883b
>> --- /dev/null
>> +++ b/arch/powerpc/mm/kasan/kasan_init_64.c
>> @@ -0,0 +1,129 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#define DISABLE_BRANCH_PROFILING
>> +
>> +#include <linux/kasan.h>
>> +#include <linux/printk.h>
>> +#include <linux/memblock.h>
>> +#include <linux/sched/task.h>
>> +#include <asm/pgalloc.h>
>> +
>> +static void __init kasan_populate_pte(pte_t *ptep, pgprot_t prot)
>> +{
>> + unsigned long va = (unsigned long)kasan_early_shadow_page;
>> + phys_addr_t pa = __pa(kasan_early_shadow_page);
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PTE; i++, ptep++)
>> + __set_pte_at(&init_mm, va, ptep, pfn_pte(PHYS_PFN(pa), prot), 0);
>> +}
>> +
>> +static void __init kasan_populate_pmd(pmd_t *pmdp)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PMD; i++)
>> + pmd_populate_kernel(&init_mm, pmdp + i, kasan_early_shadow_pte);
>> +}
>> +
>> +static void __init kasan_populate_pud(pud_t *pudp)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PUD; i++)
>> + pud_populate(&init_mm, pudp + i, kasan_early_shadow_pmd);
>> +}
>> +
>> +static void __init *kasan_alloc_pgtable(unsigned long size)
>> +{
>> + void *ptr = memblock_alloc_try_nid(size, size, MEMBLOCK_LOW_LIMIT,
>> + __pa(MAX_DMA_ADDRESS), NUMA_NO_NODE);
>> +
>> + if (!ptr)
>> + panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%lx\n",
>> + __func__, size, size, __pa(MAX_DMA_ADDRESS));
>> +
>> + return ptr;
>> +}
>> +
>> +static int __init kasan_map_page(unsigned long va, unsigned long pa, pgprot_t prot)
>> +{
>> + pgd_t *pgdp = pgd_offset_k(va);
>> + pud_t *pudp;
>> + pmd_t *pmdp;
>> + pte_t *ptep;
>> +
>> + if (pgd_none(*pgdp) || (void *)pgd_page_vaddr(*pgdp) == kasan_early_shadow_pud) {
>> + pudp = kasan_alloc_pgtable(PUD_TABLE_SIZE);
>> + kasan_populate_pud(pudp);
>> + pgd_populate(&init_mm, pgdp, pudp);
>> + }
>> + pudp = pud_offset(pgdp, va);
>> + if (pud_none(*pudp) || (void *)pud_page_vaddr(*pudp) == kasan_early_shadow_pmd) {
>> + pmdp = kasan_alloc_pgtable(PMD_TABLE_SIZE);
>> + kasan_populate_pmd(pmdp);
>> + pud_populate(&init_mm, pudp, pmdp);
>> + }
>> + pmdp = pmd_offset(pudp, va);
>> + if (!pmd_present(*pmdp) || (void *)pmd_page_vaddr(*pmdp) == kasan_early_shadow_pte) {
>> + ptep = kasan_alloc_pgtable(PTE_TABLE_SIZE);
>> + kasan_populate_pte(ptep, PAGE_KERNEL);
>> + pmd_populate_kernel(&init_mm, pmdp, ptep);
>> + }
>> + ptep = pte_offset_kernel(pmdp, va);
>> +
>> + __set_pte_at(&init_mm, va, ptep, pfn_pte(pa >> PAGE_SHIFT, prot), 0);
>> +
>> + return 0;
>> +}
>> +
>> +static void __init kasan_init_region(struct memblock_region *reg)
>> +{
>> + void *start = __va(reg->base);
>> + void *end = __va(reg->base + reg->size);
>> + unsigned long k_start, k_end, k_cur;
>> +
>> + if (start >= end)
>> + return;
>> +
>> + k_start = (unsigned long)kasan_mem_to_shadow(start);
>> + k_end = (unsigned long)kasan_mem_to_shadow(end);
>> +
>> + for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE) {
>> + void *va = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
>> +
>> + kasan_map_page(k_cur, __pa(va), PAGE_KERNEL);
>> + }
>> + flush_tlb_kernel_range(k_start, k_end);
>> +}
>> +
>> +void __init kasan_init(void)
>> +{
>> + struct memblock_region *reg;
>> +
>> + for_each_memblock(memory, reg)
>> + kasan_init_region(reg);
>> +
>> + /* It's too early to use clear_page() ! */
>> + memset(kasan_early_shadow_page, 0, sizeof(kasan_early_shadow_page));
>> +
>> + /* Enable error messages */
>> + init_task.kasan_depth = 0;
>> + pr_info("KASAN init done\n");
>> +}
>> +
>> +/* The early shadow maps everything to a single page of zeroes */
>> +asmlinkage void __init kasan_early_init(void)
>> +{
>> + unsigned long addr = KASAN_SHADOW_START;
>> + unsigned long end = KASAN_SHADOW_END;
>> + pgd_t *pgdp = pgd_offset_k(addr);
>> +
>> + kasan_populate_pte(kasan_early_shadow_pte, PAGE_KERNEL);
>> + kasan_populate_pmd(kasan_early_shadow_pmd);
>> + kasan_populate_pud(kasan_early_shadow_pud);
>> +
>> + do {
>> + pgd_populate(&init_mm, pgdp, kasan_early_shadow_pud);
>> + } while (pgdp++, addr = pgd_addr_end(addr, end), addr != end);
>> +}
>> --
>> 2.13.3
^ permalink raw reply
* RE: [next-20190530] Boot failure on PowerPC
From: Lili Deng (Wicresoft North America Ltd) @ 2019-06-03 7:15 UTC (permalink / raw)
To: Sachin Sant, Dexuan-Linux Cui
Cc: linux-next@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
Dexuan Cui
In-Reply-To: <KL1P15301MB02948DA3C780A2DC4A145E0F9E140@KL1P15301MB0294.APCP153.PROD.OUTLOOK.COM>
ACK for ' but it fixes the get_swap_device warning messages during the boot.'
Double check for kernel panic issue, without this patch, it seems not repro in my local manual environment, so please ignore my previous mail for 'the patch fixes the kernel panic'.
Sorry for the confusion.
-----Original Message-----
From: Lili Deng (Wicresoft North America Ltd)
Sent: Monday, June 3, 2019 11:24 AM
To: Sachin Sant <sachinp@linux.vnet.ibm.com>; Dexuan-Linux Cui <dexuan.linux@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>; linuxppc-dev@lists.ozlabs.org; linux-next@vger.kernel.org; Dexuan Cui <decui@microsoft.com>
Subject: RE: [next-20190530] Boot failure on PowerPC
Hi Sachin,
I verified below patch against Ubuntu 18.04, didn't hit the kernel panic any more, could you please let know how did you verify?
Thanks,
Lili
-----Original Message-----
From: Sachin Sant <sachinp@linux.vnet.ibm.com>
Sent: Monday, June 3, 2019 11:12 AM
To: Dexuan-Linux Cui <dexuan.linux@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>; linuxppc-dev@lists.ozlabs.org; linux-next@vger.kernel.org; Dexuan Cui <decui@microsoft.com>; Lili Deng (Wicresoft North America Ltd) <v-lide@microsoft.com>
Subject: Re: [next-20190530] Boot failure on PowerPC
> On 31-May-2019, at 11:43 PM, Dexuan-Linux Cui <dexuan.linux@gmail.com> wrote:
>
> On Fri, May 31, 2019 at 6:52 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>
>>> Machine boots till login prompt and then panics few seconds later.
>>>
>>> Last known next build was May 24th. Will attempt few builds till May
>>> 30 to narrow down this problem.
>>
>> My CI was fine with next-20190529 (9a15d2e3fd03e3).
>>
>> cheers
>
> Hi Sachin,
> It looks this patch may fix the issue:
> https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Flkml.org%2Flkml%2F2019%2F5%2F30%2F1630&data=02%7C01%7Cv-lide%40microsoft.com%7C66e1ef6017aa461703f808d6e7d148cd%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C1%7C636951283393233385&sdata=IJFhtvL2Bd87HCoMZ7oWL%2Bar6NY%2FfPbmdCZMT%2BJz5t4%3D&reserved=0 , but I'm not sure.
It does not help fix the kernel panic issue, but it fixes the get_swap_device warning messages during the boot.
Thanks
-Sachin
^ permalink raw reply
* Re: [WIP RFC PATCH 0/6] Generic Firmware Variable Filesystem
From: Greg KH @ 2019-06-03 7:29 UTC (permalink / raw)
To: Daniel Axtens; +Cc: linux-fsdevel, nayna, Nayna, linuxppc-dev, cclaudio
In-Reply-To: <87zhmzxkzz.fsf@dja-thinkpad.axtens.net>
On Mon, Jun 03, 2019 at 04:04:32PM +1000, Daniel Axtens wrote:
> Hi Nayna,
>
> >> As PowerNV moves towards secure boot, we need a place to put secure
> >> variables. One option that has been canvassed is to make our secure
> >> variables look like EFI variables. This is an early sketch of another
> >> approach where we create a generic firmware variable file system,
> >> fwvarfs, and an OPAL Secure Variable backend for it.
> >
> > Is there a need of new filesystem ? I am wondering why can't these be
> > exposed via sysfs / securityfs ?
> > Probably, something like... /sys/firmware/secureboot or
> > /sys/kernel/security/secureboot/ ?
>
> I suppose we could put secure variables in sysfs, but I'm not sure
> that's what sysfs was intended for. I understand sysfs as "a
> filesystem-based view of kernel objects" (from
> Documentation/filesystems/configfs/configfs.txt), and I don't think a
> secure variable is really a kernel object in the same way most other
> things in sysfs are... but I'm open to being convinced.
What makes them more "secure" than anything else that is in sysfs today?
I didn't see anything in this patchset that provided "additional
security", did I miss it?
> securityfs seems to be reserved for LSMs, I don't think we can put
> things there.
Yeah, I wouldn't mess with that.
I would just recommend putting this in sysfs. Make a new subsystem
(i.e. class) and away you go.
> My hope with fwvarfs is to provide a generic place for firmware
> variables so that we don't need to expand the list of firmware-specific
> filesystems beyond efivarfs. I am also aiming to make things simple to
> use so that people familiar with firmware don't also have to become
> familiar with filesystem code in order to expose firmware variables to
> userspace.
Why would anyone need to be writing new code to firmware variables that
makes it any different from any other kernel change?
> > Also, it sounds like this is needed only for secure firmware variables
> > and does not include
> > other firmware variables which are not security relevant ? Is that
> > correct understanding ?
>
> The primary use case at the moment - OPAL secure variables - is security
> focused because the current OPAL secure variable design stores and
> manipulates secure variables separately from the rest of nvram. This
> isn't an inherent feature of fwvarfs.
Again, why not just put it in sysfs please?
> fwvarfs can also be used for variables that are not security relevant as
> well. For example, with the EFI backend (patch 3), both secure and
> insecure variables can be read.
I don't remember why efi variables were not put in sysfs, I think there
was some reasoning behind it originally. Perhaps look in the linux-efi
archives.
thanks,
greg k-h
^ permalink raw reply
* Re: [PATCH 09/22] docs: mark orphan documents as such
From: Christophe Leroy @ 2019-06-03 7:32 UTC (permalink / raw)
To: Mauro Carvalho Chehab, Linux Doc Mailing List
Cc: kvm, Radim Krčmář, Maxime Ripard, dri-devel,
platform-driver-x86, Paul Mackerras, linux-stm32,
Andrew Donnellan, Jonathan Corbet, David Airlie, Alexandre Torgue,
linux-pm, Maarten Lankhorst, Matan Ziv-Av, Mauro Carvalho Chehab,
Daniel Vetter, Sean Paul, linux-arm-kernel, linux-kernel,
Maxime Coquelin, Frederic Barrat, Paolo Bonzini, linuxppc-dev,
Georgi Djakov
In-Reply-To: <e0bf4e767dd5de9189e5993fbec2f4b1bafd2064.1559171394.git.mchehab+samsung@kernel.org>
Le 30/05/2019 à 01:23, Mauro Carvalho Chehab a écrit :
> Sphinx doesn't like orphan documents:
>
> Documentation/accelerators/ocxl.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/overview.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/stm32f429-overview.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/stm32f746-overview.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/stm32f769-overview.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/stm32h743-overview.rst: WARNING: document isn't included in any toctree
> Documentation/arm/stm32/stm32mp157-overview.rst: WARNING: document isn't included in any toctree
> Documentation/gpu/msm-crash-dump.rst: WARNING: document isn't included in any toctree
> Documentation/interconnect/interconnect.rst: WARNING: document isn't included in any toctree
> Documentation/laptops/lg-laptop.rst: WARNING: document isn't included in any toctree
> Documentation/powerpc/isa-versions.rst: WARNING: document isn't included in any toctree
> Documentation/virtual/kvm/amd-memory-encryption.rst: WARNING: document isn't included in any toctree
> Documentation/virtual/kvm/vcpu-requests.rst: WARNING: document isn't included in any toctree
>
> So, while they aren't on any toctree, add :orphan: to them, in order
> to silent this warning.
Are those files really not meant to be included in a toctree ?
Shouldn't we include them in the relevant toctree instead of just
shutting up Sphinx warnings ?
Christophe
>
> Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
> ---
> Documentation/accelerators/ocxl.rst | 2 ++
> Documentation/arm/stm32/overview.rst | 2 ++
> Documentation/arm/stm32/stm32f429-overview.rst | 2 ++
> Documentation/arm/stm32/stm32f746-overview.rst | 2 ++
> Documentation/arm/stm32/stm32f769-overview.rst | 2 ++
> Documentation/arm/stm32/stm32h743-overview.rst | 2 ++
> Documentation/arm/stm32/stm32mp157-overview.rst | 2 ++
> Documentation/gpu/msm-crash-dump.rst | 2 ++
> Documentation/interconnect/interconnect.rst | 2 ++
> Documentation/laptops/lg-laptop.rst | 2 ++
> Documentation/powerpc/isa-versions.rst | 2 ++
> Documentation/virtual/kvm/amd-memory-encryption.rst | 2 ++
> Documentation/virtual/kvm/vcpu-requests.rst | 2 ++
> 13 files changed, 26 insertions(+)
>
> diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst
> index 14cefc020e2d..b1cea19a90f5 100644
> --- a/Documentation/accelerators/ocxl.rst
> +++ b/Documentation/accelerators/ocxl.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> ========================================================
> OpenCAPI (Open Coherent Accelerator Processor Interface)
> ========================================================
> diff --git a/Documentation/arm/stm32/overview.rst b/Documentation/arm/stm32/overview.rst
> index 85cfc8410798..f7e734153860 100644
> --- a/Documentation/arm/stm32/overview.rst
> +++ b/Documentation/arm/stm32/overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> ========================
> STM32 ARM Linux Overview
> ========================
> diff --git a/Documentation/arm/stm32/stm32f429-overview.rst b/Documentation/arm/stm32/stm32f429-overview.rst
> index 18feda97f483..65bbb1c3b423 100644
> --- a/Documentation/arm/stm32/stm32f429-overview.rst
> +++ b/Documentation/arm/stm32/stm32f429-overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> STM32F429 Overview
> ==================
>
> diff --git a/Documentation/arm/stm32/stm32f746-overview.rst b/Documentation/arm/stm32/stm32f746-overview.rst
> index b5f4b6ce7656..42d593085015 100644
> --- a/Documentation/arm/stm32/stm32f746-overview.rst
> +++ b/Documentation/arm/stm32/stm32f746-overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> STM32F746 Overview
> ==================
>
> diff --git a/Documentation/arm/stm32/stm32f769-overview.rst b/Documentation/arm/stm32/stm32f769-overview.rst
> index 228656ced2fe..f6adac862b17 100644
> --- a/Documentation/arm/stm32/stm32f769-overview.rst
> +++ b/Documentation/arm/stm32/stm32f769-overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> STM32F769 Overview
> ==================
>
> diff --git a/Documentation/arm/stm32/stm32h743-overview.rst b/Documentation/arm/stm32/stm32h743-overview.rst
> index 3458dc00095d..c525835e7473 100644
> --- a/Documentation/arm/stm32/stm32h743-overview.rst
> +++ b/Documentation/arm/stm32/stm32h743-overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> STM32H743 Overview
> ==================
>
> diff --git a/Documentation/arm/stm32/stm32mp157-overview.rst b/Documentation/arm/stm32/stm32mp157-overview.rst
> index 62e176d47ca7..2c52cd020601 100644
> --- a/Documentation/arm/stm32/stm32mp157-overview.rst
> +++ b/Documentation/arm/stm32/stm32mp157-overview.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> STM32MP157 Overview
> ===================
>
> diff --git a/Documentation/gpu/msm-crash-dump.rst b/Documentation/gpu/msm-crash-dump.rst
> index 757cd257e0d8..240ef200f76c 100644
> --- a/Documentation/gpu/msm-crash-dump.rst
> +++ b/Documentation/gpu/msm-crash-dump.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> =====================
> MSM Crash Dump Format
> =====================
> diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/interconnect/interconnect.rst
> index c3e004893796..56e331dab70e 100644
> --- a/Documentation/interconnect/interconnect.rst
> +++ b/Documentation/interconnect/interconnect.rst
> @@ -1,5 +1,7 @@
> .. SPDX-License-Identifier: GPL-2.0
>
> +:orphan:
> +
> =====================================
> GENERIC SYSTEM INTERCONNECT SUBSYSTEM
> =====================================
> diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst
> index aa503ee9b3bc..f2c2ffe31101 100644
> --- a/Documentation/laptops/lg-laptop.rst
> +++ b/Documentation/laptops/lg-laptop.rst
> @@ -1,5 +1,7 @@
> .. SPDX-License-Identifier: GPL-2.0+
>
> +:orphan:
> +
> LG Gram laptop extra features
> =============================
>
> diff --git a/Documentation/powerpc/isa-versions.rst b/Documentation/powerpc/isa-versions.rst
> index 812e20cc898c..66c24140ebf1 100644
> --- a/Documentation/powerpc/isa-versions.rst
> +++ b/Documentation/powerpc/isa-versions.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> CPU to ISA Version Mapping
> ==========================
>
> diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virtual/kvm/amd-memory-encryption.rst
> index 659bbc093b52..33d697ab8a58 100644
> --- a/Documentation/virtual/kvm/amd-memory-encryption.rst
> +++ b/Documentation/virtual/kvm/amd-memory-encryption.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> ======================================
> Secure Encrypted Virtualization (SEV)
> ======================================
> diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virtual/kvm/vcpu-requests.rst
> index 5feb3706a7ae..c1807a1b92e6 100644
> --- a/Documentation/virtual/kvm/vcpu-requests.rst
> +++ b/Documentation/virtual/kvm/vcpu-requests.rst
> @@ -1,3 +1,5 @@
> +:orphan:
> +
> =================
> KVM VCPU Requests
> =================
>
^ permalink raw reply
* Re: [PATCH] powerpc/64s: Fix THP PMD collapse serialisation
From: Nicholas Piggin @ 2019-06-03 7:33 UTC (permalink / raw)
To: Aneesh Kumar K.V, linuxppc-dev
In-Reply-To: <bb7eabc7-8dbb-14e9-f797-6dfd339bb0ba@linux.ibm.com>
Aneesh Kumar K.V's on June 3, 2019 4:43 pm:
> On 6/3/19 11:35 AM, Nicholas Piggin wrote:
>> Commit 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion
>> in pte helpers") changed the actual bitwise tests in pte_access_permitted
>> by using pte_write() and pte_present() helpers rather than raw bitwise
>> testing _PAGE_WRITE and _PAGE_PRESENT bits.
>>
>> The pte_present change now returns true for ptes which are !_PAGE_PRESENT
>> and _PAGE_INVALID, which is the combination used by pmdp_invalidate to
>> synchronize access from lock-free lookups. pte_access_permitted is used by
>> pmd_access_permitted, so allowing GUP lock free access to proceed with
>> such PTEs breaks this synchronisation.
>>
>> This bug has been observed on HPT host, with random crashes and corruption
>> in guests, usually together with bad PMD messages in the host.
>>
>> Fix this by adding an explicit check in pmd_access_permitted, and
>> documenting the condition explicitly.
>>
>> The pte_write() change should be okay, and would prevent GUP from falling
>> back to the slow path when encountering savedwrite ptes, which matches
>> what x86 (that does not implement savedwrite) does.
>>
>> Fixes: 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion in pte helpers")
>> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> Cc: Christophe Leroy <christophe.leroy@c-s.fr>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> arch/powerpc/include/asm/book3s/64/pgtable.h | 19 ++++++++++++++++++-
>> arch/powerpc/mm/book3s64/pgtable.c | 3 +++
>> 2 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 7dede2e34b70..aaa72aa1b765 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -1092,7 +1092,24 @@ static inline int pmd_protnone(pmd_t pmd)
>> #define pmd_access_permitted pmd_access_permitted
>> static inline bool pmd_access_permitted(pmd_t pmd, bool write)
>> {
>> - return pte_access_permitted(pmd_pte(pmd), write);
>> + pte_t pte = pmd_pte(pmd);
>> + unsigned long pteval = pte_val(pte);
>> +
>> + /*
>> + * pmdp_invalidate sets this combination (that is not caught by
>> + * !pte_present() check in pte_access_permitted), to prevent
>> + * lock-free lookups, as part of the serialize_against_pte_lookup()
>> + * synchronisation.
>> + *
>> + * This check inadvertently catches the case where the PTE's hardware
>> + * PRESENT bit is cleared while TLB is flushed, to work around
>> + * hardware TLB issues. This is suboptimal, but should not be hit
>> + * frequently and should be harmless.
>> + */
>> + if ((pteval & _PAGE_INVALID) && !(pteval & _PAGE_PRESENT))
>> + return false;
>> +
>> + return pte_access_permitted(pte, write);
>> }
>>
>
>
> you need to do similar for other lockless page table walk like
> find_linux_pte
Yeah good point as discussed offline. I was going to make that a
separate patch, it would have a different Fixes:. I have not been
able to trigger any bugs caused by it, whereas the bug caused by
this patch hits reliably in about 10 minutes or less.
Maybe the race window is just a lot smaller or the function is
less frequently used?
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH 03/16] mm: simplify gup_fast_permitted
From: Christoph Hellwig @ 2019-06-03 7:41 UTC (permalink / raw)
To: Linus Torvalds
Cc: Rich Felker, Yoshinori Sato, Linux-sh list, James Hogan,
the arch/x86 maintainers, Khalid Aziz, Nicholas Piggin,
David S. Miller, Linux-MM, Paul Burton, Paul Mackerras,
Andrey Konovalov, sparclinux, linux-mips, linuxppc-dev,
Christoph Hellwig, Linux List Kernel Mailing
In-Reply-To: <CAHk-=whusWKhS=SYoC9f9HjVmPvR5uP51Mq=ZCtktqTBT2qiBw@mail.gmail.com>
On Sat, Jun 01, 2019 at 09:14:17AM -0700, Linus Torvalds wrote:
> On Sat, Jun 1, 2019 at 12:50 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > Pass in the already calculated end value instead of recomputing it, and
> > leave the end > start check in the callers instead of duplicating them
> > in the arch code.
>
> Good cleanup, except it's wrong.
>
> > - if (nr_pages <= 0)
> > + if (end < start)
> > return 0;
>
> You moved the overflow test to generic code - good.
>
> You removed the sign and zero test on nr_pages - bad.
I only removed a duplicate of it. The full (old) code in
get_user_pages_fast() looks like this:
if (nr_pages <= 0)
return 0;
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
if (gup_fast_permitted(start, nr_pages)) {
^ permalink raw reply
* Re: [RFC PATCH] powerpc/64s: __find_linux_pte synchronization vs pmdp_invalidate
From: Christophe Leroy @ 2019-06-03 7:41 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev; +Cc: Aneesh Kumar K . V
In-Reply-To: <20190603064408.14735-1-npiggin@gmail.com>
Le 03/06/2019 à 08:44, Nicholas Piggin a écrit :
> The pmd_none check does not catch hugepage collapse, nor does the
> pmd_present check in pmd_trans_huge, because hugepage collapse sets
> !_PAGE_PRESENT && _PAGE_INVALID (which results in !pmd_none and
> pmd_present).
>
> Aneesh noticed we might need this check as well.
>
> ---
> arch/powerpc/mm/pgtable.c | 15 +++++++++++----
> 1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index db4a6253df92..7a702d21400a 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -372,13 +372,20 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
> pdshift = PMD_SHIFT;
> pmdp = pmd_offset(&pud, ea);
> pmd = READ_ONCE(*pmdp);
> - /*
> - * A hugepage collapse is captured by pmd_none, because
> - * it mark the pmd none and do a hpte invalidate.
> - */
> +
> if (pmd_none(pmd))
> return NULL;
>
> +#ifdef CONFIG_PPC_BOOK3S_64
I can't see anything that would build fail on other subarches. Wouldn't
be better to use
if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && (pmd_val(pmd) &
(_PAGE_PRESENT|_PAGE_INVALID) == _PAGE_INVALID))
> + if (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_INVALID) == _PAGE_INVALID) {
Maybe using pmd_raw() instead as in all the book3s64 helpers ?
Christophe
> + /*
> + * A hugepage collapse is captured by this condition, see
> + * pmdp_invalidate.
> + */
> + return NULL;
> + }
> +#endif
> +
> if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
> if (is_thp)
> *is_thp = true;
>
^ permalink raw reply
* Re: [PATCH 08/16] sparc64: add the missing pgd_page definition
From: Christoph Hellwig @ 2019-06-03 7:44 UTC (permalink / raw)
To: Linus Torvalds
Cc: Rich Felker, Yoshinori Sato, Linux-sh list, James Hogan,
the arch/x86 maintainers, Khalid Aziz, Nicholas Piggin,
David S. Miller, Linux-MM, Paul Burton, Paul Mackerras,
Andrey Konovalov, sparclinux, linux-mips, linuxppc-dev,
Christoph Hellwig, Linux List Kernel Mailing
In-Reply-To: <CAHk-=wj9w5NxTcJsqpvYUiL3OBOH-J3=4-vXcc3GaG_U8H-gJw@mail.gmail.com>
On Sat, Jun 01, 2019 at 09:28:54AM -0700, Linus Torvalds wrote:
> Both sparc64 and sh had this pattern, but now that I look at it more
> closely, I think your version is wrong, or at least nonoptimal.
I bet it is. Then again these symbols are just required for the code
to compile, as neither sparc64 nor sh actually use the particular
variant of huge pages we need it for. Then again even actually dead
code should better be not too buggy if it isn't just a stub.
> So I thgink this would be better done with
>
> #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd))
>
> where that "pgd_pfn()" would need to be a new (but likely very
> trivial) function. That's what we do for pte_pfn().
>
> IOW, it would likely end up something like
>
> #define pgd_to_pfn(pgd) (pgd_val(x) >> PFN_PGD_SHIFT)
True. I guess it would be best if we could get most if not all
architectures to use common versions of these macros so that we have
the issue settled once.
^ permalink raw reply
* Re: [PATCH 10/16] sparc64: use the generic get_user_pages_fast code
From: Christoph Hellwig @ 2019-06-03 7:44 UTC (permalink / raw)
To: Hillf Danton
Cc: x86, Rich Felker, Yoshinori Sato, linux-sh, James Hogan,
linuxppc-dev, Khalid Aziz, Nicholas Piggin, David S. Miller,
linux-mm, Paul Burton, Paul Mackerras, Andrey Konovalov,
sparclinux, linux-mips, Linus Torvalds, Christoph Hellwig,
linux-kernel
In-Reply-To: <20190601074959.14036-11-hch@lst.de>
On Sun, Jun 02, 2019 at 03:39:48PM +0800, Hillf Danton wrote:
>
> Hi Christoph
>
> On Sat, 1 Jun 2019 09:49:53 +0200 Christoph Hellwig wrote:
> >
> > diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> > index a93eca29e85a..2301ab5250e4 100644
> > --- a/arch/sparc/include/asm/pgtable_64.h
> > +++ b/arch/sparc/include/asm/pgtable_64.h
> > @@ -1098,6 +1098,24 @@ static inline unsigned long untagged_addr(unsigned long start)
> > }
> > #define untagged_addr untagged_addr
> >
> > +static inline bool pte_access_permitted(pte_t pte, bool write)
> > +{
> > + u64 prot;
> > +
> > + if (tlb_type == hypervisor) {
> > + prot = _PAGE_PRESENT_4V | _PAGE_P_4V;
> > + if (prot)
>
> Feel free to correct me if I misread or miss anything.
> It looks like a typo: s/prot/write/, as checking _PAGE_PRESENT_4V and
> _PAGE_P_4V makes prot always have _PAGE_WRITE_4V set, regardless of write.
True, the if prot should be if write.
^ permalink raw reply
* Re: [PATCH BACKPORTv2 4.19, 5.0, 5.1] crypto: vmx - ghash: do nosimd fallback manually
From: Greg KH @ 2019-06-03 7:54 UTC (permalink / raw)
To: Daniel Axtens; +Cc: linuxppc-dev, Herbert Xu, stable
In-Reply-To: <20190603020848.9598-1-dja@axtens.net>
On Mon, Jun 03, 2019 at 12:08:48PM +1000, Daniel Axtens wrote:
> commit 357d065a44cdd77ed5ff35155a989f2a763e96ef upstream.
> [backported: the VMX driver did not use crypto_simd_usable() until
> after 5.1]
>
> VMX ghash was using a fallback that did not support interleaving simd
> and nosimd operations, leading to failures in the extended test suite.
>
> If I understood correctly, Eric's suggestion was to use the same
> data format that the generic code uses, allowing us to call into it
> with the same contexts. I wasn't able to get that to work - I think
> there's a very different key structure and data layout being used.
>
> So instead steal the arm64 approach and perform the fallback
> operations directly if required.
>
> Fixes: cc333cd68dfa ("crypto: vmx - Adding GHASH routines for VMX module")
> Cc: stable@vger.kernel.org # v4.1+
> Reported-by: Eric Biggers <ebiggers@google.com>
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
> Tested-by: Michael Ellerman <mpe@ellerman.id.au>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> Signed-off-by: Daniel Axtens <dja@axtens.net>
> ---
>
> v2: do stable backport form correctly.
Thanks for all of these, now queued up.
greg k-h
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox