LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v2] powerpc: select ARCH_HAS_MEMBARRIER_SYNC_CORE
From: Nicholas Piggin @ 2020-07-16  0:56 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: linux-arch, Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <87zh816qgv.fsf@igel.home>

Excerpts from Andreas Schwab's message of July 15, 2020 8:08 pm:
> On Jul 15 2020, Nicholas Piggin wrote:
> 
>> diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
>> index 54a98ef7d7fe..071d7ccb830f 100644
>> --- a/arch/powerpc/include/asm/exception-64e.h
>> +++ b/arch/powerpc/include/asm/exception-64e.h
>> @@ -204,7 +204,11 @@ exc_##label##_book3e:
>>  	LOAD_REG_ADDR(r3,interrupt_base_book3e);\
>>  	ori	r3,r3,vector_offset@l;		\
>>  	mtspr	SPRN_IVOR##vector_number,r3;
>> -
>> +/*
>> + * powerpc relies on return from interrupt/syscall being context synchronising
>> + * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional
>> + * additional synchronisation instructions.
> 
> s/additonal//

Will fix in a respin.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v2] powerpc: select ARCH_HAS_MEMBARRIER_SYNC_CORE
From: Nicholas Piggin @ 2020-07-16  0:57 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arch, Peter Zijlstra, linuxppc-dev, Andy Lutomirski
In-Reply-To: <849841781.14062.1594816035327.JavaMail.zimbra@efficios.com>

Excerpts from Mathieu Desnoyers's message of July 15, 2020 10:27 pm:
> ----- On Jul 15, 2020, at 5:48 AM, Nicholas Piggin npiggin@gmail.com wrote:
> [...]
>> index 47bd4ea0837d..a4704f405e8d 100644
>> --- a/arch/powerpc/include/asm/exception-64s.h
>> +++ b/arch/powerpc/include/asm/exception-64s.h
>> @@ -68,6 +68,13 @@
>>  *
>>  * The nop instructions allow us to insert one or more instructions to flush the
>>  * L1-D cache when returning to userspace or a guest.
>> + *
>> + * powerpc relies on return from interrupt/syscall being context synchronising
>> + * (which hrfid, rfid, and rfscv are) to support ARCH_HAS_MEMBARRIER_SYNC_CORE
>> + * without additional additional synchronisation instructions. soft-masked
>> + * interrupt replay does not include a context-synchronising rfid, but those
>> + * always return to kernel, the context sync is only required for IPIs which
>> + * return to user.
>>  */
>> #define RFI_FLUSH_SLOT							\
>> 	RFI_FLUSH_FIXUP_SECTION;					\
> 
> I suspect the statement "the context sync is only required for IPIs which return to
> user." is misleading.
> 
> As I recall that we need more than just context sync after IPI. We need context sync
> in return path of any trap/interrupt/system call which returns to user-space, else
> we'd need to add the proper core serializing barriers in the scheduler, as we had
> to do for lazy tlb on x86.
> 
> Or am I missing something ?

Maybe ambiguous wording. For IPIs, the context synch is only required 
for those which return to user. Other things also require context sync.

I will try to improve it.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH net-next] ibmvnic: Increase driver logging
From: David Miller @ 2020-07-16  1:29 UTC (permalink / raw)
  To: kuba; +Cc: drt, netdev, tlfalcon, linuxppc-dev
In-Reply-To: <20200715170632.11f0bf19@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com>

From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 15 Jul 2020 17:06:32 -0700

> On Wed, 15 Jul 2020 18:51:55 -0500 Thomas Falcon wrote:
>>  	free_netdev(netdev);
>>  	dev_set_drvdata(&dev->dev, NULL);
>> +	netdev_info(netdev, "VNIC client device has been successfully removed.\n");
> 
> A step too far, perhaps.
> 
> In general this patch looks a little questionable IMHO, this amount of
> logging output is not commonly seen in drivers. All the the info
> messages are just static text, not even carrying any extra information.
> In an era of ftrace, and bpftrace, do we really need this?

Agreed, this is too much.  This is debugging, and thus suitable for tracing
facilities, at best.

^ permalink raw reply

* [PATCH v3] powerpc: select ARCH_HAS_MEMBARRIER_SYNC_CORE
From: Nicholas Piggin @ 2020-07-16  1:35 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: linux-arch, Andreas Schwab, Mathieu Desnoyers, Nicholas Piggin

powerpc return from interrupt and return from system call sequences are
context synchronising.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---

v3: more comment fixes

 .../features/sched/membarrier-sync-core/arch-support.txt  | 4 ++--
 arch/powerpc/Kconfig                                      | 1 +
 arch/powerpc/include/asm/exception-64e.h                  | 6 +++++-
 arch/powerpc/include/asm/exception-64s.h                  | 8 ++++++++
 arch/powerpc/kernel/entry_32.S                            | 6 ++++++
 5 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 8a521a622966..52ad74a25f54 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -5,7 +5,7 @@
 #
 # Architecture requirements
 #
-# * arm/arm64
+# * arm/arm64/powerpc
 #
 # Rely on implicit context synchronization as a result of exception return
 # when returning from IPI handler, and when returning to user-space.
@@ -45,7 +45,7 @@
     |       nios2: | TODO |
     |    openrisc: | TODO |
     |      parisc: | TODO |
-    |     powerpc: | TODO |
+    |     powerpc: |  ok  |
     |       riscv: | TODO |
     |        s390: | TODO |
     |          sh: | TODO |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9fa23eb320ff..920c4e3ca4ef 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -131,6 +131,7 @@ config PPC
 	select ARCH_HAS_PTE_DEVMAP		if PPC_BOOK3S_64
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
+	select ARCH_HAS_MEMBARRIER_SYNC_CORE
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 	select ARCH_HAS_STRICT_KERNEL_RWX	if (PPC32 && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index 54a98ef7d7fe..72b6657acd2d 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -204,7 +204,11 @@ exc_##label##_book3e:
 	LOAD_REG_ADDR(r3,interrupt_base_book3e);\
 	ori	r3,r3,vector_offset@l;		\
 	mtspr	SPRN_IVOR##vector_number,r3;
-
+/*
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional
+ * synchronisation instructions.
+ */
 #define RFI_TO_KERNEL							\
 	rfi
 
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 47bd4ea0837d..d7a1a427a690 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -68,6 +68,14 @@
  *
  * The nop instructions allow us to insert one or more instructions to flush the
  * L1-D cache when returning to userspace or a guest.
+ *
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which hrfid, rfid, and rfscv are) to support ARCH_HAS_MEMBARRIER_SYNC_CORE
+ * without additional synchronisation instructions.
+ *
+ * soft-masked interrupt replay does not include a context-synchronising rfid,
+ * but those always return to kernel, the sync is only required when returning
+ * to user.
  */
 #define RFI_FLUSH_SLOT							\
 	RFI_FLUSH_FIXUP_SECTION;					\
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 217ebdf5b00b..f4d0af8e1136 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -35,6 +35,12 @@
 
 #include "head_32.h"
 
+/*
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional
+ * synchronisation instructions.
+ */
+
 /*
  * Align to 4k in order to ensure that all functions modyfing srr0/srr1
  * fit into one page in order to not encounter a TLB miss between the
-- 
2.23.0


^ permalink raw reply related

* Re: [PATCH v3 09/12] ppc64/kexec_file: setup backup region for kdump kernel
From: Thiago Jung Bauermann @ 2020-07-16  1:38 UTC (permalink / raw)
  To: Hari Bathini
  Cc: kernel test robot, Pingfan Liu, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Petr Tesarik, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159466096898.24747.16701009925943468066.stgit@hbathini.in.ibm.com>


Hari Bathini <hbathini@linux.ibm.com> writes:

> @@ -968,7 +1040,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>
>  	/*
>  	 * Restrict memory usage for kdump kernel by setting up
> -	 * usable memory ranges.
> +	 * usable memory ranges and memory reserve map.
>  	 */
>  	if (image->type == KEXEC_TYPE_CRASH) {
>  		ret = get_usable_memory_ranges(&umem);
> @@ -980,6 +1052,24 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>  			pr_err("Error setting up usable-memory property for kdump kernel\n");
>  			goto out;
>  		}
> +
> +		ret = fdt_add_mem_rsv(fdt, BACKUP_SRC_START + BACKUP_SRC_SIZE,
> +				      crashk_res.start - BACKUP_SRC_SIZE);

I believe this answers my question from the other email about how the
crashkernel is prevented from stomping in the crashed kernel's memory,
right? I needed to think for a bit to understand what the above
reservation was protecting. I think it's worth adding a comment.

> +		if (ret) {
> +			pr_err("Error reserving crash memory: %s\n",
> +			       fdt_strerror(ret));
> +			goto out;
> +		}
> +	}
> +
> +	if (image->arch.backup_start) {
> +		ret = fdt_add_mem_rsv(fdt, image->arch.backup_start,
> +				      BACKUP_SRC_SIZE);
> +		if (ret) {
> +			pr_err("Error reserving memory for backup: %s\n",
> +			       fdt_strerror(ret));
> +			goto out;
> +		}
>  	}

This is only true for KEXEC_TYPE_CRASH, if I'm following the code
correctly. I think it would be clearer to put the if above inside the if
for KEXEC_TYPE_CRASH to make it clearer.

>
>  	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,

<snip>

> diff --git a/arch/powerpc/purgatory/purgatory_64.c b/arch/powerpc/purgatory/purgatory_64.c
> new file mode 100644
> index 0000000..1eca74c
> --- /dev/null
> +++ b/arch/powerpc/purgatory/purgatory_64.c
> @@ -0,0 +1,36 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * purgatory: Runs between two kernels
> + *
> + * Copyright 2020, Hari Bathini, IBM Corporation.
> + */
> +
> +#include <asm/purgatory.h>
> +#include <asm/crashdump-ppc64.h>
> +
> +extern unsigned long backup_start;
> +
> +static void *__memcpy(void *dest, const void *src, unsigned long n)
> +{
> +	unsigned long i;
> +	unsigned char *d;
> +	const unsigned char *s;
> +
> +	d = dest;
> +	s = src;
> +	for (i = 0; i < n; i++)
> +		d[i] = s[i];
> +
> +	return dest;
> +}
> +
> +void purgatory(void)
> +{
> +	void *dest, *src;
> +
> +	src = (void *)BACKUP_SRC_START;
> +	if (backup_start) {
> +		dest = (void *)backup_start;
> +		__memcpy(dest, src, BACKUP_SRC_SIZE);
> +	}
> +}

In general I'm in favor of using C code over assembly, but having to
bring in that relocation support just for the above makes me wonder if
it's worth it in this case.

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v3 08/12] ppc64/kexec_file: setup the stack for purgatory
From: Thiago Jung Bauermann @ 2020-07-16  1:40 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466095278.24747.9161591016931052627.stgit@hbathini.in.ibm.com>


Sorry, forgot to send one comment for this patch:

Hari Bathini <hbathini@linux.ibm.com> writes:

> @@ -898,10 +900,37 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
>  			goto out;
>  	}
>
> +	/* Setup the stack top */
> +	stack_buf = kexec_purgatory_get_symbol_addr(image, "stack_buf");
> +	if (!stack_buf)
> +		goto out;
> +
> +	val = (u64)stack_buf + KEXEC_PURGATORY_STACK_SIZE;
> +	ret = kexec_purgatory_get_set_symbol(image, "stack", &val, sizeof(val),
> +					     false);
> +	if (ret)
> +		goto out;
> +
>  	/* Setup the TOC pointer */
>  	val = get_toc_ptr(&(image->purgatory_info));
>  	ret = kexec_purgatory_get_set_symbol(image, "my_toc", &val, sizeof(val),
>  					     false);
> +	if (ret)
> +		goto out;
> +
> +	/* Setup OPAL base & entry values */
> +	dn = of_find_node_by_path("/ibm,opal");
> +	if (dn) {
> +		of_property_read_u64(dn, "opal-base-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_base", &val,
> +						     sizeof(val), false);
> +		if (ret)
> +			goto out;
> +
> +		of_property_read_u64(dn, "opal-entry-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_entry", &val,
> +						     sizeof(val), false);

You need to call of_node_put(dn) here and in the if (ret) case above.

> +	}
>  out:
>  	if (ret)
>  		pr_err("Failed to setup purgatory symbols");

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v3 02/12] powerpc/kexec_file: mark PPC64 specific code
From: Thiago Jung Bauermann @ 2020-07-16  1:49 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466085652.24747.2414199807974963385.stgit@hbathini.in.ibm.com>


I didn't forget about this patch. I just wanted to see more of the
changes before comenting on it.

Hari Bathini <hbathini@linux.ibm.com> writes:

> Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
> specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
> rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
> same spirit.

There's only a 64 bit implementation of kexec_file_load() so this is a
somewhat theoretical exercise, but there's no harm in getting the code
organized, so:

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

I have just one question below.

> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> Tested-by: Pingfan Liu <piliu@redhat.com>
> ---
>
> v2 -> v3:
> * Unchanged. Added Tested-by tag from Pingfan.
>
> v1 -> v2:
> * No changes.
>
>
>  arch/powerpc/include/asm/kexec.h       |   11 +++
>  arch/powerpc/kexec/Makefile            |    2 -
>  arch/powerpc/kexec/elf_64.c            |    7 +-
>  arch/powerpc/kexec/file_load.c         |   37 ++--------
>  arch/powerpc/kexec/file_load_64.c      |  108 ++++++++++++++++++++++++++++++
>  arch/powerpc/purgatory/Makefile        |    4 +
>  arch/powerpc/purgatory/trampoline.S    |  117 --------------------------------
>  arch/powerpc/purgatory/trampoline_64.S |  117 ++++++++++++++++++++++++++++++++
>  8 files changed, 248 insertions(+), 155 deletions(-)
>  create mode 100644 arch/powerpc/kexec/file_load_64.c
>  delete mode 100644 arch/powerpc/purgatory/trampoline.S
>  create mode 100644 arch/powerpc/purgatory/trampoline_64.S

<snip>

> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> new file mode 100644
> index 0000000..e6bff960
> --- /dev/null
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -0,0 +1,108 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * ppc64 code to implement the kexec_file_load syscall
> + *
> + * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
> + * Copyright (C) 2004  IBM Corp.
> + * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
> + * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
> + * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
> + * Copyright (C) 2020  IBM Corporation
> + *
> + * Based on kexec-tools' kexec-ppc64.c, kexec-elf-rel-ppc64.c, fs2dt.c.
> + * Heavily modified for the kernel by
> + * Hari Bathini <hbathini@linux.ibm.com>.
> + */
> +
> +#include <linux/kexec.h>
> +#include <linux/of_fdt.h>
> +#include <linux/libfdt.h>
> +
> +const struct kexec_file_ops * const kexec_file_loaders[] = {
> +	&kexec_elf64_ops,
> +	NULL
> +};
> +
> +/**
> + * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
> + *                         variables and call setup_purgatory() to initialize
> + *                         common global variable.
> + * @image:                 kexec image.
> + * @slave_code:            Slave code for the purgatory.
> + * @fdt:                   Flattened device tree for the next kernel.
> + * @kernel_load_addr:      Address where the kernel is loaded.
> + * @fdt_load_addr:         Address where the flattened device tree is loaded.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
> +			  const void *fdt, unsigned long kernel_load_addr,
> +			  unsigned long fdt_load_addr)
> +{
> +	int ret;
> +
> +	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
> +			      fdt_load_addr);
> +	if (ret)
> +		pr_err("Failed to setup purgatory symbols");
> +	return ret;
> +}
> +
> +/**
> + * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
> + *                       being loaded.
> + * @image:               kexec image being loaded.
> + * @fdt:                 Flattened device tree for the next kernel.
> + * @initrd_load_addr:    Address where the next initrd will be loaded.
> + * @initrd_len:          Size of the next initrd, or 0 if there will be none.
> + * @cmdline:             Command line for the next kernel, or NULL if there will
> + *                       be none.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
> +			unsigned long initrd_load_addr,
> +			unsigned long initrd_len, const char *cmdline)
> +{
> +	int chosen_node, ret;
> +
> +	/* Remove memory reservation for the current device tree. */
> +	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
> +				 fdt_totalsize(initial_boot_params));
> +	if (ret == 0)
> +		pr_debug("Removed old device tree reservation.\n");
> +	else if (ret != -ENOENT) {
> +		pr_err("Failed to remove old device-tree reservation.\n");
> +		return ret;
> +	}
> +
> +	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
> +			    cmdline, &chosen_node);
> +	if (ret)
> +		return ret;
> +
> +	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
> +	if (ret)
> +		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
> +
> +	return ret;
> +}

For setup_purgatory_ppc64() you start with an empty function and build
from there, but for setup_new_fdt_ppc64() you moved some code here. Is
the code above 64 bit specific?

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v3 10/12] ppc64/kexec_file: prepare elfcore header for crashing kernel
From: Thiago Jung Bauermann @ 2020-07-16  2:22 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466098739.24747.5860501703617893464.stgit@hbathini.in.ibm.com>


Hari Bathini <hbathini@linux.ibm.com> writes:

>  /**
> + * get_crash_memory_ranges - Get crash memory ranges. This list includes
> + *                           first/crashing kernel's memory regions that
> + *                           would be exported via an elfcore.
> + * @mem_ranges:              Range list to add the memory ranges to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
> +{
> +	struct memblock_region *reg;
> +	struct crash_mem *tmem;
> +	int ret;
> +
> +	for_each_memblock(memory, reg) {
> +		u64 base, size;
> +
> +		base = (u64)reg->base;
> +		size = (u64)reg->size;
> +
> +		/* Skip backup memory region, which needs a separate entry */
> +		if (base == BACKUP_SRC_START) {
> +			if (size > BACKUP_SRC_SIZE) {
> +				base = BACKUP_SRC_END + 1;
> +				size -= BACKUP_SRC_SIZE;
> +			} else
> +				continue;
> +		}
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +		if (ret)
> +			goto out;
> +
> +		/* Try merging adjacent ranges before reallocation attempt */
> +		if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges)
> +			sort_memory_ranges(*mem_ranges, true);
> +	}
> +
> +	/* Reallocate memory ranges if there is no space to split ranges */
> +	tmem = *mem_ranges;
> +	if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) {
> +		tmem = realloc_mem_ranges(mem_ranges);
> +		if (!tmem)
> +			goto out;
> +	}
> +
> +	/* Exclude crashkernel region */
> +	ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_rtas_mem_range(mem_ranges);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_opal_mem_range(mem_ranges);
> +	if (ret)
> +		goto out;

Maybe I'm confused, but don't you add the RTAS and OPAL regions as
usable memory for the crashkernel? In that case they shouldn't show up
in the core file.

> +
> +	/* create a separate program header for the backup region */
> +	ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE);
> +	if (ret)
> +		goto out;
> +
> +	sort_memory_ranges(*mem_ranges, false);
> +out:
> +	if (ret)
> +		pr_err("Failed to setup crash memory ranges\n");
> +	return ret;
> +}

<snip>

> +/**
> + * prepare_elf_headers - Prepare headers for the elfcore to be exported as
> + *                       /proc/vmcore by the kdump kernel.
> + * @image:               Kexec image.
> + * @cmem:                Crash memory ranges to be exported via elfcore.
> + * @addr:                Vmalloc'd memory allocated by crash_prepare_elf64_headers
> + *                       to prepare the elf headers.
> + * @sz:                  Size of the vmalloc'd memory allocated.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int prepare_elf_headers(struct kimage *image, struct crash_mem *cmem,
> +			       void **addr, unsigned long *sz)
> +{
> +	int ret;
> +
> +	ret = crash_prepare_elf64_headers(cmem, false, addr, sz);
> +
> +	/* Fix the offset for backup region in the ELF header */
> +	if (!ret)
> +		update_backup_region_phdr(image, *addr);
> +
> +	return ret;
> +}

The code above can be inlined into its caller, I don't see a need to
have a separate function.

> +
> +/**
> + * load_elfcorehdr_segment - Setup crash memory ranges and initialize elfcorehdr
> + *                           segment needed to load kdump kernel.
> + * @image:                   Kexec image.
> + * @kbuf:                    Buffer contents and memory parameters.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf)
> +{
> +	struct crash_mem *cmem = NULL;
> +	unsigned long headers_sz;
> +	void *headers = NULL;
> +	int ret;
> +
> +	ret = get_crash_memory_ranges(&cmem);
> +	if (ret)
> +		goto out;
> +
> +	/* Setup elfcorehdr segment */
> +	ret = prepare_elf_headers(image, cmem, &headers, &headers_sz);
> +	if (ret) {
> +		pr_err("Failed to prepare elf headers for the core\n");
> +		goto out;
> +	}
> +
> +	kbuf->buffer = headers;
> +	kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
> +	kbuf->bufsz = kbuf->memsz = headers_sz;
> +	kbuf->top_down = false;
> +
> +	ret = kexec_add_buffer(kbuf);
> +	if (ret) {
> +		vfree(headers);
> +		goto out;
> +	}
> +
> +	image->arch.elfcorehdr_addr = kbuf->mem;
> +	image->arch.elf_headers_sz = headers_sz;
> +	image->arch.elf_headers = headers;
> +out:
> +	kfree(cmem);
> +	return ret;
> +}

--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-16  2:26 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <6D3D1346-DB1E-43EB-812A-184918CCC16A@amacapital.net>

Excerpts from Andy Lutomirski's message of July 14, 2020 10:46 pm:
> 
> 
>> On Jul 13, 2020, at 11:31 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> Excerpts from Nicholas Piggin's message of July 14, 2020 3:04 pm:
>>> Excerpts from Andy Lutomirski's message of July 14, 2020 4:18 am:
>>>> 
>>>>> On Jul 13, 2020, at 9:48 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>> 
>>>>> Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
>>>>>>> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>>>> 
>>>>>>> On big systems, the mm refcount can become highly contented when doing
>>>>>>> a lot of context switching with threaded applications (particularly
>>>>>>> switching between the idle thread and an application thread).
>>>>>>> 
>>>>>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>>>>>> user->idle->user cases, so so instead implement a non-refcounted scheme
>>>>>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>>>>>> any remaining lazy ones.
>>>>>>> 
>>>>>>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>>>>>>> with as many software threads as CPUs (so each switch will go in and
>>>>>>> out of idle), upstream can achieve a rate of about 1 million context
>>>>>>> switches per second. After this patch it goes up to 118 million.
>>>>>>> 
>>>>>> 
>>>>>> I read the patch a couple of times, and I have a suggestion that could
>>>>>> be nonsense.  You are, effectively, using mm_cpumask() as a sort of
>>>>>> refcount.  You're saying "hey, this mm has no more references, but it
>>>>>> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
>>>>>> those references too."  I'm wondering whether you actually need the
>>>>>> IPI.  What if, instead, you actually treated mm_cpumask as a refcount
>>>>>> for real?  Roughly, in __mmdrop(), you would only free the page tables
>>>>>> if mm_cpumask() is empty.  And, in the code that removes a CPU from
>>>>>> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
>>>>>> you just removed the last bit from mm_cpumask and potentially free the
>>>>>> mm.
>>>>>> 
>>>>>> Getting the locking right here could be a bit tricky -- you need to
>>>>>> avoid two CPUs simultaneously exiting lazy TLB and thinking they
>>>>>> should free the mm, and you also need to avoid an mm with mm_users
>>>>>> hitting zero concurrently with the last remote CPU using it lazily
>>>>>> exiting lazy TLB.  Perhaps this could be resolved by having mm_count
>>>>>> == 1 mean "mm_cpumask() is might contain bits and, if so, it owns the
>>>>>> mm" and mm_count == 0 meaning "now it's dead" and using some careful
>>>>>> cmpxchg or dec_return to make sure that only one CPU frees it.
>>>>>> 
>>>>>> Or maybe you'd need a lock or RCU for this, but the idea would be to
>>>>>> only ever take the lock after mm_users goes to zero.
>>>>> 
>>>>> I don't think it's nonsense, it could be a good way to avoid IPIs.
>>>>> 
>>>>> I haven't seen much problem here that made me too concerned about IPIs 
>>>>> yet, so I think the simple patch may be good enough to start with
>>>>> for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
>>>>> unlazying with the exit TLB flush without doing anything fancy with
>>>>> ref counting, but we'll see.
>>>> 
>>>> I would be cautious with benchmarking here. I would expect that the
>>>> nasty cases may affect power consumption more than performance — the 
>>>> specific issue is IPIs hitting idle cores, and the main effects are to 
>>>> slow down exit() a bit but also to kick the idle core out of idle. 
>>>> Although, if the idle core is in a deep sleep, that IPI could be 
>>>> *very* slow.
>>> 
>>> It will tend to be self-limiting to some degree (deeper idle cores
>>> would tend to have less chance of IPI) but we have bigger issues on
>>> powerpc with that, like broadcast IPIs to the mm cpumask for THP
>>> management. Power hasn't really shown up as an issue but powerpc
>>> CPUs may have their own requirements and issues there, shall we say.
>>> 
>>>> So I think it’s worth at least giving this a try.
>>> 
>>> To be clear it's not a complete solution itself. The problem is of 
>>> course that mm cpumask gives you false negatives, so the bits
>>> won't always clean up after themselves as CPUs switch away from their
>>> lazy tlb mms.
>> 
>> ^^
>> 
>> False positives: CPU is in the mm_cpumask, but is not using the mm
>> as a lazy tlb. So there can be bits left and never freed.
>> 
>> If you closed the false positives, you're back to a shared mm cache
>> line on lazy mm context switches.
> 
> x86 has this exact problem. At least no more than 64*8 CPUs share the cache line :)
> 
> Can your share your benchmark?

It's just context_switch1_threads from will-it-scale, running with 1/2
the number of CPUs, and core affinity with an SMT processor (so each
thread from the switching pairs gets spread to their own CPU and so you
get the task->idle->task switching.

It's really just about the worst case, so I wouldn't say it's something
to panic about.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v3 11/12] ppc64/kexec_file: add appropriate regions for memory reserve map
From: Thiago Jung Bauermann @ 2020-07-16  2:27 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466100206.24747.3782313430191321863.stgit@hbathini.in.ibm.com>


Hari Bathini <hbathini@linux.ibm.com> writes:

> While initrd, elfcorehdr and backup regions are already added to the
> reserve map, there are a few missing regions that need to be added to
> the memory reserve map. Add them here. And now that all the changes
> to load panic kernel are in place, claim likewise.
>
> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> Tested-by: Pingfan Liu <piliu@redhat.com>

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

Just one oinor nit below.

> ---
>
> v2 -> v3:
> * Unchanged. Added Tested-by tag from Pingfan.
>
> v1 -> v2:
> * Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
>   the new prototype for these functions.
>
>
>  arch/powerpc/kexec/file_load_64.c |   58 ++++++++++++++++++++++++++++++++++---
>  1 file changed, 53 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index 2531bb5..29e5d11 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -193,6 +193,34 @@ static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
>  }
>  
>  /**
> + * get_reserved_memory_ranges - Get reserve memory ranges. This list includes
> + *                              memory regions that should be added to the
> + *                              memory reserve map to ensure the region is
> + *                              protected from any mischeif.

s/mischeif/mischief/

> + * @mem_ranges:                 Range list to add the memory ranges to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int get_reserved_memory_ranges(struct crash_mem **mem_ranges)
> +{
> +	int ret;
> +
> +	ret = add_rtas_mem_range(mem_ranges);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_tce_mem_ranges(mem_ranges);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_reserved_ranges(mem_ranges);
> +out:
> +	if (ret)
> +		pr_err("Failed to setup reserved memory ranges\n");
> +	return ret;
> +}

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-16  2:35 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <6D3D1346-DB1E-43EB-812A-184918CCC16A@amacapital.net>

Excerpts from Andy Lutomirski's message of July 14, 2020 10:46 pm:
> 
> 
>> On Jul 13, 2020, at 11:31 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> Excerpts from Nicholas Piggin's message of July 14, 2020 3:04 pm:
>>> Excerpts from Andy Lutomirski's message of July 14, 2020 4:18 am:
>>>> 
>>>>> On Jul 13, 2020, at 9:48 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>> 
>>>>> Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
>>>>>>> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>>>> 
>>>>>>> On big systems, the mm refcount can become highly contented when doing
>>>>>>> a lot of context switching with threaded applications (particularly
>>>>>>> switching between the idle thread and an application thread).
>>>>>>> 
>>>>>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>>>>>> user->idle->user cases, so so instead implement a non-refcounted scheme
>>>>>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>>>>>> any remaining lazy ones.
>>>>>>> 
>>>>>>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>>>>>>> with as many software threads as CPUs (so each switch will go in and
>>>>>>> out of idle), upstream can achieve a rate of about 1 million context
>>>>>>> switches per second. After this patch it goes up to 118 million.
>>>>>>> 
>>>>>> 
>>>>>> I read the patch a couple of times, and I have a suggestion that could
>>>>>> be nonsense.  You are, effectively, using mm_cpumask() as a sort of
>>>>>> refcount.  You're saying "hey, this mm has no more references, but it
>>>>>> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
>>>>>> those references too."  I'm wondering whether you actually need the
>>>>>> IPI.  What if, instead, you actually treated mm_cpumask as a refcount
>>>>>> for real?  Roughly, in __mmdrop(), you would only free the page tables
>>>>>> if mm_cpumask() is empty.  And, in the code that removes a CPU from
>>>>>> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
>>>>>> you just removed the last bit from mm_cpumask and potentially free the
>>>>>> mm.
>>>>>> 
>>>>>> Getting the locking right here could be a bit tricky -- you need to
>>>>>> avoid two CPUs simultaneously exiting lazy TLB and thinking they
>>>>>> should free the mm, and you also need to avoid an mm with mm_users
>>>>>> hitting zero concurrently with the last remote CPU using it lazily
>>>>>> exiting lazy TLB.  Perhaps this could be resolved by having mm_count
>>>>>> == 1 mean "mm_cpumask() is might contain bits and, if so, it owns the
>>>>>> mm" and mm_count == 0 meaning "now it's dead" and using some careful
>>>>>> cmpxchg or dec_return to make sure that only one CPU frees it.
>>>>>> 
>>>>>> Or maybe you'd need a lock or RCU for this, but the idea would be to
>>>>>> only ever take the lock after mm_users goes to zero.
>>>>> 
>>>>> I don't think it's nonsense, it could be a good way to avoid IPIs.
>>>>> 
>>>>> I haven't seen much problem here that made me too concerned about IPIs 
>>>>> yet, so I think the simple patch may be good enough to start with
>>>>> for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
>>>>> unlazying with the exit TLB flush without doing anything fancy with
>>>>> ref counting, but we'll see.
>>>> 
>>>> I would be cautious with benchmarking here. I would expect that the
>>>> nasty cases may affect power consumption more than performance — the 
>>>> specific issue is IPIs hitting idle cores, and the main effects are to 
>>>> slow down exit() a bit but also to kick the idle core out of idle. 
>>>> Although, if the idle core is in a deep sleep, that IPI could be 
>>>> *very* slow.
>>> 
>>> It will tend to be self-limiting to some degree (deeper idle cores
>>> would tend to have less chance of IPI) but we have bigger issues on
>>> powerpc with that, like broadcast IPIs to the mm cpumask for THP
>>> management. Power hasn't really shown up as an issue but powerpc
>>> CPUs may have their own requirements and issues there, shall we say.
>>> 
>>>> So I think it’s worth at least giving this a try.
>>> 
>>> To be clear it's not a complete solution itself. The problem is of 
>>> course that mm cpumask gives you false negatives, so the bits
>>> won't always clean up after themselves as CPUs switch away from their
>>> lazy tlb mms.
>> 
>> ^^
>> 
>> False positives: CPU is in the mm_cpumask, but is not using the mm
>> as a lazy tlb. So there can be bits left and never freed.
>> 
>> If you closed the false positives, you're back to a shared mm cache
>> line on lazy mm context switches.
> 
> x86 has this exact problem. At least no more than 64*8 CPUs share the cache line :)
> 
> Can your share your benchmark?

Just testing the IPI rates (on a smaller 176 CPU system), on a
kernel compile, it causes about 300 shootdown interrupts (not
300 broadcasts but total interrupts).

And very short lived fork;exec;exit things like typical scripting
commands doesn't typically generate any.

So yeah the really high exit rate things self-limit pretty well.

I documented the concern and added a few of the possible ways to
further reduce IPIs in the comments though.

Thanks,
Nick

^ permalink raw reply

* [powerpc:next-test] BUILD SUCCESS 085563a9059b42c635151e970fd3af941f46a0fd
From: kernel test robot @ 2020-07-16  2:52 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next-test
branch HEAD: 085563a9059b42c635151e970fd3af941f46a0fd  powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc

elapsed time: 735m

configs tested: 88
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                               allnoconfig
arm64                            allyesconfig
arm64                               defconfig
arm64                            allmodconfig
arm64                             allnoconfig
arc                          axs101_defconfig
c6x                        evmc6457_defconfig
sh                 kfr2r09-romimage_defconfig
powerpc                    gamecube_defconfig
arm                          lpd270_defconfig
arm                        clps711x_defconfig
arm                           corgi_defconfig
riscv                            allyesconfig
arm                         orion5x_defconfig
i386                              allnoconfig
i386                             allyesconfig
i386                                defconfig
i386                              debian-10.3
ia64                             allmodconfig
ia64                                defconfig
ia64                              allnoconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                              allnoconfig
m68k                           sun3_defconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
nios2                            allyesconfig
openrisc                            defconfig
c6x                              allyesconfig
c6x                               allnoconfig
openrisc                         allyesconfig
nds32                               defconfig
nds32                             allnoconfig
csky                             allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
h8300                            allmodconfig
xtensa                              defconfig
arc                                 defconfig
arc                              allyesconfig
sh                               allmodconfig
sh                                allnoconfig
microblaze                        allnoconfig
mips                             allyesconfig
mips                              allnoconfig
mips                             allmodconfig
parisc                            allnoconfig
parisc                              defconfig
parisc                           allyesconfig
parisc                           allmodconfig
powerpc                          allyesconfig
powerpc                          rhel-kconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
powerpc                             defconfig
i386                 randconfig-a016-20200715
i386                 randconfig-a011-20200715
i386                 randconfig-a015-20200715
i386                 randconfig-a012-20200715
i386                 randconfig-a013-20200715
i386                 randconfig-a014-20200715
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
s390                             allyesconfig
s390                              allnoconfig
s390                             allmodconfig
s390                                defconfig
sparc                            allyesconfig
sparc                               defconfig
sparc64                             defconfig
sparc64                           allnoconfig
sparc64                          allyesconfig
sparc64                          allmodconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec
x86_64                                   rhel
x86_64                                    lkp
x86_64                              fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:next] BUILD SUCCESS f88223979044bfae32cb16f814c43739e5ba1301
From: kernel test robot @ 2020-07-16  2:52 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next
branch HEAD: f88223979044bfae32cb16f814c43739e5ba1301  powerpc: re-initialise lazy FPU/VEC counters on every fault

elapsed time: 724m

configs tested: 100
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                               allnoconfig
arm64                            allyesconfig
arm64                               defconfig
arm64                            allmodconfig
arm64                             allnoconfig
arc                          axs101_defconfig
c6x                        evmc6457_defconfig
sh                 kfr2r09-romimage_defconfig
powerpc                    gamecube_defconfig
arm                          lpd270_defconfig
arm                        clps711x_defconfig
arm                           corgi_defconfig
riscv                            allyesconfig
arm                         orion5x_defconfig
i386                              allnoconfig
i386                             allyesconfig
i386                                defconfig
i386                              debian-10.3
ia64                             allmodconfig
ia64                                defconfig
ia64                              allnoconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                              allnoconfig
m68k                           sun3_defconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
nios2                            allyesconfig
openrisc                            defconfig
c6x                              allyesconfig
c6x                               allnoconfig
openrisc                         allyesconfig
nds32                               defconfig
nds32                             allnoconfig
csky                             allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
h8300                            allmodconfig
xtensa                              defconfig
arc                                 defconfig
arc                              allyesconfig
sh                               allmodconfig
sh                                allnoconfig
microblaze                        allnoconfig
mips                             allyesconfig
mips                              allnoconfig
mips                             allmodconfig
parisc                            allnoconfig
parisc                              defconfig
parisc                           allyesconfig
parisc                           allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          rhel-kconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
i386                 randconfig-a001-20200715
i386                 randconfig-a005-20200715
i386                 randconfig-a002-20200715
i386                 randconfig-a006-20200715
i386                 randconfig-a003-20200715
i386                 randconfig-a004-20200715
x86_64               randconfig-a005-20200715
x86_64               randconfig-a006-20200715
x86_64               randconfig-a002-20200715
x86_64               randconfig-a001-20200715
x86_64               randconfig-a003-20200715
x86_64               randconfig-a004-20200715
i386                 randconfig-a016-20200715
i386                 randconfig-a011-20200715
i386                 randconfig-a015-20200715
i386                 randconfig-a012-20200715
i386                 randconfig-a013-20200715
i386                 randconfig-a014-20200715
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
s390                             allyesconfig
s390                              allnoconfig
s390                             allmodconfig
s390                                defconfig
sparc                            allyesconfig
sparc                               defconfig
sparc64                             defconfig
sparc64                           allnoconfig
sparc64                          allyesconfig
sparc64                          allmodconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec
x86_64                                   rhel
x86_64                                    lkp
x86_64                              fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:fixes-test] BUILD SUCCESS f0479c4bcbd92d1a457d4a43bcab79f29d11334a
From: kernel test robot @ 2020-07-16  2:52 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  fixes-test
branch HEAD: f0479c4bcbd92d1a457d4a43bcab79f29d11334a  selftests/powerpc: Use proper error code to check fault address

elapsed time: 737m

configs tested: 100
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm                              allyesconfig
arm                              allmodconfig
arm                               allnoconfig
arm64                            allyesconfig
arm64                               defconfig
arm64                            allmodconfig
arm64                             allnoconfig
arc                          axs101_defconfig
c6x                        evmc6457_defconfig
sh                 kfr2r09-romimage_defconfig
powerpc                    gamecube_defconfig
arm                          lpd270_defconfig
arm                        clps711x_defconfig
arm                           corgi_defconfig
riscv                            allyesconfig
arm                         orion5x_defconfig
i386                              allnoconfig
i386                             allyesconfig
i386                                defconfig
i386                              debian-10.3
ia64                             allmodconfig
ia64                                defconfig
ia64                              allnoconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                              allnoconfig
m68k                           sun3_defconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
nios2                            allyesconfig
openrisc                            defconfig
c6x                              allyesconfig
c6x                               allnoconfig
openrisc                         allyesconfig
nds32                               defconfig
nds32                             allnoconfig
csky                             allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
h8300                            allmodconfig
xtensa                              defconfig
arc                                 defconfig
arc                              allyesconfig
sh                               allmodconfig
sh                                allnoconfig
microblaze                        allnoconfig
mips                             allyesconfig
mips                              allnoconfig
mips                             allmodconfig
parisc                            allnoconfig
parisc                              defconfig
parisc                           allyesconfig
parisc                           allmodconfig
powerpc                          allyesconfig
powerpc                          rhel-kconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
powerpc                             defconfig
i386                 randconfig-a001-20200715
i386                 randconfig-a005-20200715
i386                 randconfig-a002-20200715
i386                 randconfig-a006-20200715
i386                 randconfig-a003-20200715
i386                 randconfig-a004-20200715
x86_64               randconfig-a005-20200715
x86_64               randconfig-a006-20200715
x86_64               randconfig-a002-20200715
x86_64               randconfig-a001-20200715
x86_64               randconfig-a003-20200715
x86_64               randconfig-a004-20200715
i386                 randconfig-a016-20200715
i386                 randconfig-a011-20200715
i386                 randconfig-a015-20200715
i386                 randconfig-a012-20200715
i386                 randconfig-a013-20200715
i386                 randconfig-a014-20200715
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
s390                             allyesconfig
s390                              allnoconfig
s390                             allmodconfig
s390                                defconfig
sparc                            allyesconfig
sparc                               defconfig
sparc64                             defconfig
sparc64                           allnoconfig
sparc64                          allyesconfig
sparc64                          allmodconfig
x86_64                    rhel-7.6-kselftests
x86_64                               rhel-8.3
x86_64                                  kexec
x86_64                                   rhel
x86_64                                    lkp
x86_64                              fedora-25

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
From: Michael Ellerman @ 2020-07-16  2:59 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras, nathanl
  Cc: linux-arch, arnd, linux-kernel, luto, tglx, vincenzo.frascino,
	linuxppc-dev
In-Reply-To: <0d2201efe3c7727f2acc718aefd7c5bb22c66c57.1588079622.git.christophe.leroy@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> The VDSO datapage and the text pages are always located immediately
> next to each other, so it can be hardcoded without an indirection
> through __kernel_datapage_offset
>
> In order to ease things, move the data page in front like other
> arches, that way there is no need to know the size of the library
> to locate the data page.
>
> Before:
> clock-getres-realtime-coarse:    vdso: 714 nsec/call
> clock-gettime-realtime-coarse:    vdso: 792 nsec/call
> clock-gettime-realtime:    vdso: 1243 nsec/call
>
> After:
> clock-getres-realtime-coarse:    vdso: 699 nsec/call
> clock-gettime-realtime-coarse:    vdso: 784 nsec/call
> clock-gettime-realtime:    vdso: 1231 nsec/call
>
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
> v7:
> - Moved the removal of the tmp param of __get_datapage()
> in a subsequent patch
> - Included the addition of the offset param to __get_datapage()
> in the further preparatory patch
> ---
>  arch/powerpc/include/asm/vdso_datapage.h |  8 ++--
>  arch/powerpc/kernel/vdso.c               | 53 ++++--------------------
>  arch/powerpc/kernel/vdso32/datapage.S    |  3 --
>  arch/powerpc/kernel/vdso32/vdso32.lds.S  |  7 +---
>  arch/powerpc/kernel/vdso64/datapage.S    |  3 --
>  arch/powerpc/kernel/vdso64/vdso64.lds.S  |  7 +---
>  6 files changed, 16 insertions(+), 65 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
> index b9ef6cf50ea5..11886467dfdf 100644
> --- a/arch/powerpc/include/asm/vdso_datapage.h
> +++ b/arch/powerpc/include/asm/vdso_datapage.h
> @@ -118,10 +118,12 @@ extern struct vdso_data *vdso_data;
>  
>  .macro get_datapage ptr, tmp
>  	bcl	20, 31, .+4
> +999:
>  	mflr	\ptr
> -	addi	\ptr, \ptr, (__kernel_datapage_offset - (.-4))@l
> -	lwz	\tmp, 0(\ptr)
> -	add	\ptr, \tmp, \ptr
> +#if CONFIG_PPC_PAGE_SHIFT > 14
> +	addis	\ptr, \ptr, (_vdso_datapage - 999b)@ha
> +#endif
> +	addi	\ptr, \ptr, (_vdso_datapage - 999b)@l
>  .endm
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
> index f38f26e844b6..d33fa22ddbed 100644
> --- a/arch/powerpc/kernel/vdso.c
> +++ b/arch/powerpc/kernel/vdso.c
> @@ -190,7 +190,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
>  	 * install_special_mapping or the perf counter mmap tracking code
>  	 * will fail to recognise it as a vDSO (since arch_vma_name fails).
>  	 */
> -	current->mm->context.vdso_base = vdso_base;
> +	current->mm->context.vdso_base = vdso_base + PAGE_SIZE;

I merged this but then realised it breaks the display of the vdso in /proc/self/maps.

ie. the vdso vma gets no name:

  # cat /proc/self/maps
  110f90000-110fa0000 r-xp 00000000 08:03 17021844                         /usr/bin/cat
  110fa0000-110fb0000 r--p 00000000 08:03 17021844                         /usr/bin/cat
  110fb0000-110fc0000 rw-p 00010000 08:03 17021844                         /usr/bin/cat
  126000000-126030000 rw-p 00000000 00:00 0                                [heap]
  7fffa8790000-7fffa87d0000 rw-p 00000000 00:00 0 
  7fffa87d0000-7fffa8830000 r--p 00000000 08:03 17521786                   /usr/lib/locale/en_AU.utf8/LC_CTYPE
  7fffa8830000-7fffa8840000 r--p 00000000 08:03 16958337                   /usr/lib/locale/en_AU.utf8/LC_NUMERIC
  7fffa8840000-7fffa8850000 r--p 00000000 08:03 8501358                    /usr/lib/locale/en_AU.utf8/LC_TIME
  7fffa8850000-7fffa8ad0000 r--p 00000000 08:03 16870886                   /usr/lib/locale/en_AU.utf8/LC_COLLATE
  7fffa8ad0000-7fffa8ae0000 r--p 00000000 08:03 8509433                    /usr/lib/locale/en_AU.utf8/LC_MONETARY
  7fffa8ae0000-7fffa8af0000 r--p 00000000 08:03 25383753                   /usr/lib/locale/en_AU.utf8/LC_MESSAGES/SYS_LC_MESSAGES
  7fffa8af0000-7fffa8b00000 r--p 00000000 08:03 17521790                   /usr/lib/locale/en_AU.utf8/LC_PAPER
  7fffa8b00000-7fffa8b10000 r--p 00000000 08:03 8501354                    /usr/lib/locale/en_AU.utf8/LC_NAME
  7fffa8b10000-7fffa8b20000 r--p 00000000 08:03 8509431                    /usr/lib/locale/en_AU.utf8/LC_ADDRESS
  7fffa8b20000-7fffa8b30000 r--p 00000000 08:03 8509434                    /usr/lib/locale/en_AU.utf8/LC_TELEPHONE
  7fffa8b30000-7fffa8b40000 r--p 00000000 08:03 17521787                   /usr/lib/locale/en_AU.utf8/LC_MEASUREMENT
  7fffa8b40000-7fffa8b50000 r--s 00000000 08:03 25623315                   /usr/lib64/gconv/gconv-modules.cache
  7fffa8b50000-7fffa8d40000 r-xp 00000000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d40000-7fffa8d50000 r--p 001e0000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d50000-7fffa8d60000 rw-p 001f0000 08:03 25383789                   /usr/lib64/libc-2.30.so
  7fffa8d60000-7fffa8d70000 r--p 00000000 08:03 8509432                    /usr/lib/locale/en_AU.utf8/LC_IDENTIFICATION
  7fffa8d70000-7fffa8d90000 r-xp 00000000 00:00 0						<--- missing
  7fffa8d90000-7fffa8dc0000 r-xp 00000000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffa8dc0000-7fffa8dd0000 r--p 00020000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffa8dd0000-7fffa8de0000 rw-p 00030000 08:03 25383781                   /usr/lib64/ld-2.30.so
  7fffc31c0000-7fffc31f0000 rw-p 00000000 00:00 0                          [stack]


And it's also going to break the logic in arch_unmap() to detect if
we're unmapping (part of) the VDSO. And it will break arch_remap() too.

And the logic to recognise the signal trampoline in
arch/powerpc/perf/callchain_*.c as well.

So I'm going to rebase and drop this for now.

Basically we have a bunch of places that assume that vdso_base is == the
start of the VDSO vma, and also that the code starts there. So that will
need some work to tease out all those assumptions and make them work
with this change.

cheers

^ permalink raw reply

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-16  4:15 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
	linux-mm, Andy Lutomirski, linuxppc-dev
In-Reply-To: <284592761.9860.1594649601492.JavaMail.zimbra@efficios.com>

Excerpts from Mathieu Desnoyers's message of July 14, 2020 12:13 am:
> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
> 
>> Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>>> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>>>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>>>> serialize.  There are older kernels for which it does not promise to
>>>> serialize.  And I have plans to make it stop serializing in the
>>>> nearish future.
>>> 
>>> You mean x86's return from interrupt? Sounds fun... you'll konw where to
>>> update the membarrier sync code, at least :)
>> 
>> Oh, I should actually say Mathieu recently clarified a return from
>> interrupt doesn't fundamentally need to serialize in order to support
>> membarrier sync core.
> 
> Clarification to your statement:
> 
> Return from interrupt to kernel code does not need to be context serializing
> as long as kernel serializes before returning to user-space.
> 
> However, return from interrupt to user-space needs to be context serializing.

Hmm, I'm not sure it's enough even with the sync in the exit_lazy_tlb
in the right places.

A kernel thread does a use_mm, then it blocks and the user process with
the same mm runs on that CPU, and then it calls into the kernel, blocks,
the kernel thread runs again, another CPU issues a membarrier which does
not IPI this one because it's running a kthread, and then the kthread
switches back to the user process (still without having unused the mm),
and then the user process returns from syscall without having done a 
core synchronising instruction.

The cause of the problem is you want to avoid IPI'ing kthreads. Why?
I'm guessing it really only matters as an optimisation in case of idle
threads. Idle thread is easy (well, easier) because it won't use_mm, so 
you could check for rq->curr == rq->idle in your loop (in a suitable 
sched accessor function).

But... I'm not really liking this subtlety in the scheduler for all this 
(the scheduler still needs the barriers when switching out of idle).

Can it be improved somehow? Let me forget x86 core sync problem for now
(that _may_ be a bit harder), and step back and look at what we're doing.
The memory barrier case would actually suffer from the same problem as
core sync, because in the same situation it has no implicit mmdrop in
the scheduler switch code either.

So what are we doing with membarrier? We want any activity caused by the 
set of CPUs/threads specified that can be observed by this thread before 
calling membarrier is appropriately fenced from activity that can be 
observed to happen after the call returns.

CPU0                     CPU1
                         1. user stuff
a. membarrier()          2. enter kernel
b. read rq->curr         3. rq->curr switched to kthread
c. is kthread, skip IPI  4. switch_to kthread
d. return to user        5. rq->curr switched to user thread
		         6. switch_to user thread
		         7. exit kernel
                         8. more user stuff

As far as I can see, the problem is CPU1 might reorder step 5 and step
8, so you have mmdrop of lazy mm be a mb after step 6.

But why? The membarrier call only cares that there is a full barrier
between 1 and 8, right? Which it will get from the previous context
switch to the kthread.

I must say the memory barrier comments in membarrier could be improved
a bit (unless I'm missing where the main comment is). It's fine to know
what barriers pair with one another, but we need to know which exact
memory accesses it is ordering

       /*
         * Matches memory barriers around rq->curr modification in
         * scheduler.
         */

Sure, but it doesn't say what else is being ordered. I think it's just
the user memory accesses, but would be nice to make that a bit more
explicit. If we had such comments then we might know this case is safe.

I think the funny powerpc barrier is a similar case of this. If we
ever see remote_rq->curr->flags & PF_KTHREAD, then we _know_ that
CPU has or will have issued a memory barrier between running user
code.

So AFAIKS all this membarrier stuff in kernel/sched/core.c could
just go away. Except x86 because thread switch doesn't imply core
sync, so CPU1 between 1 and 8 may never issue a core sync instruction
the same way a context switch must be a full mb.

Before getting to x86 -- Am I right, or way off track here? 

Thanks,
Nick

^ permalink raw reply

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-16  4:42 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
	linux-mm, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594868476.6k5kvx8684.astroid@bobo.none>

Excerpts from Nicholas Piggin's message of July 16, 2020 2:15 pm:
> Excerpts from Mathieu Desnoyers's message of July 14, 2020 12:13 am:
>> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
>> 
>>> Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>>>> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>>>>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>>>>> serialize.  There are older kernels for which it does not promise to
>>>>> serialize.  And I have plans to make it stop serializing in the
>>>>> nearish future.
>>>> 
>>>> You mean x86's return from interrupt? Sounds fun... you'll konw where to
>>>> update the membarrier sync code, at least :)
>>> 
>>> Oh, I should actually say Mathieu recently clarified a return from
>>> interrupt doesn't fundamentally need to serialize in order to support
>>> membarrier sync core.
>> 
>> Clarification to your statement:
>> 
>> Return from interrupt to kernel code does not need to be context serializing
>> as long as kernel serializes before returning to user-space.
>> 
>> However, return from interrupt to user-space needs to be context serializing.
> 
> Hmm, I'm not sure it's enough even with the sync in the exit_lazy_tlb
> in the right places.
> 
> A kernel thread does a use_mm, then it blocks and the user process with
> the same mm runs on that CPU, and then it calls into the kernel, blocks,
> the kernel thread runs again, another CPU issues a membarrier which does
> not IPI this one because it's running a kthread, and then the kthread
> switches back to the user process (still without having unused the mm),
> and then the user process returns from syscall without having done a 
> core synchronising instruction.
> 
> The cause of the problem is you want to avoid IPI'ing kthreads. Why?
> I'm guessing it really only matters as an optimisation in case of idle
> threads. Idle thread is easy (well, easier) because it won't use_mm, so 
> you could check for rq->curr == rq->idle in your loop (in a suitable 
> sched accessor function).
> 
> But... I'm not really liking this subtlety in the scheduler for all this 
> (the scheduler still needs the barriers when switching out of idle).
> 
> Can it be improved somehow? Let me forget x86 core sync problem for now
> (that _may_ be a bit harder), and step back and look at what we're doing.
> The memory barrier case would actually suffer from the same problem as
> core sync, because in the same situation it has no implicit mmdrop in
> the scheduler switch code either.
> 
> So what are we doing with membarrier? We want any activity caused by the 
> set of CPUs/threads specified that can be observed by this thread before 
> calling membarrier is appropriately fenced from activity that can be 
> observed to happen after the call returns.
> 
> CPU0                     CPU1
>                          1. user stuff
> a. membarrier()          2. enter kernel
> b. read rq->curr         3. rq->curr switched to kthread
> c. is kthread, skip IPI  4. switch_to kthread
> d. return to user        5. rq->curr switched to user thread
> 		         6. switch_to user thread
> 		         7. exit kernel
>                          8. more user stuff
> 
> As far as I can see, the problem is CPU1 might reorder step 5 and step
> 8, so you have mmdrop of lazy mm be a mb after step 6.
> 
> But why? The membarrier call only cares that there is a full barrier
> between 1 and 8, right? Which it will get from the previous context
> switch to the kthread.

I should be more complete here, especially since I was complaining
about unclear barrier comment :)


CPU0                     CPU1
a. user stuff            1. user stuff
b. membarrier()          2. enter kernel
c. smp_mb()              3. smp_mb__after_spinlock(); // in __schedule
d. read rq->curr         4. rq->curr switched to kthread
e. is kthread, skip IPI  5. switch_to kthread
f. return to user        6. rq->curr switched to user thread
g. user stuff            7. switch_to user thread
                         8. exit kernel
                         9. more user stuff

What you're really ordering is a, g vs 1, 9 right?

In other words, 9 must see a if it sees g, g must see 1 if it saw 9,
etc.

Userspace does not care where the barriers are exactly or what kernel 
memory accesses might be being ordered by them, so long as there is a
mb somewhere between a and g, and 1 and 9. Right?

^ permalink raw reply

* Re: [PATCH v3] powerpc/pseries: detect secure and trusted boot state of the system.
From: Michael Ellerman @ 2020-07-16  4:53 UTC (permalink / raw)
  To: Daniel Axtens, Nayna Jain, linuxppc-dev
  Cc: Nayna Jain, linux-kernel, Mimi Zohar
In-Reply-To: <87v9iothc1.fsf@dja-thinkpad.axtens.net>

Daniel Axtens <dja@axtens.net> writes:
> Hi Nayna,
>
> Looks good to me.
>
> Sorry for not noticing this before, but I think
>> +#include <asm/machdep.h>

> is now superfluous (I think it's leftover from the machine_is
> version?). Maybe mpe will take pity on you and remove it when he picks
> up your patch.

Yeah I did that.

cheers

^ permalink raw reply

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Andy Lutomirski @ 2020-07-16  5:18 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
	linux-mm, Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594868476.6k5kvx8684.astroid@bobo.none>



> On Jul 15, 2020, at 9:15 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> Excerpts from Mathieu Desnoyers's message of July 14, 2020 12:13 am:
>> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
>> 
>>> Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>>>> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>>>>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>>>>> serialize.  There are older kernels for which it does not promise to
>>>>> serialize.  And I have plans to make it stop serializing in the
>>>>> nearish future.
>>>> 
>>>> You mean x86's return from interrupt? Sounds fun... you'll konw where to
>>>> update the membarrier sync code, at least :)
>>> 
>>> Oh, I should actually say Mathieu recently clarified a return from
>>> interrupt doesn't fundamentally need to serialize in order to support
>>> membarrier sync core.
>> 
>> Clarification to your statement:
>> 
>> Return from interrupt to kernel code does not need to be context serializing
>> as long as kernel serializes before returning to user-space.
>> 
>> However, return from interrupt to user-space needs to be context serializing.
> 
> Hmm, I'm not sure it's enough even with the sync in the exit_lazy_tlb
> in the right places.
> 
> A kernel thread does a use_mm, then it blocks and the user process with
> the same mm runs on that CPU, and then it calls into the kernel, blocks,
> the kernel thread runs again, another CPU issues a membarrier which does
> not IPI this one because it's running a kthread, and then the kthread
> switches back to the user process (still without having unused the mm),
> and then the user process returns from syscall without having done a 
> core synchronising instruction.
> 
> The cause of the problem is you want to avoid IPI'ing kthreads. Why?
> I'm guessing it really only matters as an optimisation in case of idle
> threads. Idle thread is easy (well, easier) because it won't use_mm, so 
> you could check for rq->curr == rq->idle in your loop (in a suitable 
> sched accessor function).
> 
> But... I'm not really liking this subtlety in the scheduler for all this 
> (the scheduler still needs the barriers when switching out of idle).
> 
> Can it be improved somehow? Let me forget x86 core sync problem for now
> (that _may_ be a bit harder), and step back and look at what we're doing.
> The memory barrier case would actually suffer from the same problem as
> core sync, because in the same situation it has no implicit mmdrop in
> the scheduler switch code either.
> 
> So what are we doing with membarrier? We want any activity caused by the 
> set of CPUs/threads specified that can be observed by this thread before 
> calling membarrier is appropriately fenced from activity that can be 
> observed to happen after the call returns.
> 
> CPU0                     CPU1
>                         1. user stuff
> a. membarrier()          2. enter kernel
> b. read rq->curr         3. rq->curr switched to kthread
> c. is kthread, skip IPI  4. switch_to kthread
> d. return to user        5. rq->curr switched to user thread
>                 6. switch_to user thread
>                 7. exit kernel
>                         8. more user stuff
> 
> As far as I can see, the problem is CPU1 might reorder step 5 and step
> 8, so you have mmdrop of lazy mm be a mb after step 6.
> 
> But why? The membarrier call only cares that there is a full barrier
> between 1 and 8, right? Which it will get from the previous context
> switch to the kthread.
> 
> I must say the memory barrier comments in membarrier could be improved
> a bit (unless I'm missing where the main comment is). It's fine to know
> what barriers pair with one another, but we need to know which exact
> memory accesses it is ordering
> 
>       /*
>         * Matches memory barriers around rq->curr modification in
>         * scheduler.
>         */
> 
> Sure, but it doesn't say what else is being ordered. I think it's just
> the user memory accesses, but would be nice to make that a bit more
> explicit. If we had such comments then we might know this case is safe.
> 
> I think the funny powerpc barrier is a similar case of this. If we
> ever see remote_rq->curr->flags & PF_KTHREAD, then we _know_ that
> CPU has or will have issued a memory barrier between running user
> code.
> 
> So AFAIKS all this membarrier stuff in kernel/sched/core.c could
> just go away. Except x86 because thread switch doesn't imply core
> sync, so CPU1 between 1 and 8 may never issue a core sync instruction
> the same way a context switch must be a full mb.
> 
> Before getting to x86 -- Am I right, or way off track here?

I find it hard to believe that this is x86 only. Why would thread switch imply core sync on any architecture?  Is x86 unique in having a stupid expensive core sync that is heavier than smp_mb()?

But I’m wondering if all this deferred sync stuff is wrong. In the brave new world of io_uring and such, perhaps kernel access matter too.  Heck, even:

int a[2];

Thread A:
a[0] = 1;
a[1] = 2:

Thread B:

write(fd, a, sizeof(a));

Doesn’t do what thread A is expecting.  Admittedly this particular example is nonsense, but maybe there are sensible cases that matter to someone.

—Andy

> 
> Thanks,
> Nick

^ permalink raw reply

* Re: [PATCH v3 12/12] ppc64/kexec_file: fix kexec load failure with lack of memory hole
From: Thiago Jung Bauermann @ 2020-07-16  5:43 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466101903.24747.7234708045729315954.stgit@hbathini.in.ibm.com>


Hari Bathini <hbathini@linux.ibm.com> writes:

> The kexec purgatory has to run in real mode. Only the first memory
> block maybe accessible in real mode. And, unlike the case with panic
> kernel, no memory is set aside for regular kexec load. Another thing
> to note is, the memory for crashkernel is reserved at an offset of
> 128MB. So, when crashkernel memory is reserved, the memory ranges to
> load kexec segments shrink further as the generic code only looks for
> memblock free memory ranges and in all likelihood only a tiny bit of
> memory from 0 to 128MB would be available to load kexec segments.
>
> With kdump being used by default in general, kexec file load is likely
> to fail almost always.

Ah. I wasn't aware of this problem.

> This can be fixed by changing the memory hole
> lookup logic for regular kexec to use the same method as kdump.

Right. It doesn't make that much sense to use memblock to find free
memory areas for the kexec kernel, because memblock tracks which memory
areas are free for the currently running kernel. But that's not what
matters for the kernel that will be kexec'd into. In this case, regions
which may be reserved for the current OS instance may well be free for a
freshly started kernel. The kdump method is better at knowing which
memory regions are actually reserved by the firmware/hardware.

> This
> would mean that most kexec segments will overlap with crashkernel
> memory region. That should still be ok as the pages, whose destination
> address isn't available while loading, are placed in an intermediate
> location till a flush to the actual destination address happens during
> kexec boot sequence.

Yes, since the kdump kernel and the "regular" kexec kernel can't be both
booted at the same time, it's not a problem if both plan to use the same
region of memory.

>
> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> Tested-by: Pingfan Liu <piliu@redhat.com>

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

> ---
>
> v2 -> v3:
> * Unchanged. Added Tested-by tag from Pingfan.
>
> v1 -> v2:
> * New patch to fix locating memory hole for kexec_file_load (kexec -s -l)
>   when memory is reserved for crashkernel.
>
>
>  arch/powerpc/kexec/file_load_64.c |   33 ++++++++++++++-------------------
>  1 file changed, 14 insertions(+), 19 deletions(-)

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v3 04/12] ppc64/kexec_file: avoid stomping memory used by special regions
From: Thiago Jung Bauermann @ 2020-07-16  5:58 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Petr Tesarik, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <87365t8pse.fsf@morokweng.localdomain>


Thiago Jung Bauermann <bauerman@linux.ibm.com> writes:

> Hari Bathini <hbathini@linux.ibm.com> writes:
>
>> diff --git a/arch/powerpc/include/asm/crashdump-ppc64.h b/arch/powerpc/include/asm/crashdump-ppc64.h
>> new file mode 100644
>> index 0000000..90deb46
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/crashdump-ppc64.h
>> @@ -0,0 +1,10 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +#ifndef _ASM_POWERPC_CRASHDUMP_PPC64_H
>> +#define _ASM_POWERPC_CRASHDUMP_PPC64_H
>> +
>> +/* min & max addresses for kdump load segments */
>> +#define KDUMP_BUF_MIN		(crashk_res.start)
>> +#define KDUMP_BUF_MAX		((crashk_res.end < ppc64_rma_size) ? \
>> +				 crashk_res.end : (ppc64_rma_size - 1))
>> +
>> +#endif /* __ASM_POWERPC_CRASHDUMP_PPC64_H */
>> diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
>> index 7008ea1..bf47a01 100644
>> --- a/arch/powerpc/include/asm/kexec.h
>> +++ b/arch/powerpc/include/asm/kexec.h
>> @@ -100,14 +100,16 @@ void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_co
>>  #ifdef CONFIG_KEXEC_FILE
>>  extern const struct kexec_file_ops kexec_elf64_ops;
>>
>> -#ifdef CONFIG_IMA_KEXEC
>>  #define ARCH_HAS_KIMAGE_ARCH
>>
>>  struct kimage_arch {
>> +	struct crash_mem *exclude_ranges;
>> +
>> +#ifdef CONFIG_IMA_KEXEC
>>  	phys_addr_t ima_buffer_addr;
>>  	size_t ima_buffer_size;
>> -};
>>  #endif
>> +};
>>
>>  int setup_purgatory(struct kimage *image, const void *slave_code,
>>  		    const void *fdt, unsigned long kernel_load_addr,
>> @@ -125,6 +127,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
>>  			unsigned long initrd_load_addr,
>>  			unsigned long initrd_len, const char *cmdline);
>>  #endif /* CONFIG_PPC64 */
>> +
>>  #endif /* CONFIG_KEXEC_FILE */
>>
>>  #else /* !CONFIG_KEXEC_CORE */
>> diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
>> index 23ad04c..c695f94 100644
>> --- a/arch/powerpc/kexec/elf_64.c
>> +++ b/arch/powerpc/kexec/elf_64.c
>> @@ -22,6 +22,7 @@
>>  #include <linux/of_fdt.h>
>>  #include <linux/slab.h>
>>  #include <linux/types.h>
>> +#include <asm/crashdump-ppc64.h>
>>
>>  static void *elf64_load(struct kimage *image, char *kernel_buf,
>>  			unsigned long kernel_len, char *initrd,
>> @@ -46,6 +47,12 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
>>  	if (ret)
>>  		goto out;
>>
>> +	if (image->type == KEXEC_TYPE_CRASH) {
>> +		/* min & max buffer values for kdump case */
>> +		kbuf.buf_min = pbuf.buf_min = KDUMP_BUF_MIN;
>> +		kbuf.buf_max = pbuf.buf_max = KDUMP_BUF_MAX;
>
> This is only my personal opinion and an actual maintainer may disagree,
> but just looking at the lines above, I would assume that KDUMP_BUF_MIN
> and KDUMP_BUF_MAX were constants, when in fact they aren't.
>
> I suggest using static inline macros in <asm/crashdump-ppc64.h>, for
> example:
>
> static inline resource_size_t get_kdump_buf_min(void)
> {
> 	return crashk_res.start;
> }
>
> static inline resource_size_t get_kdump_buf_max(void)
> {
> 	return (crashk_res.end < ppc64_rma_size) ? \
> 		 crashk_res.end : (ppc64_rma_size - 1)
> }

I later noticed that KDUMP_BUF_MIN and KDUMP_BUF_MAX are only used here.
In this case, I think the best option is to avoid the macros and inline
functions and just use the actual expressions in the code.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-16  6:06 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
	linux-mm, Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <EFAD6E2F-EC08-4EB3-9ECC-2A963C023FC5@amacapital.net>

Excerpts from Andy Lutomirski's message of July 16, 2020 3:18 pm:
> 
> 
>> On Jul 15, 2020, at 9:15 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> Excerpts from Mathieu Desnoyers's message of July 14, 2020 12:13 am:
>>> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
>>> 
>>>> Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>>>>> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>>>>>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>>>>>> serialize.  There are older kernels for which it does not promise to
>>>>>> serialize.  And I have plans to make it stop serializing in the
>>>>>> nearish future.
>>>>> 
>>>>> You mean x86's return from interrupt? Sounds fun... you'll konw where to
>>>>> update the membarrier sync code, at least :)
>>>> 
>>>> Oh, I should actually say Mathieu recently clarified a return from
>>>> interrupt doesn't fundamentally need to serialize in order to support
>>>> membarrier sync core.
>>> 
>>> Clarification to your statement:
>>> 
>>> Return from interrupt to kernel code does not need to be context serializing
>>> as long as kernel serializes before returning to user-space.
>>> 
>>> However, return from interrupt to user-space needs to be context serializing.
>> 
>> Hmm, I'm not sure it's enough even with the sync in the exit_lazy_tlb
>> in the right places.
>> 
>> A kernel thread does a use_mm, then it blocks and the user process with
>> the same mm runs on that CPU, and then it calls into the kernel, blocks,
>> the kernel thread runs again, another CPU issues a membarrier which does
>> not IPI this one because it's running a kthread, and then the kthread
>> switches back to the user process (still without having unused the mm),
>> and then the user process returns from syscall without having done a 
>> core synchronising instruction.
>> 
>> The cause of the problem is you want to avoid IPI'ing kthreads. Why?
>> I'm guessing it really only matters as an optimisation in case of idle
>> threads. Idle thread is easy (well, easier) because it won't use_mm, so 
>> you could check for rq->curr == rq->idle in your loop (in a suitable 
>> sched accessor function).
>> 
>> But... I'm not really liking this subtlety in the scheduler for all this 
>> (the scheduler still needs the barriers when switching out of idle).
>> 
>> Can it be improved somehow? Let me forget x86 core sync problem for now
>> (that _may_ be a bit harder), and step back and look at what we're doing.
>> The memory barrier case would actually suffer from the same problem as
>> core sync, because in the same situation it has no implicit mmdrop in
>> the scheduler switch code either.
>> 
>> So what are we doing with membarrier? We want any activity caused by the 
>> set of CPUs/threads specified that can be observed by this thread before 
>> calling membarrier is appropriately fenced from activity that can be 
>> observed to happen after the call returns.
>> 
>> CPU0                     CPU1
>>                         1. user stuff
>> a. membarrier()          2. enter kernel
>> b. read rq->curr         3. rq->curr switched to kthread
>> c. is kthread, skip IPI  4. switch_to kthread
>> d. return to user        5. rq->curr switched to user thread
>>                 6. switch_to user thread
>>                 7. exit kernel
>>                         8. more user stuff
>> 
>> As far as I can see, the problem is CPU1 might reorder step 5 and step
>> 8, so you have mmdrop of lazy mm be a mb after step 6.
>> 
>> But why? The membarrier call only cares that there is a full barrier
>> between 1 and 8, right? Which it will get from the previous context
>> switch to the kthread.
>> 
>> I must say the memory barrier comments in membarrier could be improved
>> a bit (unless I'm missing where the main comment is). It's fine to know
>> what barriers pair with one another, but we need to know which exact
>> memory accesses it is ordering
>> 
>>       /*
>>         * Matches memory barriers around rq->curr modification in
>>         * scheduler.
>>         */
>> 
>> Sure, but it doesn't say what else is being ordered. I think it's just
>> the user memory accesses, but would be nice to make that a bit more
>> explicit. If we had such comments then we might know this case is safe.
>> 
>> I think the funny powerpc barrier is a similar case of this. If we
>> ever see remote_rq->curr->flags & PF_KTHREAD, then we _know_ that
>> CPU has or will have issued a memory barrier between running user
>> code.
>> 
>> So AFAIKS all this membarrier stuff in kernel/sched/core.c could
>> just go away. Except x86 because thread switch doesn't imply core
>> sync, so CPU1 between 1 and 8 may never issue a core sync instruction
>> the same way a context switch must be a full mb.
>> 
>> Before getting to x86 -- Am I right, or way off track here?
> 
> I find it hard to believe that this is x86 only. Why would thread switch imply core sync on any architecture?  Is x86 unique in having a stupid expensive core sync that is heavier than smp_mb()?

It's not the thread switch but the return from kernel to user -- at 
least of architectures that implement membarrier SYNC_CORE, x86 can do 
that without serializing.

The thread switch is muddying the waters a bit, it's not the actual 
thread switch we care about, that just happens to be used as a point
where we try to catch the membarrier IPIs that were skipped due to the
PF_KTHREAD optimisation.

I think that doing said check in the lazy tlb exit code is both
unnecessary for the memory ordering and insufficient for pipeline 
serialization.

> But I’m wondering if all this deferred sync stuff is wrong. In the brave new world of io_uring and such, perhaps kernel access matter too.  Heck, even:
> 
> int a[2];
> 
> Thread A:
> a[0] = 1;
> a[1] = 2:
> 
> Thread B:
> 
> write(fd, a, sizeof(a));
> 
> Doesn’t do what thread A is expecting.  Admittedly this particular example is nonsense, but maybe there are sensible cases that matter to someone.

I think kernel accesses probably do matter (or at least they should by 
principle of least surprise). And so I was doubly misleading by labeling
it as "user stuff". I should have distinguished between previous user or
kernel accesses, as opposed to the kernel accesses specifically for the
implementation of the membarrier call.

So I think the membarrier code gets *that* part right (modulo what we 
have seen already) if the kernel access is being done from process
context.

But yes if the access is coming from io_uring that has done
kthread_use_mm or some other random code running in a kernel thread
working on get_user_pages memory or any similar shared vm_insert_pfn
memory, then it goes completely to hell.

So good catch, PF_KTHREAD check is problematic there even if no actual
users exist today. rq->curr == rq->idle test might be better, but can
we have interrupts writing completions into user memory? For performance
I would hope so, so that makes even that test problematic.

Maybe membarrier should close that gap entirely, and work around performance
issue by adding _USER_ONLY flags which explicitly only order user mode
accesess vs other user accesses.

Thanks,
Nick


^ permalink raw reply

* [PATCH v2 0/4] VSX 32-byte vector paired load/store instructions
From: Balamuruhan S @ 2020-07-16  6:15 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev

VSX vector paired instructions operates with octword (32-byte) operand
for loads and stores between storage and a pair of two sequential Vector-Scalar
Registers (VSRs). There are 4 word instructions and 2 prefixed instructions
that provides this 32-byte storage access operations - lxvp, lxvpx, stxvp,
stxvpx, plxvpx, pstxvpx.

Emulation infrastructure doesn't have support for these instructions, to
operate with 32-byte storage access and to operate with 2 VSX registers.
This patch series enables the instruction emulation support and adds test
cases for them respectively.

Changes in v2:
-------------
* Fix suggestion from Sandipan, wrap ISA 3.1 instructions with
  cpu_has_feature(CPU_FTR_ARCH_31) check.

* Rebase on latest powerpc next branch.

Balamuruhan S (4):
  powerpc/sstep: support new VSX vector paired storage access
    instructions
  powerpc/sstep: support emulation for vsx vector paired storage access
    instructions
  powerpc ppc-opcode: add opcodes for vsx vector paired instructions
  powerpc sstep: add testcases for vsx load/store instructions

 arch/powerpc/include/asm/ppc-opcode.h |  11 ++
 arch/powerpc/include/asm/sstep.h      |   2 +-
 arch/powerpc/lib/sstep.c              | 110 ++++++++++-
 arch/powerpc/lib/test_emulate_step.c  | 273 ++++++++++++++++++++++++++
 4 files changed, 386 insertions(+), 10 deletions(-)


base-commit: b2b46304e9360f3dda49c9d8ba4a1478b9eecf1d
-- 
2.24.1


^ permalink raw reply

* [PATCH v2 1/4] powerpc/sstep: support new VSX vector paired storage access instructions
From: Balamuruhan S @ 2020-07-16  6:15 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200716061558.1532199-1-bala24@linux.ibm.com>

VSX Vector Paired instructions loads/stores an octword (32 bytes)
from/to storage into two sequential VSRs. Add `analyse_instr()` support
to these new instructions,
        * Load VSX Vector Paired (lxvp)
        * Load VSX Vector Paired Indexed (lxvpx)
        * Prefixed Load VSX Vector Paired (plxvp)
        * Store VSX Vector Paired (stxvp)
        * Store VSX Vector Paired Indexed (stxvpx)
        * Prefixed Store VSX Vector Paired (pstxvp)

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/lib/sstep.c | 52 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5abe98216dc2..1af8c1920b36 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -31,6 +31,10 @@ extern char system_call_common[];
 #define XER_OV32	0x00080000U
 #define XER_CA32	0x00040000U
 
+#ifdef CONFIG_VSX
+#define VSX_REGISTER_XTP(rd)   ((((rd) & 1) << 5) | ((rd) & 0xfe))
+#endif
+
 #ifdef CONFIG_PPC_FPU
 /*
  * Functions in ldstfp.S
@@ -2382,6 +2386,15 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			op->vsx_flags = VSX_SPLAT;
 			break;
 
+		case 333:       /* lxvpx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_31))
+				return -1;
+			op->reg = VSX_REGISTER_XTP(rd);
+			op->type = MKOP(LOAD_VSX, 0, 32);
+			op->element_size = 32;
+			op->vsx_flags = VSX_CHECK_VEC;
+			break;
+
 		case 364:	/* lxvwsx */
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 4);
@@ -2410,6 +2423,14 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 				VSX_CHECK_VEC;
 			break;
 		}
+		case 461:       /* stxvpx */
+			if (!cpu_has_feature(CPU_FTR_ARCH_31))
+				return -1;
+			op->reg = VSX_REGISTER_XTP(rd);
+			op->type = MKOP(STORE_VSX, 0, 32);
+			op->element_size = 32;
+			op->vsx_flags = VSX_CHECK_VEC;
+			break;
 		case 524:	/* lxsspx */
 			op->reg = rd | ((word & 1) << 5);
 			op->type = MKOP(LOAD_VSX, 0, 4);
@@ -2651,6 +2672,23 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_VSX
+	case 6:
+		if (!cpu_has_feature(CPU_FTR_ARCH_31))
+			return -1;
+		op->ea = dqform_ea(word, regs);
+		op->reg = VSX_REGISTER_XTP(rd);
+		op->element_size = 32;
+		op->vsx_flags = VSX_CHECK_VEC;
+		switch (word & 0xf) {
+		case 0:         /* lxvp */
+			op->type = MKOP(LOAD_VSX, 0, 32);
+			break;
+		case 1:         /* stxvp */
+			op->type = MKOP(STORE_VSX, 0, 32);
+			break;
+		}
+		break;
+
 	case 61:	/* stfdp, lxv, stxsd, stxssp, stxv */
 		switch (word & 7) {
 		case 0:		/* stfdp with LSB of DS field = 0 */
@@ -2715,6 +2753,8 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 		}
 		break;
 	case 1: /* Prefixed instructions */
+		if (!cpu_has_feature(CPU_FTR_ARCH_31))
+			return -1;
 		prefix_r = word & (1ul << 20);
 		ra = (suffix >> 16) & 0x1f;
 		op->update_reg = ra;
@@ -2779,12 +2819,24 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			case 57:	/* pld */
 				op->type = MKOP(LOAD, PREFIXED, 8);
 				break;
+			case 58:        /* plxvp */
+				op->reg = VSX_REGISTER_XTP(rd);
+				op->type = MKOP(LOAD_VSX, PREFIXED, 32);
+				op->element_size = 32;
+				op->vsx_flags = VSX_CHECK_VEC;
+				break;
 			case 60:        /* stq */
 				op->type = MKOP(STORE, PREFIXED, 16);
 				break;
 			case 61:	/* pstd */
 				op->type = MKOP(STORE, PREFIXED, 8);
 				break;
+			case 62:        /* pstxvp */
+				op->reg = VSX_REGISTER_XTP(rd);
+				op->type = MKOP(STORE_VSX, PREFIXED, 32);
+				op->element_size = 32;
+				op->vsx_flags = VSX_CHECK_VEC;
+				break;
 			}
 			break;
 		case 1: /* Type 01 Eight-Byte Register-to-Register */
-- 
2.24.1


^ permalink raw reply related

* [PATCH v2 2/4] powerpc/sstep: support emulation for vsx vector paired storage access instructions
From: Balamuruhan S @ 2020-07-16  6:15 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200716061558.1532199-1-bala24@linux.ibm.com>

add emulate_step() changes to support vsx vector paired storage
access instructions that provides octword operands loads/stores
between storage and set of 2 Vector Scalar Registers (VSRs).

Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
---
 arch/powerpc/include/asm/sstep.h |  2 +-
 arch/powerpc/lib/sstep.c         | 58 +++++++++++++++++++++++++++-----
 2 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 3b01c69a44aa..a6c0b299bcc9 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -126,7 +126,7 @@ union vsx_reg {
 	unsigned long d[2];
 	float	fp[4];
 	double	dp[2];
-	__vector128 v;
+	__vector128 v[2];
 };
 
 /*
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 1af8c1920b36..010ce81aeffb 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -279,6 +279,19 @@ static nokprobe_inline void do_byte_reverse(void *ptr, int nb)
 		up[1] = tmp;
 		break;
 	}
+	case 32: {
+		unsigned long *up = (unsigned long *)ptr;
+		unsigned long tmp;
+
+		tmp = byterev_8(up[0]);
+		up[0] = byterev_8(up[3]);
+		up[3] = tmp;
+		tmp = byterev_8(up[2]);
+		up[2] = byterev_8(up[1]);
+		up[1] = tmp;
+		break;
+	}
+
 #endif
 	default:
 		WARN_ON_ONCE(1);
@@ -709,6 +722,8 @@ void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
 	reg->d[0] = reg->d[1] = 0;
 
 	switch (op->element_size) {
+	case 32:
+		/* [p]lxvp[x] or [p]stxvp[x] */
 	case 16:
 		/* whole vector; lxv[x] or lxvl[l] */
 		if (size == 0)
@@ -717,7 +732,7 @@ void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
 		if (IS_LE && (op->vsx_flags & VSX_LDLEFT))
 			rev = !rev;
 		if (rev)
-			do_byte_reverse(reg, 16);
+			do_byte_reverse(reg, size);
 		break;
 	case 8:
 		/* scalar loads, lxvd2x, lxvdsx */
@@ -793,6 +808,22 @@ void emulate_vsx_store(struct instruction_op *op, const union vsx_reg *reg,
 	size = GETSIZE(op->type);
 
 	switch (op->element_size) {
+	case 32:
+		/* [p]lxvp[x] or [p]stxvp[x] */
+		if (size == 0)
+			break;
+		if (IS_LE && (op->vsx_flags & VSX_LDLEFT))
+			rev = !rev;
+		if (rev) {
+			/* reverse 32 bytes */
+			buf.d[0] = byterev_8(reg->d[3]);
+			buf.d[1] = byterev_8(reg->d[2]);
+			buf.d[2] = byterev_8(reg->d[1]);
+			buf.d[3] = byterev_8(reg->d[0]);
+			reg = &buf;
+		}
+		memcpy(mem, reg, size);
+		break;
 	case 16:
 		/* stxv, stxvx, stxvl, stxvll */
 		if (size == 0)
@@ -861,28 +892,33 @@ static nokprobe_inline int do_vsx_load(struct instruction_op *op,
 				       bool cross_endian)
 {
 	int reg = op->reg;
-	u8 mem[16];
+	int i, nr_vsx_regs;
+	u8 mem[32];
 	union vsx_reg buf;
 	int size = GETSIZE(op->type);
 
 	if (!address_ok(regs, ea, size) || copy_mem_in(mem, ea, size, regs))
 		return -EFAULT;
 
+	nr_vsx_regs = size / sizeof(__vector128);
 	emulate_vsx_load(op, &buf, mem, cross_endian);
 	preempt_disable();
 	if (reg < 32) {
 		/* FP regs + extensions */
 		if (regs->msr & MSR_FP) {
-			load_vsrn(reg, &buf);
+			for (i = 0; i < nr_vsx_regs; i++)
+				load_vsrn(reg + i, &buf.v[i]);
 		} else {
 			current->thread.fp_state.fpr[reg][0] = buf.d[0];
 			current->thread.fp_state.fpr[reg][1] = buf.d[1];
 		}
 	} else {
 		if (regs->msr & MSR_VEC)
-			load_vsrn(reg, &buf);
+			for (i = 0; i < nr_vsx_regs; i++)
+				load_vsrn(reg + i, &buf.v[i]);
+
 		else
-			current->thread.vr_state.vr[reg - 32] = buf.v;
+			current->thread.vr_state.vr[reg - 32] = buf.v[0];
 	}
 	preempt_enable();
 	return 0;
@@ -893,27 +929,31 @@ static nokprobe_inline int do_vsx_store(struct instruction_op *op,
 					bool cross_endian)
 {
 	int reg = op->reg;
-	u8 mem[16];
+	int i, nr_vsx_regs;
+	u8 mem[32];
 	union vsx_reg buf;
 	int size = GETSIZE(op->type);
 
 	if (!address_ok(regs, ea, size))
 		return -EFAULT;
 
+	nr_vsx_regs = size / sizeof(__vector128);
 	preempt_disable();
 	if (reg < 32) {
 		/* FP regs + extensions */
 		if (regs->msr & MSR_FP) {
-			store_vsrn(reg, &buf);
+			for (i = 0; i < nr_vsx_regs; i++)
+				store_vsrn(reg + i, &buf.v[i]);
 		} else {
 			buf.d[0] = current->thread.fp_state.fpr[reg][0];
 			buf.d[1] = current->thread.fp_state.fpr[reg][1];
 		}
 	} else {
 		if (regs->msr & MSR_VEC)
-			store_vsrn(reg, &buf);
+			for (i = 0; i < nr_vsx_regs; i++)
+				store_vsrn(reg + i, &buf.v[i]);
 		else
-			buf.v = current->thread.vr_state.vr[reg - 32];
+			buf.v[0] = current->thread.vr_state.vr[reg - 32];
 	}
 	preempt_enable();
 	emulate_vsx_store(op, &buf, mem, cross_endian);
-- 
2.24.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox