* [PATCH v3 07/12] ppc64/kexec_file: add support to relocate purgatory
From: Hari Bathini @ 2020-07-13 17:22 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: kernel test robot, Pingfan Liu, Kexec-ml, Nayna Jain,
Petr Tesarik, Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev,
Sourabh Jain, Vivek Goyal, Dave Young, Thiago Jung Bauermann,
Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>
Right now, the purgatory implementation is only minimal. But if the
purgatory code is to be enhanced to copy memory to the backup region and
verify the sha256 digest, relocations may have to be applied to the purgatory.
So, add support to relocate purgatory in kexec_file_load system call
by setting up TOC pointer and applying RELA relocations as needed.
Reported-by: kernel test robot <lkp@intel.com>
[lkp: In v1, 'struct mem_sym' was declared in parameter list]
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
---
v2 -> v3:
* Fixed get_toc_section() to return the section info that had relocations
applied, to calculate the correct toc pointer.
* Fixed how relocation value is converted to relative while applying
R_PPC64_REL64 & R_PPC64_REL32 relocations.
v1 -> v2:
* Fixed wrong use of 'struct mem_sym' in local_entry_offset() as
reported by lkp. lkp report for reference:
- https://lore.kernel.org/patchwork/patch/1264421/
arch/powerpc/kexec/file_load_64.c | 337 ++++++++++++++++++++++++++++++++
arch/powerpc/purgatory/trampoline_64.S | 8 +
2 files changed, 345 insertions(+)
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 1c4e3eb..8bff29e 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -20,6 +20,7 @@
#include <linux/of_device.h>
#include <linux/memblock.h>
#include <linux/slab.h>
+#include <asm/types.h>
#include <asm/drmem.h>
#include <asm/kexec_ranges.h>
#include <asm/crashdump-ppc64.h>
@@ -621,6 +622,244 @@ static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
}
/**
+ * get_toc_section - Find the non-empty section named ".toc" in the purgatory
+ *                   ELF and hand back its entry from pi->sechdrs.
+ * @pi:              Purgatory Info.
+ *
+ * The section name is matched against the section headers found in the ELF
+ * image (pi->ehdr), but the header returned is the one at the same index in
+ * pi->sechdrs.
+ *
+ * Returns the TOC section header on success, NULL otherwise.
+ */
+static const Elf_Shdr *get_toc_section(const struct purgatory_info *pi)
+{
+	const Elf_Shdr *shdr_tbl;
+	const char *shnames;
+	int idx;
+
+	if (!pi->ehdr) {
+		pr_err("Purgatory elf load info missing?\n");
+		return NULL;
+	}
+
+	/* Section header table and section-name string table of the image */
+	shdr_tbl = (void *)pi->ehdr + pi->ehdr->e_shoff;
+	shnames = (void *)pi->ehdr + shdr_tbl[pi->ehdr->e_shstrndx].sh_offset;
+
+	for (idx = 0; idx < pi->ehdr->e_shnum; idx++) {
+		const Elf_Shdr *cur = &shdr_tbl[idx];
+
+		/* Empty sections never qualify */
+		if (cur->sh_size == 0)
+			continue;
+
+		if (strcmp(shnames + cur->sh_name, ".toc") != 0)
+			continue;
+
+		/* Same index, but from the headers kept in purgatory info */
+		return &pi->sechdrs[idx];
+	}
+
+	return NULL;
+}
+
+/**
+ * get_toc_ptr - Compute the TOC pointer (r2) for the purgatory.
+ * @pi:          Purgatory Info.
+ *
+ * The TOC pointer is the address of the ".toc" section plus 0x8000.
+ *
+ * Returns r2 on success, 0 if the TOC section could not be found.
+ */
+static unsigned long get_toc_ptr(const struct purgatory_info *pi)
+{
+	const Elf_Shdr *toc = get_toc_section(pi);
+	unsigned long r2 = 0;
+
+	if (toc)
+		r2 = toc->sh_addr + 0x8000;	/* 0x8000 into TOC */
+	else
+		pr_err("Could not get the TOC section!\n");
+
+	pr_debug("TOC pointer (r2) is 0x%lx\n", r2);
+	return r2;
+}
+
+/* Helper functions to apply relocations */
+/*
+ * Patch a 16-bit TOC-relative field at @loc with @val.
+ *
+ * @mask selects the bits of the halfword that are replaced. When
+ * @complain_signed is set, @val must fit in a signed 16-bit quantity.
+ * Any bit of @val inside the low 16 bits but outside @mask is rejected.
+ *
+ * Returns 0 on success, -ENOEXEC if @val does not fit.
+ */
+static int do_relative_toc(unsigned long val, uint16_t *loc,
+			   unsigned long mask, int complain_signed)
+{
+	/* Low-halfword bits that the relocation is not allowed to carry */
+	const unsigned long dropped = ~mask & 0xffff;
+
+	if (complain_signed && (val + 0x8000 > 0xffff)) {
+		pr_err("TOC16 relocation overflows (%lu)\n", val);
+		return -ENOEXEC;
+	}
+
+	if (val & dropped) {
+		pr_err("Bad TOC16 relocation (%lu)\n", val);
+		return -ENOEXEC;
+	}
+
+	*loc = (*loc & ~mask) | (val & mask);
+	return 0;
+}
+#ifdef PPC64_ELF_ABI_v2
+/* PowerPC64 specific values for the Elf64_Sym st_other field. */
+#define STO_PPC64_LOCAL_BIT	5
+#define STO_PPC64_LOCAL_MASK	(7 << STO_PPC64_LOCAL_BIT)
+#define PPC64_LOCAL_ENTRY_OFFSET(other)					\
+	(((1 << (((other) & STO_PPC64_LOCAL_MASK) >> STO_PPC64_LOCAL_BIT)) \
+	 >> 2) << 2)
+#endif
+
+/*
+ * Offset of a symbol's local entry point from its value. With
+ * PPC64_ELF_ABI_v2 it is decoded from the st_other field; otherwise it is
+ * always 0.
+ */
+static unsigned int local_entry_offset(const Elf64_Sym *sym)
+{
+#ifdef PPC64_ELF_ABI_v2
+	/* If this symbol has a local entry point, use it. */
+	return PPC64_LOCAL_ENTRY_OFFSET(sym->st_other);
+#else
+	return 0;
+#endif
+}
+
+/**
+ * __kexec_do_relocs - Apply a single relocation based on its type.
+ * @my_r2:             TOC pointer.
+ * @sym:               Symbol to relocate.
+ * @r_type:            Relocation type.
+ * @loc:               Location to modify.
+ * @val:               Relocated symbol value.
+ * @addr:              Final location after relocation.
+ *
+ * @loc is where the bytes are written now, while @addr is the address
+ * those bytes will have at run time; relative relocations are computed
+ * against @addr.
+ *
+ * Returns 0 on success, -ENOEXEC if the relocation type is unknown, the
+ * value does not fit, or the instruction being patched is not the one
+ * expected.
+ */
+static int __kexec_do_relocs(unsigned long my_r2, const Elf_Sym *sym,
+			     int r_type, void *loc, unsigned long val,
+			     unsigned long addr)
+{
+	int ret = 0;
+
+	switch (r_type) {
+	case R_PPC64_ADDR32:
+		/* Simply set it */
+		*(uint32_t *)loc = val;
+		break;
+
+	case R_PPC64_ADDR64:
+		/* Simply set it */
+		*(uint64_t *)loc = val;
+		break;
+
+	case R_PPC64_REL64:
+		*(uint64_t *)loc = val - (uint64_t)addr;
+		break;
+
+	case R_PPC64_REL32:
+		/* Convert value to relative */
+		val -= addr;
+		/* Must fit in a signed 32-bit displacement */
+		if (val + 0x80000000 > 0xffffffff) {
+			pr_err("REL32 %li out of range!\n", val);
+			return -ENOEXEC;
+		}
+
+		*(uint32_t *)loc = val;
+		break;
+
+	case R_PPC64_TOC:
+		*(uint64_t *)loc = my_r2;
+		break;
+
+	case R_PPC64_TOC16:
+		ret = do_relative_toc(val - my_r2, loc, 0xffff, 1);
+		break;
+
+	case R_PPC64_TOC16_DS:
+		ret = do_relative_toc(val - my_r2, loc, 0xfffc, 1);
+		break;
+
+	case R_PPC64_TOC16_LO:
+		ret = do_relative_toc(val - my_r2, loc, 0xffff, 0);
+		break;
+
+	case R_PPC64_TOC16_LO_DS:
+		ret = do_relative_toc(val - my_r2, loc, 0xfffc, 0);
+		break;
+
+	case R_PPC64_TOC16_HI:
+		ret = do_relative_toc((val - my_r2) >> 16, loc,
+				      0xffff, 0);
+		break;
+
+	case R_PPC64_TOC16_HA:
+		ret = do_relative_toc((val - my_r2 + 0x8000) >> 16, loc,
+				      0xffff, 0);
+		break;
+
+	case R_PPC64_REL24:
+		val += local_entry_offset(sym);
+		/* Convert value to relative */
+		val -= addr;
+		/* Must fit in the branch displacement and be word aligned */
+		if (val + 0x2000000 > 0x3ffffff || (val & 3) != 0) {
+			pr_err("REL24 %li out of range!\n", val);
+			return -ENOEXEC;
+		}
+
+		/* Only replace bits 2 through 26 */
+		*(uint32_t *)loc = ((*(uint32_t *)loc & ~0x03fffffc) |
+				    (val & 0x03fffffc));
+		break;
+
+	case R_PPC64_ADDR16_LO:
+		*(uint16_t *)loc = val & 0xffff;
+		break;
+
+	case R_PPC64_ADDR16_HI:
+		*(uint16_t *)loc = (val >> 16) & 0xffff;
+		break;
+
+	case R_PPC64_ADDR16_HA:
+		*(uint16_t *)loc = (((val + 0x8000) >> 16) & 0xffff);
+		break;
+
+	case R_PPC64_ADDR16_HIGHER:
+		*(uint16_t *)loc = (((uint64_t)val >> 32) & 0xffff);
+		break;
+
+	case R_PPC64_ADDR16_HIGHEST:
+		*(uint16_t *)loc = (((uint64_t)val >> 48) & 0xffff);
+		break;
+
+	/* R_PPC64_REL16_HA and R_PPC64_REL16_LO are handled to support
+	 * ABIv2 r2 assignment based on r12 for PIC executable.
+	 * Here address is known, so replace
+	 *	0:	addis 2,12,.TOC.-0b@ha
+	 *		addi 2,2,.TOC.-0b@l
+	 * by
+	 *		lis 2,.TOC.@ha
+	 *		addi 2,2,.TOC.@l
+	 */
+	case R_PPC64_REL16_HA:
+		/* check that we are dealing with the addis 2,12 instruction */
+		if (((*(uint32_t *)loc) & 0xffff0000) != 0x3c4c0000) {
+			/* Newline added so the log record is terminated */
+			pr_err("Unexpected instruction for R_PPC64_REL16_HA\n");
+			return -ENOEXEC;
+		}
+
+		val += my_r2;
+		/* replacing by lis 2 */
+		*(uint32_t *)loc = 0x3c400000 + ((val >> 16) & 0xffff);
+		break;
+
+	case R_PPC64_REL16_LO:
+		/* check that we are dealing with the addi 2,2 instruction */
+		if (((*(uint32_t *)loc) & 0xffff0000) != 0x38420000) {
+			/* Newline added so the log record is terminated */
+			pr_err("Unexpected instruction for R_PPC64_REL16_LO\n");
+			return -ENOEXEC;
+		}
+
+		val += my_r2 - 4;
+		*(uint16_t *)loc = val & 0xffff;
+		break;
+
+	default:
+		pr_err("Unknown rela relocation: %d\n", r_type);
+		ret = -ENOEXEC;
+		break;
+	}
+
+	return ret;
+}
+
+/**
* setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
* variables and call setup_purgatory() to initialize
* common global variable.
@@ -636,6 +875,7 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
const void *fdt, unsigned long kernel_load_addr,
unsigned long fdt_load_addr)
{
+ uint64_t val;
int ret;
ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
@@ -658,6 +898,10 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
goto out;
}
+ /* Setup the TOC pointer */
+ val = get_toc_ptr(&(image->purgatory_info));
+ ret = kexec_purgatory_get_set_symbol(image, "my_toc", &val, sizeof(val),
+ false);
out:
if (ret)
pr_err("Failed to setup purgatory symbols");
@@ -784,6 +1028,99 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
+ * arch_kexec_apply_relocations_add - Apply relocations of type RELA
+ * @pi: Purgatory Info.
+ * @section: Section relocations applying to.
+ * @relsec: Section containing RELAs.
+ * @symtab: Corresponding symtab.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab)
+{
+ const char *strtab, *name, *shstrtab;
+ int i, r_type, ret, err = -ENOEXEC;
+ const Elf_Shdr *sechdrs;
+ unsigned long my_r2;
+ Elf_Rela *relas;
+
+ /* String & section header string table */
+ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+ strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+ shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+ relas = (void *)pi->ehdr + relsec->sh_offset;
+
+ pr_debug("Applying relocate section %s to %u\n",
+ shstrtab + relsec->sh_name, relsec->sh_info);
+
+ /* Get the TOC pointer (r2) */
+ my_r2 = get_toc_ptr(pi);
+ if (!my_r2)
+ return err;
+
+ for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) {
+ const Elf_Sym *sym; /* symbol to relocate */
+ unsigned long addr; /* final location after relocation */
+ unsigned long val; /* relocated symbol value */
+ void *loc; /* tmp location to modify */
+
+ sym = (void *)pi->ehdr + symtab->sh_offset;
+ sym += ELF64_R_SYM(relas[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %x shndx: %x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if ((sym->st_shndx == SHN_UNDEF) &&
+ (ELF_ST_TYPE(sym->st_info) != STT_NOTYPE)) {
+ pr_err("Undefined symbol: %s\n", name);
+ return err;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return err;
+ }
+
+ if ((sym->st_shndx >= pi->ehdr->e_shnum) &&
+ (sym->st_shndx != SHN_ABS)) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return err;
+ }
+
+ loc = pi->purgatory_buf;
+ loc += section->sh_offset;
+ loc += relas[i].r_offset;
+
+ val = sym->st_value;
+ if (sym->st_shndx != SHN_ABS)
+ val += pi->sechdrs[sym->st_shndx].sh_addr;
+ val += relas[i].r_addend;
+
+ addr = section->sh_addr + relas[i].r_offset;
+
+ pr_debug("Symbol: %s value=%lx address=%lx\n", name, val, addr);
+
+ r_type = ELF64_R_TYPE(relas[i].r_info);
+ ret = __kexec_do_relocs(my_r2, sym, r_type, loc, val, addr);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
* arch_kexec_kernel_image_probe - Does additional handling needed to setup
* kexec segments.
* @image: kexec image being loaded.
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
index a5a83c3..7b4a5f7 100644
--- a/arch/powerpc/purgatory/trampoline_64.S
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -51,6 +51,8 @@ master:
bl 0f /* Work out where we're running */
0: mflr %r18
+ ld %r2,(my_toc - 0b)(%r18) /* setup toc */
+
/* load device-tree address */
ld %r3, (dt_offset - 0b)(%r18)
mr %r16,%r3 /* save dt address in reg16 */
@@ -103,6 +105,12 @@ dt_offset:
.size dt_offset, . - dt_offset
+ /* 8-byte slot holding the TOC pointer; master loads it into r2 */
+ .balign 8
+ .globl my_toc
+my_toc:
+ .8byte 0x0
+ .size my_toc, . - my_toc
+
.data
.balign 8
.globl purgatory_sha256_digest
^ permalink raw reply related
* [PATCH v3 02/12] powerpc/kexec_file: mark PPC64 specific code
From: Hari Bathini @ 2020-07-13 17:21 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>
Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
same spirit.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* No changes.
arch/powerpc/include/asm/kexec.h | 11 +++
arch/powerpc/kexec/Makefile | 2 -
arch/powerpc/kexec/elf_64.c | 7 +-
arch/powerpc/kexec/file_load.c | 37 ++--------
arch/powerpc/kexec/file_load_64.c | 108 ++++++++++++++++++++++++++++++
arch/powerpc/purgatory/Makefile | 4 +
arch/powerpc/purgatory/trampoline.S | 117 --------------------------------
arch/powerpc/purgatory/trampoline_64.S | 117 ++++++++++++++++++++++++++++++++
8 files changed, 248 insertions(+), 155 deletions(-)
create mode 100644 arch/powerpc/kexec/file_load_64.c
delete mode 100644 arch/powerpc/purgatory/trampoline.S
create mode 100644 arch/powerpc/purgatory/trampoline_64.S
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index c684768..7008ea1 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -114,8 +114,17 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
unsigned long fdt_load_addr);
int setup_new_fdt(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr, unsigned long initrd_len,
- const char *cmdline);
+ const char *cmdline, int *node);
int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
+
+#ifdef CONFIG_PPC64
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+ const void *fdt, unsigned long kernel_load_addr,
+ unsigned long fdt_load_addr);
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr,
+ unsigned long initrd_len, const char *cmdline);
+#endif /* CONFIG_PPC64 */
#endif /* CONFIG_KEXEC_FILE */
#else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 86380c6..67c3553 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y += core.o crash.o core_$(BITS).o
obj-$(CONFIG_PPC32) += relocate_32.o
-obj-$(CONFIG_KEXEC_FILE) += file_load.o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += file_load.o file_load_$(BITS).o elf_$(BITS).o
ifdef CONFIG_HAVE_IMA_KEXEC
ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 3072fd6..23ad04c 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -88,7 +88,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
goto out;
}
- ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+ ret = setup_new_fdt_ppc64(image, fdt, initrd_load_addr,
+ initrd_len, cmdline);
if (ret)
goto out;
@@ -107,8 +108,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
- ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
- fdt_load_addr);
+ ret = setup_purgatory_ppc64(image, slave_code, fdt, kernel_load_addr,
+ fdt_load_addr);
if (ret)
pr_err("Error setting up the purgatory.\n");
diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index 143c917..99a2c4d 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * ppc64 code to implement the kexec_file_load syscall
+ * powerpc code to implement the kexec_file_load syscall
*
* Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
* Copyright (C) 2004 IBM Corp.
@@ -16,26 +16,10 @@
#include <linux/slab.h>
#include <linux/kexec.h>
-#include <linux/of_fdt.h>
#include <linux/libfdt.h>
#include <asm/ima.h>
-#define SLAVE_CODE_SIZE 256
-
-const struct kexec_file_ops * const kexec_file_loaders[] = {
- &kexec_elf64_ops,
- NULL
-};
-
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- /* We don't support crash kernels yet. */
- if (image->type == KEXEC_TYPE_CRASH)
- return -EOPNOTSUPP;
-
- return kexec_image_probe_default(image, buf, buf_len);
-}
+#define SLAVE_CODE_SIZE 256 /* First 0x100 bytes */
/**
* setup_purgatory - initialize the purgatory's global variables
@@ -127,24 +111,17 @@ int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
* @initrd_len: Size of the next initrd, or 0 if there will be none.
* @cmdline: Command line for the next kernel, or NULL if there will
* be none.
+ * @node: Optional output parameter; if non-NULL, set to the offset of
+ * the /chosen node.
*
* Return: 0 on success, or negative errno on error.
*/
int setup_new_fdt(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr, unsigned long initrd_len,
- const char *cmdline)
+ const char *cmdline, int *node)
{
int ret, chosen_node;
const void *prop;
- /* Remove memory reservation for the current device tree. */
- ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
- fdt_totalsize(initial_boot_params));
- if (ret == 0)
- pr_debug("Removed old device tree reservation.\n");
- else if (ret != -ENOENT)
- return ret;
-
chosen_node = fdt_path_offset(fdt, "/chosen");
if (chosen_node == -FDT_ERR_NOTFOUND) {
chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
@@ -157,6 +134,8 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
pr_err("Malformed device tree: error reading /chosen.\n");
return -EINVAL;
}
+ if (node)
+ *node = chosen_node;
/* Did we boot using an initrd? */
prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
@@ -242,10 +221,6 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
return ret;
}
- ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
- if (ret)
- goto err;
-
return 0;
err:
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
new file mode 100644
index 0000000..e6bff960
--- /dev/null
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ppc64 code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, kexec-elf-rel-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini <hbathini@linux.ibm.com>.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+
+/* Table of kexec file loaders; only the ELF64 loader, followed by NULL. */
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_elf64_ops,
+ NULL
+};
+
+/**
+ * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
+ *                         variables and call setup_purgatory() to initialize
+ *                         common global variable.
+ * @image:                 kexec image.
+ * @slave_code:            Slave code for the purgatory.
+ * @fdt:                   Flattened device tree for the next kernel.
+ * @kernel_load_addr:      Address where the kernel is loaded.
+ * @fdt_load_addr:         Address where the flattened device tree is loaded.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+			  const void *fdt, unsigned long kernel_load_addr,
+			  unsigned long fdt_load_addr)
+{
+	int ret;
+
+	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
+			      fdt_load_addr);
+	if (ret) {
+		/* Message is newline-terminated so the log record is complete */
+		pr_err("Failed to setup purgatory symbols\n");
+	}
+	return ret;
+}
+
+/**
+ * setup_new_fdt_ppc64 - Update the flattened device-tree of the kernel
+ *                       being loaded.
+ * @image:               kexec image being loaded.
+ * @fdt:                 Flattened device tree for the next kernel.
+ * @initrd_load_addr:    Address where the next initrd will be loaded.
+ * @initrd_len:          Size of the next initrd, or 0 if there will be none.
+ * @cmdline:             Command line for the next kernel, or NULL if there will
+ *                       be none.
+ *
+ * Removes the memory reservation of the current device tree, runs the
+ * common setup_new_fdt() and finally marks /chosen with the
+ * "linux,booted-from-kexec" property.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+			unsigned long initrd_load_addr,
+			unsigned long initrd_len, const char *cmdline)
+{
+	int chosen_node, ret;
+
+	/* Remove memory reservation for the current device tree. */
+	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
+				 fdt_totalsize(initial_boot_params));
+	if (ret && ret != -ENOENT) {
+		pr_err("Failed to remove old device-tree reservation.\n");
+		return ret;
+	}
+	/* -ENOENT is tolerated: there was no reservation to remove */
+	if (!ret)
+		pr_debug("Removed old device tree reservation.\n");
+
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
+			    cmdline, &chosen_node);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+	if (ret)
+		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
+
+	return ret;
+}
+
+/**
+ * arch_kexec_kernel_image_probe - Does additional handling needed to setup
+ *                                 kexec segments.
+ * @image:                         kexec image being loaded.
+ * @buf:                           Buffer pointing to elf data.
+ * @buf_len:                       Length of the buffer.
+ *
+ * Crash (kdump) images are rejected with -EOPNOTSUPP; everything else is
+ * passed to the default probe.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+				  unsigned long buf_len)
+{
+	if (image->type != KEXEC_TYPE_CRASH)
+		return kexec_image_probe_default(image, buf, buf_len);
+
+	/* We don't support crash kernels yet. */
+	return -EOPNOTSUPP;
+}
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index 7c6d8b1..348f5958 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -2,11 +2,11 @@
KASAN_SANITIZE := n
-targets += trampoline.o purgatory.ro kexec-purgatory.c
+targets += trampoline_$(BITS).o purgatory.ro kexec-purgatory.c
LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
-$(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
+$(obj)/purgatory.ro: $(obj)/trampoline_$(BITS).o FORCE
$(call if_changed,ld)
quiet_cmd_bin2c = BIN2C $@
diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
deleted file mode 100644
index a5a83c3..0000000
--- a/arch/powerpc/purgatory/trampoline.S
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * kexec trampoline
- *
- * Based on code taken from kexec-tools and kexec-lite.
- *
- * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
- * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
- * Copyright (C) 2013, Anton Blanchard, IBM Corporation
- */
-
-#include <asm/asm-compat.h>
-
- .machine ppc64
- .balign 256
- .globl purgatory_start
-purgatory_start:
- b master
-
- /* ABI: possible run_at_load flag at 0x5c */
- .org purgatory_start + 0x5c
- .globl run_at_load
-run_at_load:
- .long 0
- .size run_at_load, . - run_at_load
-
- /* ABI: slaves start at 60 with r3=phys */
- .org purgatory_start + 0x60
-slave:
- b .
- /* ABI: end of copied region */
- .org purgatory_start + 0x100
- .size purgatory_start, . - purgatory_start
-
-/*
- * The above 0x100 bytes at purgatory_start are replaced with the
- * code from the kernel (or next stage) by setup_purgatory().
- */
-
-master:
- or %r1,%r1,%r1 /* low priority to let other threads catchup */
- isync
- mr %r17,%r3 /* save cpu id to r17 */
- mr %r15,%r4 /* save physical address in reg15 */
-
- or %r3,%r3,%r3 /* ok now to high priority, lets boot */
- lis %r6,0x1
- mtctr %r6 /* delay a bit for slaves to catch up */
- bdnz . /* before we overwrite 0-100 again */
-
- bl 0f /* Work out where we're running */
-0: mflr %r18
-
- /* load device-tree address */
- ld %r3, (dt_offset - 0b)(%r18)
- mr %r16,%r3 /* save dt address in reg16 */
- li %r4,20
- LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */
- cmpwi %cr0,%r6,2 /* v2 or later? */
- blt 1f
- li %r4,28
- STWX_BE %r17,%r3,%r4 /* Store my cpu as __be32 at byte 28 */
-1:
- /* load the kernel address */
- ld %r4,(kernel - 0b)(%r18)
-
- /* load the run_at_load flag */
- /* possibly patched by kexec */
- ld %r6,(run_at_load - 0b)(%r18)
- /* and patch it into the kernel */
- stw %r6,(0x5c)(%r4)
-
- mr %r3,%r16 /* restore dt address */
-
- li %r5,0 /* r5 will be 0 for kernel */
-
- mfmsr %r11
- andi. %r10,%r11,1 /* test MSR_LE */
- bne .Little_endian
-
- mtctr %r4 /* prepare branch to */
- bctr /* start kernel */
-
-.Little_endian:
- mtsrr0 %r4 /* prepare branch to */
-
- clrrdi %r11,%r11,1 /* clear MSR_LE */
- mtsrr1 %r11
-
- rfid /* update MSR and start kernel */
-
-
- .balign 8
- .globl kernel
-kernel:
- .8byte 0x0
- .size kernel, . - kernel
-
- .balign 8
- .globl dt_offset
-dt_offset:
- .8byte 0x0
- .size dt_offset, . - dt_offset
-
-
- .data
- .balign 8
-.globl purgatory_sha256_digest
-purgatory_sha256_digest:
- .skip 32
- .size purgatory_sha256_digest, . - purgatory_sha256_digest
-
- .balign 8
-.globl purgatory_sha_regions
-purgatory_sha_regions:
- .skip 8 * 2 * 16
- .size purgatory_sha_regions, . - purgatory_sha_regions
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
new file mode 100644
index 0000000..a5a83c3
--- /dev/null
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * kexec trampoline
+ *
+ * Based on code taken from kexec-tools and kexec-lite.
+ *
+ * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
+ * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
+ * Copyright (C) 2013, Anton Blanchard, IBM Corporation
+ */
+
+#include <asm/asm-compat.h>
+
+ .machine ppc64
+ .balign 256
+ .globl purgatory_start
+purgatory_start:
+ b master
+
+ /* ABI: possible run_at_load flag at 0x5c */
+ .org purgatory_start + 0x5c
+ .globl run_at_load
+run_at_load:
+ .long 0
+ .size run_at_load, . - run_at_load
+
+ /* ABI: slaves start at 60 with r3=phys */
+ .org purgatory_start + 0x60
+slave:
+ b .
+ /* ABI: end of copied region */
+ .org purgatory_start + 0x100
+ .size purgatory_start, . - purgatory_start
+
+/*
+ * The above 0x100 bytes at purgatory_start are replaced with the
+ * code from the kernel (or next stage) by setup_purgatory().
+ */
+
+/*
+ * master: reached through the branch at purgatory_start. Saves the incoming
+ * r3/r4, spins briefly, finds its own address with bl/mflr, writes the cpu id
+ * into the device tree header when its version is 2 or later, copies the
+ * run_at_load flag into the kernel image at offset 0x5c and jumps to the
+ * kernel with r3 = device tree address and r5 = 0. When MSR_LE is set, the
+ * jump goes through rfid with MSR_LE cleared.
+ */
+master:
+ or %r1,%r1,%r1 /* low priority to let other threads catchup */
+ isync
+ mr %r17,%r3 /* save cpu id to r17 */
+ mr %r15,%r4 /* save physical address in reg15 */
+
+ or %r3,%r3,%r3 /* ok now to high priority, lets boot */
+ lis %r6,0x1
+ mtctr %r6 /* delay a bit for slaves to catch up */
+ bdnz . /* before we overwrite 0-100 again */
+
+ bl 0f /* Work out where we're running */
+0: mflr %r18
+
+ /* r18 now holds the address of label 0; data below is addressed relative to it */
+ /* load device-tree address */
+ ld %r3, (dt_offset - 0b)(%r18)
+ mr %r16,%r3 /* save dt address in reg16 */
+ li %r4,20
+ LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */
+ cmpwi %cr0,%r6,2 /* v2 or later? */
+ blt 1f
+ li %r4,28
+ STWX_BE %r17,%r3,%r4 /* Store my cpu as __be32 at byte 28 */
+1:
+ /* load the kernel address */
+ ld %r4,(kernel - 0b)(%r18)
+
+ /* load the run_at_load flag */
+ /* possibly patched by kexec */
+ /* NOTE(review): ld reads 8 bytes but run_at_load is declared as a 4-byte .long, and stw stores only the low word of r6 - confirm the intended word is copied */
+ ld %r6,(run_at_load - 0b)(%r18)
+ /* and patch it into the kernel */
+ stw %r6,(0x5c)(%r4)
+
+ mr %r3,%r16 /* restore dt address */
+
+ li %r5,0 /* r5 will be 0 for kernel */
+
+ mfmsr %r11
+ andi. %r10,%r11,1 /* test MSR_LE */
+ bne .Little_endian
+
+ mtctr %r4 /* prepare branch to */
+ bctr /* start kernel */
+
+.Little_endian:
+ mtsrr0 %r4 /* prepare branch to */
+
+ clrrdi %r11,%r11,1 /* clear MSR_LE */
+ mtsrr1 %r11
+
+ rfid /* update MSR and start kernel */
+
+
+ .balign 8
+ .globl kernel
+kernel:
+ .8byte 0x0
+ .size kernel, . - kernel
+
+ .balign 8
+ .globl dt_offset
+dt_offset:
+ .8byte 0x0
+ .size dt_offset, . - dt_offset
+
+
+ .data
+ .balign 8
+.globl purgatory_sha256_digest
+purgatory_sha256_digest:
+ .skip 32
+ .size purgatory_sha256_digest, . - purgatory_sha256_digest
+
+ .balign 8
+.globl purgatory_sha_regions
+purgatory_sha_regions:
+ .skip 8 * 2 * 16
+ .size purgatory_sha_regions, . - purgatory_sha_regions
^ permalink raw reply related
* [PATCH v3 06/12] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Hari Bathini @ 2020-07-13 17:22 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>
Kdump kernel, used for capturing the kernel core image, is supposed
to use only specific memory regions to avoid corrupting the image to
be captured. The regions are crashkernel range - the memory reserved
explicitly for kdump kernel, memory used for the tce-table, the OPAL
region and RTAS region as applicable. Restrict kdump kernel memory
to use only these regions by setting up usable-memory DT property.
Also, tell the kdump kernel to run at the loaded address by setting
the magic word at 0x5c.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* Fixed off-by-one error while setting up usable-memory properties.
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
the new prototype for these functions.
arch/powerpc/kexec/file_load_64.c | 401 +++++++++++++++++++++++++++++++++++++
1 file changed, 399 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 7673481..1c4e3eb 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -17,10 +17,22 @@
#include <linux/kexec.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <linux/of_device.h>
#include <linux/memblock.h>
+#include <linux/slab.h>
+#include <asm/drmem.h>
#include <asm/kexec_ranges.h>
#include <asm/crashdump-ppc64.h>
+/* State used while building the data for usable-memory properties */
+struct umem_info {
+ uint64_t *buf; /* data buffer for usable-memory property */
+ uint32_t idx; /* index of the next free uint64_t entry in buf */
+ uint32_t size; /* size, in bytes, allocated for the data buffer */
+
+ /* usable memory ranges to look up */
+ const struct crash_mem *umrngs;
+};
+
const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_elf64_ops,
NULL
@@ -76,6 +88,38 @@ static int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
}
/**
+ * get_usable_memory_ranges - Collect the memory ranges the kdump kernel may
+ *                            use: the region from 0 up to the end of the
+ *                            crashkernel range, plus the RTAS, OPAL and
+ *                            tce-table ranges.
+ * @mem_ranges:               Range list to add the memory ranges to.
+ *
+ * Stops at the first helper that fails.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
+{
+	int ret;
+
+	/* First memory block & crashkernel region */
+	ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
+
+	/* Each further step runs only if everything before it succeeded */
+	if (!ret)
+		ret = add_rtas_mem_range(mem_ranges);
+	if (!ret)
+		ret = add_opal_mem_range(mem_ranges);
+	if (!ret)
+		ret = add_tce_mem_ranges(mem_ranges);
+
+	if (ret)
+		pr_err("Failed to setup usable memory ranges\n");
+	return ret;
+}
+
+/**
* __locate_mem_hole_top_down - Looks top down for a large enough memory hole
* in the memory regions between buf_min & buf_max
* for the buffer. If found, sets kbuf->mem.
@@ -261,6 +305,322 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf,
}
/**
+ * check_realloc_usable_mem - Reallocate buffer if it can't accommodate entries
+ * @um_info:                  Usable memory buffer and ranges info.
+ * @cnt:                      No. of entries to accommodate.
+ *
+ * Grows the buffer by MEM_RANGE_CHUNK_SZ bytes when it cannot hold @cnt more
+ * entries beyond um_info->idx. um_info->size is updated on success, but
+ * um_info->buf is not: the caller must store the returned pointer.
+ *
+ * Returns the (possibly reallocated) buffer on success, NULL on error.
+ */
+static uint64_t *check_realloc_usable_mem(struct umem_info *um_info, int cnt)
+{
+	void *tbuf;
+
+	if (um_info->size >=
+	    ((um_info->idx + cnt) * sizeof(*(um_info->buf))))
+		return um_info->buf;
+
+	tbuf = krealloc(um_info->buf, um_info->size + MEM_RANGE_CHUNK_SZ,
+			GFP_KERNEL);
+	if (!tbuf)
+		return NULL;
+
+	/*
+	 * Zero only the chunk that was just added. It starts at byte offset
+	 * um_info->size (the old allocation size); offsetting by the entry
+	 * index instead would wipe entries that are already filled in.
+	 */
+	memset((char *)tbuf + um_info->size, 0, MEM_RANGE_CHUNK_SZ);
+	um_info->size += MEM_RANGE_CHUNK_SZ;
+	return tbuf;
+}
+
+/**
+ * add_usable_mem - Add the usable memory ranges within the given memory range
+ * to the buffer
+ * @um_info: Usable memory buffer and ranges info.
+ * @base: Base address of memory range to look for.
+ * @end: End address of memory range to look for.
+ * @cnt: No. of usable memory ranges added to buffer.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_usable_mem(struct umem_info *um_info, uint64_t base,
+			  uint64_t end, int *cnt)
+{
+	const struct crash_mem *usable = um_info->umrngs;
+	uint64_t first, last, *entries;
+	int n;
+
+	*cnt = 0;
+	for (n = 0; n < usable->nr_ranges; n++) {
+		first = usable->ranges[n].start;
+		last = usable->ranges[n].end;
+
+		/* Skip ranges neither inside nor overlapping [base, end] */
+		if (!(first >= base && last <= end) &&
+		    !(base < last && end > first))
+			continue;
+
+		/* Trim to the window; a no-op for fully contained ranges */
+		if (first < base)
+			first = base;
+		if (last > end)
+			last = end;
+
+		/* Every range is stored as a big-endian (base, size) pair */
+		entries = check_realloc_usable_mem(um_info, 2);
+		if (!entries)
+			return -ENOMEM;
+
+		um_info->buf = entries;
+		entries[um_info->idx++] = cpu_to_be64(first);
+		entries[um_info->idx++] = cpu_to_be64(last - first + 1);
+		(*cnt)++;
+	}
+
+	return 0;
+}
+
+/**
+ * kdump_setup_usable_lmb - This is a callback function that gets called by
+ * walk_drmem_lmbs for every LMB to set its
+ * usable memory ranges.
+ * @lmb: LMB info.
+ * @usm: linux,drconf-usable-memory property value.
+ * @data: Pointer to usable memory buffer and ranges info.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int kdump_setup_usable_lmb(struct drmem_lmb *lmb, const __be32 **usm,
+				  void *data)
+{
+	struct umem_info *info = data;
+	uint64_t first, last, *entries;
+	int nr_added, count_slot, rc;
+
+	/*
+	 * A non-NULL *usm means the running kernel itself was booted with
+	 * linux,drconf-usable-memory; kdump load is not supported then.
+	 */
+	if (*usm) {
+		pr_err("Trying kdump load from a kdump kernel?\n");
+		return -EINVAL;
+	}
+
+	/* Reserve one entry ahead of this LMB's ranges to hold their count */
+	count_slot = info->idx;
+	entries = check_realloc_usable_mem(info, 1);
+	if (!entries)
+		return -ENOMEM;
+
+	info->buf = entries;
+	info->idx++;
+
+	first = lmb->base_addr;
+	last = first + drmem_lmb_size() - 1;
+	rc = add_usable_mem(info, first, last, &nr_added);
+	if (rc)
+		return rc;
+
+	/* Fill in the reserved slot now that the count is known */
+	info->buf[count_slot] = cpu_to_be64(nr_added);
+	return 0;
+}
+
+/**
+ * get_node_path - Get the full path of the given node.
+ * @dn:            Node.
+ * @path:          Buffer the path is appended to. It must contain a
+ *                 NUL-terminated string (empty for a fresh path) and have
+ *                 room for get_node_pathlen(@dn) bytes.
+ *
+ * Returns nothing.
+ */
+static void get_node_path(struct device_node *dn, char *path)
+{
+	if (!dn)
+		return;
+
+	/* Emit the ancestors first so components appear root-to-leaf */
+	get_node_path(dn->parent, path);
+
+	/*
+	 * Append this component after what the ancestors wrote. Writing at
+	 * @path itself would overwrite them and leave only the last name.
+	 */
+	sprintf(path + strlen(path), "/%s", dn->full_name);
+}
+
+/**
+ * get_node_pathlen - Get the full path length of the given node.
+ * @dn: Node.
+ *
+ * Returns the length of the full path of the node.
+ */
+static int get_node_pathlen(struct device_node *dn)
+{
+	/* Start at one to account for the terminating NUL */
+	int total = 1;
+
+	/* Each level contributes its name plus one separator character */
+	for (; dn; dn = dn->parent)
+		total += strlen(dn->full_name) + 1;
+
+	return total;
+}
+
+/**
+ * add_usable_mem_property - Add usable memory property for the given
+ *                           memory node.
+ * @fdt:                     Flattened device tree for the kdump kernel.
+ * @dn:                      Memory node.
+ * @um_info:                 Usable memory buffer and ranges info.
+ *
+ * Walks every (addr,size) duple of the node's "reg" property, collects the
+ * usable ranges that fall within it and stores them as the node's
+ * linux,usable-memory property in @fdt. A node without a "reg" property is
+ * left untouched.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int add_usable_mem_property(void *fdt, struct device_node *dn,
+				   struct umem_info *um_info)
+{
+	int n_mem_addr_cells, n_mem_size_cells, node;
+	int i, len, ranges, cnt, ret;
+	uint64_t base, end, *buf;
+	const __be32 *prop;
+	char *pathname;
+
+	/* Allocate memory for node path */
+	pathname = kzalloc(ALIGN(get_node_pathlen(dn), 8), GFP_KERNEL);
+	if (!pathname)
+		return -ENOMEM;
+
+	/* Get the full path of the memory node */
+	get_node_path(dn, pathname);
+	pr_debug("Memory node path: %s\n", pathname);
+
+	/* Now that we know the path, find its offset in kdump kernel's fdt */
+	node = fdt_path_offset(fdt, pathname);
+	if (node < 0) {
+		pr_err("Malformed device tree: error reading %s\n",
+		       pathname);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Get the address & size cells */
+	n_mem_addr_cells = of_n_addr_cells(dn);
+	n_mem_size_cells = of_n_size_cells(dn);
+	pr_debug("address cells: %d, size cells: %d\n", n_mem_addr_cells,
+		 n_mem_size_cells);
+
+	/* The buffer is reused across nodes: restart at entry 0 */
+	um_info->idx = 0;
+	buf = check_realloc_usable_mem(um_info, 2);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	um_info->buf = buf;
+
+	prop = of_get_property(dn, "reg", &len);
+	if (!prop || len <= 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * "reg" property represents sequence of (addr,size) duples
+	 * each representing a memory range.
+	 */
+	ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+	for (i = 0; i < ranges; i++) {
+		base = of_read_number(prop, n_mem_addr_cells);
+		prop += n_mem_addr_cells;
+		end = base + of_read_number(prop, n_mem_size_cells) - 1;
+		/*
+		 * Step past the size cells too, so the next iteration starts
+		 * at the next duple's address rather than at this size.
+		 */
+		prop += n_mem_size_cells;
+
+		ret = add_usable_mem(um_info, base, end, &cnt);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * No kdump kernel usable memory found in this memory node.
+	 * Write (0,0) duple in linux,usable-memory property for
+	 * this region to be ignored.
+	 */
+	if (um_info->idx == 0) {
+		um_info->buf[0] = 0;
+		um_info->buf[1] = 0;
+		um_info->idx = 2;
+	}
+
+	ret = fdt_setprop(fdt, node, "linux,usable-memory", um_info->buf,
+			  (um_info->idx * sizeof(*(um_info->buf))));
+
+out:
+	kfree(pathname);
+	return ret;
+}
+
+
+/**
+ * update_usable_mem_fdt - Updates kdump kernel's fdt with linux,usable-memory
+ *                         and linux,drconf-usable-memory DT properties as
+ *                         appropriate to restrict its memory usage.
+ * @fdt:                   Flattened device tree for the kdump kernel.
+ * @usable_mem:            Usable memory ranges for kdump kernel.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
+{
+	struct umem_info um_info;
+	struct device_node *dn;
+	int node, ret = 0;
+
+	if (!usable_mem) {
+		pr_err("Usable memory ranges for kdump kernel not found\n");
+		return -ENOENT;
+	}
+
+	node = fdt_path_offset(fdt, "/ibm,dynamic-reconfiguration-memory");
+	if (node == -FDT_ERR_NOTFOUND)
+		pr_debug("No dynamic reconfiguration memory found\n");
+	else if (node < 0) {
+		pr_err("Malformed device tree: error reading /ibm,dynamic-reconfiguration-memory.\n");
+		return -EINVAL;
+	}
+
+	um_info.size = 0;
+	um_info.idx  = 0;
+	um_info.buf  = NULL;
+	um_info.umrngs = usable_mem;
+
+	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+	if (dn) {
+		ret = walk_drmem_lmbs(dn, &um_info, kdump_setup_usable_lmb);
+		of_node_put(dn);
+
+		if (ret)
+			goto out;
+
+		ret = fdt_setprop(fdt, node, "linux,drconf-usable-memory",
+				  um_info.buf,
+				  (um_info.idx * sizeof(*(um_info.buf))));
+		if (ret) {
+			pr_err("Failed to set linux,drconf-usable-memory property\n");
+			goto out;
+		}
+	}
+
+	/*
+	 * Walk through each memory node and set linux,usable-memory property
+	 * for the corresponding node in kdump kernel's fdt.
+	 */
+	for_each_node_by_type(dn, "memory") {
+		ret = add_usable_mem_property(fdt, dn, &um_info);
+		if (ret) {
+			pr_err("Failed to set linux,usable-memory property for %s node\n",
+			       dn->full_name);
+			/*
+			 * Leaving the iterator early: drop the reference it
+			 * still holds on the current node.
+			 */
+			of_node_put(dn);
+			goto out;
+		}
+	}
+
+out:
+	kfree(um_info.buf);
+	return ret;
+}
+
+/**
* setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
* variables and call setup_purgatory() to initialize
* common global variable.
@@ -281,6 +641,25 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
fdt_load_addr);
if (ret)
+ goto out;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ uint32_t my_run_at_load = 1;
+
+ /*
+ * Tell relocatable kernel to run at load address
+ * via the word meant for that at 0x5c.
+ */
+ ret = kexec_purgatory_get_set_symbol(image, "run_at_load",
+ &my_run_at_load,
+ sizeof(my_run_at_load),
+ false);
+ if (ret)
+ goto out;
+ }
+
+out:
+ if (ret)
pr_err("Failed to setup purgatory symbols");
return ret;
}
@@ -301,6 +680,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr,
unsigned long initrd_len, const char *cmdline)
{
+ struct crash_mem *umem = NULL;
int chosen_node, ret;
/* Remove memory reservation for the current device tree. */
@@ -313,15 +693,32 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
return ret;
}
+ /*
+ * Restrict memory usage for kdump kernel by setting up
+ * usable memory ranges.
+ */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = get_usable_memory_ranges(&umem);
+ if (ret)
+ goto out;
+
+ ret = update_usable_mem_fdt(fdt, umem);
+ if (ret) {
+ pr_err("Error setting up usable-memory property for kdump kernel\n");
+ goto out;
+ }
+ }
+
ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
cmdline, &chosen_node);
if (ret)
- return ret;
+ goto out;
ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
if (ret)
pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
-
+out:
+ kfree(umem);
return ret;
}
^ permalink raw reply related
* [PATCH v3 03/12] powerpc/kexec_file: add helper functions for getting memory ranges
From: Hari Bathini @ 2020-07-13 17:21 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>
In kexec case, the kernel to be loaded uses the same memory layout as
the running kernel. So, passing on the DT of the running kernel would
be good enough.
But in case of kdump, different memory ranges are needed to manage
loading the kdump kernel, booting into it and exporting the elfcore
of the crashing kernel. The ranges are exclude memory ranges, usable
memory ranges, reserved memory ranges and crash memory ranges.
Exclude memory ranges specify the list of memory ranges to avoid while
loading kdump segments. Usable memory ranges list the memory ranges
that could be used for booting kdump kernel. Reserved memory ranges
list the memory regions for the loading kernel's reserve map. Crash
memory ranges list the memory ranges to be exported as the crashing
kernel's elfcore.
Add helper functions for setting up the above mentioned memory ranges.
These helpers make the subsequent changes easier to understand
and make it easy to set up the different memory ranges listed above, as
and when appropriate.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* Added an option to merge ranges while sorting to minimize reallocations
for memory ranges list.
* Dropped within_crashkernel option for add_opal_mem_range() &
add_rtas_mem_range() as it is not really needed.
arch/powerpc/include/asm/kexec_ranges.h | 18 +
arch/powerpc/kexec/Makefile | 2
arch/powerpc/kexec/ranges.c | 397 +++++++++++++++++++++++++++++++
3 files changed, 416 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
create mode 100644 arch/powerpc/kexec/ranges.c
diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h
new file mode 100644
index 0000000..799dc40
--- /dev/null
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_KEXEC_RANGES_H
+#define _ASM_POWERPC_KEXEC_RANGES_H
+
+#define MEM_RANGE_CHUNK_SZ 2048 /* Memory ranges size chunk */
+
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int add_tce_mem_ranges(struct crash_mem **mem_ranges);
+int add_initrd_mem_range(struct crash_mem **mem_ranges);
+int add_htab_mem_range(struct crash_mem **mem_ranges);
+int add_kernel_mem_range(struct crash_mem **mem_ranges);
+int add_rtas_mem_range(struct crash_mem **mem_ranges);
+int add_opal_mem_range(struct crash_mem **mem_ranges);
+int add_reserved_ranges(struct crash_mem **mem_ranges);
+void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
+
+#endif /* _ASM_POWERPC_KEXEC_RANGES_H */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 67c3553..4aff684 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y += core.o crash.o core_$(BITS).o
obj-$(CONFIG_PPC32) += relocate_32.o
-obj-$(CONFIG_KEXEC_FILE) += file_load.o file_load_$(BITS).o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o
ifdef CONFIG_HAVE_IMA_KEXEC
ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
new file mode 100644
index 0000000..a704819
--- /dev/null
+++ b/arch/powerpc/kexec/ranges.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * powerpc code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini <hbathini@linux.ibm.com>.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+#include <asm/kexec_ranges.h>
+
+/**
+ * get_max_nr_ranges - Get the max no. of ranges crash_mem structure
+ * could hold, given the size allocated for it.
+ * @size: Allocation size of crash_mem structure.
+ *
+ * Returns the maximum no. of ranges.
+ */
+static inline unsigned int get_max_nr_ranges(size_t size)
+{
+	/* Bytes left for range entries once the crash_mem header is taken out */
+	size_t payload = size - sizeof(struct crash_mem);
+
+	return payload / sizeof(struct crash_mem_range);
+}
+
+/**
+ * get_mem_rngs_size - Compute the allocation size of a memory ranges list
+ *                     from its max_nr_ranges and the chunk size.
+ * @mrngs:             Memory ranges, may be NULL.
+ *
+ * Returns the size in bytes, rounded up to a multiple of
+ * MEM_RANGE_CHUNK_SZ, or 0 when @mrngs is NULL.
+ */
+static inline size_t get_mem_rngs_size(struct crash_mem *mrngs)
+{
+	size_t bytes;
+
+	if (!mrngs)
+		return 0;
+
+	/* Header plus room for max_nr_ranges entries ... */
+	bytes = sizeof(struct crash_mem);
+	bytes += mrngs->max_nr_ranges * sizeof(struct crash_mem_range);
+
+	/* ... rounded up, since memory is handed out in whole chunks */
+	return ALIGN(bytes, MEM_RANGE_CHUNK_SZ);
+}
+
+/**
+ * __add_mem_range - add a memory range to memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ * @base: Base address of the range to add.
+ * @size: Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+	struct crash_mem *list = *mem_ranges;
+	struct crash_mem_range *slot;
+
+	/* Grow when there is no list yet or when every slot is in use */
+	if (!list || list->nr_ranges == list->max_nr_ranges) {
+		list = realloc_mem_ranges(mem_ranges);
+		if (!list)
+			return -ENOMEM;
+	}
+
+	/* Claim the next free slot; the stored end address is inclusive */
+	slot = &list->ranges[list->nr_ranges++];
+	slot->start = base;
+	slot->end = base + size - 1;
+	return 0;
+}
+
+/**
+ * __merge_memory_ranges - Merges the given memory ranges list.
+ * @mrngs:                 Range list to merge.
+ *
+ * Assumes a list sorted by start address. Overlapping and adjacent ranges
+ * are coalesced in place and nr_ranges is updated to the merged count.
+ *
+ * Returns nothing.
+ */
+static void __merge_memory_ranges(struct crash_mem *mrngs)
+{
+	struct crash_mem_range *rngs;
+	int i, idx;
+
+	/* Nothing to merge; also keeps an empty list from gaining an entry */
+	if (!mrngs || !mrngs->nr_ranges)
+		return;
+
+	idx = 0;
+	rngs = &mrngs->ranges[0];
+	for (i = 1; i < mrngs->nr_ranges; i++) {
+		/*
+		 * Compare against the range being built (rngs[idx]), not the
+		 * previous input entry: an earlier, larger range may already
+		 * extend past rngs[i-1].
+		 */
+		if (rngs[i].start <= (rngs[idx].end + 1)) {
+			/* Never shrink: rngs[i] may lie within rngs[idx] */
+			if (rngs[i].end > rngs[idx].end)
+				rngs[idx].end = rngs[i].end;
+		} else {
+			idx++;
+			if (i != idx)
+				rngs[idx] = rngs[i];
+		}
+	}
+	mrngs->nr_ranges = idx + 1;
+}
+
+/**
+ * realloc_mem_ranges - reallocate mem_ranges with size incremented
+ * by MEM_RANGE_CHUNK_SZ. Frees up the old memory,
+ * if memory allocation fails.
+ * @mem_ranges: Memory ranges to reallocate.
+ *
+ * Returns pointer to reallocated memory on success, NULL otherwise.
+ */
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges)
+{
+	struct crash_mem *old = *mem_ranges;
+	struct crash_mem *grown;
+	unsigned int used = old ? old->nr_ranges : 0;
+	size_t new_size = get_mem_rngs_size(old) + MEM_RANGE_CHUNK_SZ;
+
+	grown = krealloc(old, new_size, GFP_KERNEL);
+	if (!grown) {
+		/* Allocation failed: release the old list, clear the handle */
+		kfree(old);
+		*mem_ranges = NULL;
+		return NULL;
+	}
+
+	/* Also initializes the counters when the list is brand new */
+	grown->nr_ranges = used;
+	grown->max_nr_ranges = get_max_nr_ranges(new_size);
+	*mem_ranges = grown;
+
+	return grown;
+}
+
+/**
+ * add_mem_range - Updates existing memory range, if there is an overlap.
+ * Else, adds a new memory range.
+ * @mem_ranges: Range list to add the memory range to.
+ * @base: Base address of the range to add.
+ * @size: Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+	struct crash_mem *list = *mem_ranges;
+	struct crash_mem_range *cur;
+	unsigned int n;
+	u64 last;
+
+	if (size == 0)
+		return 0;
+
+	last = base + size - 1;
+
+	/* Widen the first existing range that overlaps [base, last] */
+	for (n = 0; list && n < list->nr_ranges; n++) {
+		cur = &list->ranges[n];
+		if (!(base < cur->end && last > cur->start))
+			continue;
+
+		if (base < cur->start)
+			cur->start = base;
+		if (last > cur->end)
+			cur->end = last;
+		return 0;
+	}
+
+	/* No list yet, an empty list, or no overlap: append a new range */
+	return __add_mem_range(mem_ranges, base, size);
+}
+
+/**
+ * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range(s) to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_tce_mem_ranges(struct crash_mem **mem_ranges)
+{
+	struct device_node *dn;
+	int ret = 0;	/* stays 0 when there are no pci nodes at all */
+
+	for_each_node_by_type(dn, "pci") {
+		u64 base;
+		u32 size;
+
+		/*
+		 * A pci node without both TCE properties has no table to
+		 * record: skip it instead of treating it as an error or
+		 * using base/size that were never filled in.
+		 */
+		if (of_property_read_u64(dn, "linux,tce-base", &base) ||
+		    of_property_read_u32(dn, "linux,tce-size", &size))
+			continue;
+
+		ret = add_mem_range(mem_ranges, base, size);
+		if (ret) {
+			/* Leaving the iterator early: drop its reference */
+			of_node_put(dn);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
+ * if the initrd was retained.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_initrd_mem_range(struct crash_mem **mem_ranges)
+{
+	u64 first, last;
+	int rc;
+
+	/* The initrd range only matters when "retain_initrd" was given */
+	if (!strstr(saved_command_line, "retain_initrd"))
+		return 0;
+
+	rc = of_property_read_u64(of_chosen, "linux,initrd-start", &first);
+	rc |= of_property_read_u64(of_chosen, "linux,initrd-end", &last);
+	if (rc)
+		return rc;
+
+	return add_mem_range(mem_ranges, first, last - first + 1);
+}
+
+/**
+ * add_htab_mem_range - Adds htab range to the given memory ranges list,
+ * if it exists
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_htab_mem_range(struct crash_mem **mem_ranges)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Only record the hash table when one has been set up */
+	if (htab_address)
+		return add_mem_range(mem_ranges, __pa(htab_address),
+				     htab_size_bytes);
+#endif
+	return 0;
+}
+
+/**
+ * add_kernel_mem_range - Adds kernel text region to the given
+ * memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_kernel_mem_range(struct crash_mem **mem_ranges)
+{
+	/* Covers physical address 0 up to the physical address of _end */
+	return add_mem_range(mem_ranges, 0, __pa(_end));
+}
+
+/**
+ * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_rtas_mem_range(struct crash_mem **mem_ranges)
+{
+	struct device_node *dn;
+	u32 base, size;
+	int ret;
+
+	/* No /rtas node: nothing to add, which is not an error */
+	dn = of_find_node_by_path("/rtas");
+	if (!dn)
+		return 0;
+
+	ret = of_property_read_u32(dn, "linux,rtas-base", &base);
+	ret |= of_property_read_u32(dn, "rtas-size", &size);
+	if (!ret)
+		ret = add_mem_range(mem_ranges, base, size);
+
+	/* Drop the reference taken by of_find_node_by_path() on every path */
+	of_node_put(dn);
+	return ret;
+}
+
+/**
+ * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_opal_mem_range(struct crash_mem **mem_ranges)
+{
+	struct device_node *dn;
+	u64 base, size;
+	int ret;
+
+	/* No /ibm,opal node: nothing to add, which is not an error */
+	dn = of_find_node_by_path("/ibm,opal");
+	if (!dn)
+		return 0;
+
+	ret = of_property_read_u64(dn, "opal-base-address", &base);
+	ret |= of_property_read_u64(dn, "opal-runtime-size", &size);
+	if (!ret)
+		ret = add_mem_range(mem_ranges, base, size);
+
+	/* Drop the reference taken by of_find_node_by_path() on every path */
+	of_node_put(dn);
+	return ret;
+}
+
+/**
+ * add_reserved_ranges - Adds "/reserved-ranges" regions exported by f/w
+ * to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_reserved_ranges(struct crash_mem **mem_ranges)
+{
+	const __be32 *cells;
+	int n, nbytes, rc = 0;
+
+	cells = of_get_property(of_root, "reserved-ranges", &nbytes);
+	if (!cells)
+		return 0;
+
+	/*
+	 * The property is read as a run of 4-cell entries: a 2-cell address
+	 * followed by a 2-cell size.
+	 */
+	for (n = 0; n < nbytes / (sizeof(*cells) * 4); n++) {
+		const __be32 *entry = cells + (n * 4);
+		u64 start = of_read_number(entry, 2);
+		u64 length = of_read_number(entry + 2, 2);
+
+		rc = add_mem_range(mem_ranges, start, length);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
+/**
+ * sort_memory_ranges - Sorts the given memory ranges list by start address.
+ * @mrngs:              Range list to sort, may be NULL.
+ * @merge:              If true, merge the list after sorting.
+ *
+ * Returns nothing.
+ */
+void sort_memory_ranges(struct crash_mem *mrngs, bool merge)
+{
+	struct crash_mem_range *r, tmp;
+	int pos, scan, lowest;
+
+	if (!mrngs)
+		return;
+
+	/* In-place selection sort keyed on the start address */
+	r = mrngs->ranges;
+	for (pos = 0; pos < mrngs->nr_ranges; pos++) {
+		lowest = pos;
+		for (scan = pos + 1; scan < mrngs->nr_ranges; scan++) {
+			if (r[scan].start < r[lowest].start)
+				lowest = scan;
+		}
+
+		/* Already in place: nothing to swap */
+		if (lowest == pos)
+			continue;
+
+		tmp = r[lowest];
+		r[lowest] = r[pos];
+		r[pos] = tmp;
+	}
+
+	if (merge)
+		__merge_memory_ranges(mrngs);
+}
^ permalink raw reply related
* [PATCH v3 01/12] kexec_file: allow archs to handle special regions while locating memory hole
From: Hari Bathini @ 2020-07-13 17:20 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: kernel test robot, Pingfan Liu, Kexec-ml, Nayna Jain,
Petr Tesarik, Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev,
Sourabh Jain, Vivek Goyal, Dave Young, Thiago Jung Bauermann,
Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>
Some architectures may have special memory regions, within the given
memory range, which can't be used for the buffer in a kexec segment.
Implement weak arch_kexec_locate_mem_hole() definition which arch code
may override, to take care of special regions, while trying to locate
a memory hole.
Also, add the missing declarations for arch overridable functions and
drop the __weak descriptors in the declarations to avoid non-weak
definitions from becoming weak.
Reported-by: kernel test robot <lkp@intel.com>
[lkp: In v1, arch_kimage_file_post_load_cleanup() declaration was missing]
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Acked-by: Dave Young <dyoung@redhat.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---
v2 -> v3:
* Unchanged. Added Acked-by & Tested-by tags from Dave & Pingfan.
v1 -> v2:
* Introduced arch_kexec_locate_mem_hole() for override and dropped
weak arch_kexec_add_buffer().
* Dropped __weak identifier for arch overridable functions.
* Fixed the missing declaration for arch_kimage_file_post_load_cleanup()
reported by lkp. lkp report for reference:
- https://lore.kernel.org/patchwork/patch/1264418/
include/linux/kexec.h | 29 ++++++++++++++++++-----------
kernel/kexec_file.c | 16 ++++++++++++++--
2 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ea67910..9e93bef 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -183,17 +183,24 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
bool get_value);
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len);
-void * __weak arch_kexec_kernel_image_load(struct kimage *image);
-int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
- Elf_Shdr *section,
- const Elf_Shdr *relsec,
- const Elf_Shdr *symtab);
-int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
- Elf_Shdr *section,
- const Elf_Shdr *relsec,
- const Elf_Shdr *symtab);
+/* Architectures may override the below functions */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len);
+void *arch_kexec_kernel_image_load(struct kimage *image);
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+int arch_kexec_apply_relocations(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#ifdef CONFIG_KEXEC_SIG
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len);
+#endif
+int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78d..e89912d 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -636,6 +636,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
}
/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf: Parameters for the memory search.
+ *
+ * This __weak default simply defers to kexec_locate_mem_hole(); a non-weak
+ * definition of the same name replaces it at link time.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ return kexec_locate_mem_hole(kbuf);
+}
+
+/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
*
@@ -647,7 +660,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
*/
int kexec_add_buffer(struct kexec_buf *kbuf)
{
-
struct kexec_segment *ksegment;
int ret;
@@ -675,7 +687,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = kexec_locate_mem_hole(kbuf);
+ ret = arch_kexec_locate_mem_hole(kbuf);
if (ret)
return ret;
^ permalink raw reply related
* [PATCH v3 00/12] ppc64: enable kdump support for kexec_file_load syscall
From: Hari Bathini @ 2020-07-13 17:20 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
This patch series enables kdump support for kexec_file_load system
call (kexec -s -p) on PPC64. The changes are inspired from kexec-tools
code but heavily modified for kernel consumption. There is scope to
expand purgatory to verify sha256 digest along with other improvements
in purgatory code. Will deal with those changes in a separate patch
series later.
The first patch adds a weak arch_kexec_locate_mem_hole() function to
override locate memory hole logic suiting arch needs. There are some
special regions in ppc64 which should be avoided while loading buffer
& there are multiple callers to kexec_add_buffer making it complicated
to maintain range sanity and using generic lookup at the same time.
The second patch marks ppc64 specific code within arch/powerpc/kexec
and arch/powerpc/purgatory to make the subsequent code changes easy
to understand.
The next patch adds helper function to setup different memory ranges
needed for loading kdump kernel, booting into it and exporting the
crashing kernel's elfcore.
The fourth patch overrides arch_kexec_locate_mem_hole() function to
locate memory hole for kdump segments by accounting for the special
memory regions, referred to as excluded memory ranges, and sets
kbuf->mem when a suitable memory region is found.
The fifth patch moves walk_drmem_lmbs() out of .init section with
a few changes to reuse it for setting up kdump kernel's usable memory
ranges. The next patch uses walk_drmem_lmbs() to look up the LMBs
and set linux,drconf-usable-memory & linux,usable-memory properties
in order to restrict kdump kernel's memory usage.
The seventh patch adds relocation support for the purgatory. Patch 8
helps set up the stack for the purgatory. The next patch sets up the
backup region as a segment while loading kdump kernel and teaches
purgatory to copy it from source to destination.
Patch 10 builds the elfcore header for the running kernel & passes
the info to kdump kernel via "elfcorehdr=" parameter to export as
/proc/vmcore file. The next patch sets up the memory reserve map
for the kexec kernel and also claims kdump support for kdump as
all the necessary changes are added.
The last patch fixes a lookup issue for `kexec -l -s` case when
memory is reserved for crashkernel.
Tested the changes successfully on P8, P9 lpars, couple of OpenPOWER
boxes, one with secureboot enabled and a simulator.
v2 -> v3:
* Fixed TOC pointer calculation for purgatory by using section info
that has relocations applied.
* Fixed arch_kexec_locate_mem_hole() function to fallback to generic
kexec_locate_mem_hole() lookup if exclude ranges list is empty.
* Dropped check for backup_start in trampoline_64.S as purgatory()
function takes care of it anyway.
v1 -> v2:
* Introduced arch_kexec_locate_mem_hole() for override and dropped
weak arch_kexec_add_buffer().
* Addressed warnings reported by lkp.
* Added patch to address kexec load issue when memory is reserved
for crashkernel.
* Used the appropriate license header for the new files added.
* Added an option to merge ranges to minimize reallocations while
adding memory ranges.
* Dropped within_crashkernel parameter for add_opal_mem_range() &
add_rtas_mem_range() functions as it is not really needed.
---
Hari Bathini (12):
kexec_file: allow archs to handle special regions while locating memory hole
powerpc/kexec_file: mark PPC64 specific code
powerpc/kexec_file: add helper functions for getting memory ranges
ppc64/kexec_file: avoid stomping memory used by special regions
powerpc/drmem: make lmb walk a bit more flexible
ppc64/kexec_file: restrict memory usage of kdump kernel
ppc64/kexec_file: add support to relocate purgatory
ppc64/kexec_file: setup the stack for purgatory
ppc64/kexec_file: setup backup region for kdump kernel
ppc64/kexec_file: prepare elfcore header for crashing kernel
ppc64/kexec_file: add appropriate regions for memory reserve map
ppc64/kexec_file: fix kexec load failure with lack of memory hole
arch/powerpc/include/asm/crashdump-ppc64.h | 15
arch/powerpc/include/asm/drmem.h | 9
arch/powerpc/include/asm/kexec.h | 35 +
arch/powerpc/include/asm/kexec_ranges.h | 18
arch/powerpc/include/asm/purgatory.h | 11
arch/powerpc/kernel/prom.c | 13
arch/powerpc/kexec/Makefile | 2
arch/powerpc/kexec/elf_64.c | 35 +
arch/powerpc/kexec/file_load.c | 78 +
arch/powerpc/kexec/file_load_64.c | 1508 ++++++++++++++++++++++++++++
arch/powerpc/kexec/ranges.c | 397 +++++++
arch/powerpc/mm/drmem.c | 87 +-
arch/powerpc/mm/numa.c | 13
arch/powerpc/purgatory/Makefile | 28 -
arch/powerpc/purgatory/purgatory_64.c | 36 +
arch/powerpc/purgatory/trampoline.S | 117 --
arch/powerpc/purgatory/trampoline_64.S | 170 +++
include/linux/kexec.h | 29 -
kernel/kexec_file.c | 16
19 files changed, 2407 insertions(+), 210 deletions(-)
create mode 100644 arch/powerpc/include/asm/crashdump-ppc64.h
create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
create mode 100644 arch/powerpc/include/asm/purgatory.h
create mode 100644 arch/powerpc/kexec/file_load_64.c
create mode 100644 arch/powerpc/kexec/ranges.c
create mode 100644 arch/powerpc/purgatory/purgatory_64.c
delete mode 100644 arch/powerpc/purgatory/trampoline.S
create mode 100644 arch/powerpc/purgatory/trampoline_64.S
^ permalink raw reply
* [PATCH v3 00/12] ppc64: enable kdump support for kexec_file_load syscall
From: Hari Bathini @ 2020-07-13 17:18 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
This patch series enables kdump support for kexec_file_load system
call (kexec -s -p) on PPC64. The changes are inspired from kexec-tools
code but heavily modified for kernel consumption. There is scope to
expand purgatory to verify sha256 digest along with other improvements
in purgatory code. Will deal with those changes in a separate patch
series later.
The first patch adds a weak arch_kexec_locate_mem_hole() function to
override locate memory hole logic suiting arch needs. There are some
special regions in ppc64 which should be avoided while loading buffer
& there are multiple callers to kexec_add_buffer making it complicated
to maintain range sanity and using generic lookup at the same time.
The second patch marks ppc64 specific code within arch/powerpc/kexec
and arch/powerpc/purgatory to make the subsequent code changes easy
to understand.
The next patch adds helper function to setup different memory ranges
needed for loading kdump kernel, booting into it and exporting the
crashing kernel's elfcore.
The fourth patch overrides arch_kexec_locate_mem_hole() function to
locate memory hole for kdump segments by accounting for the special
memory regions, referred to as excluded memory ranges, and sets
kbuf->mem when a suitable memory region is found.
The fifth patch moves walk_drmem_lmbs() out of .init section with
a few changes to reuse it for setting up kdump kernel's usable memory
ranges. The next patch uses walk_drmem_lmbs() to look up the LMBs
and set linux,drconf-usable-memory & linux,usable-memory properties
in order to restrict kdump kernel's memory usage.
The seventh patch adds relocation support for the purgatory. Patch 8
helps set up the stack for the purgatory. The next patch sets up
backup region as a segment while loading kdump kernel and teaches
purgatory to copy it from source to destination.
Patch 10 builds the elfcore header for the running kernel & passes
the info to kdump kernel via "elfcorehdr=" parameter to export as
/proc/vmcore file. The next patch sets up the memory reserve map
for the kexec kernel and also claims kdump support for kdump as
all the necessary changes are added.
The last patch fixes a lookup issue for `kexec -l -s` case when
memory is reserved for crashkernel.
---
Hari Bathini (12):
kexec_file: allow archs to handle special regions while locating memory hole
powerpc/kexec_file: mark PPC64 specific code
powerpc/kexec_file: add helper functions for getting memory ranges
ppc64/kexec_file: avoid stomping memory used by special regions
powerpc/drmem: make lmb walk a bit more flexible
ppc64/kexec_file: restrict memory usage of kdump kernel
ppc64/kexec_file: add support to relocate purgatory
ppc64/kexec_file: setup the stack for purgatory
ppc64/kexec_file: setup backup region for kdump kernel
ppc64/kexec_file: prepare elfcore header for crashing kernel
ppc64/kexec_file: add appropriate regions for memory reserve map
ppc64/kexec_file: fix kexec load failure with lack of memory hole
arch/powerpc/include/asm/crashdump-ppc64.h | 15
arch/powerpc/include/asm/drmem.h | 9
arch/powerpc/include/asm/kexec.h | 35 +
arch/powerpc/include/asm/kexec_ranges.h | 18
arch/powerpc/include/asm/purgatory.h | 11
arch/powerpc/kernel/prom.c | 13
arch/powerpc/kexec/Makefile | 2
arch/powerpc/kexec/elf_64.c | 35 +
arch/powerpc/kexec/file_load.c | 78 +
arch/powerpc/kexec/file_load_64.c | 1508 ++++++++++++++++++++++++++++
arch/powerpc/kexec/ranges.c | 397 +++++++
arch/powerpc/mm/drmem.c | 87 +-
arch/powerpc/mm/numa.c | 13
arch/powerpc/purgatory/Makefile | 28 -
arch/powerpc/purgatory/purgatory_64.c | 36 +
arch/powerpc/purgatory/trampoline.S | 117 --
arch/powerpc/purgatory/trampoline_64.S | 170 +++
include/linux/kexec.h | 29 -
kernel/kexec_file.c | 16
19 files changed, 2407 insertions(+), 210 deletions(-)
create mode 100644 arch/powerpc/include/asm/crashdump-ppc64.h
create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
create mode 100644 arch/powerpc/include/asm/purgatory.h
create mode 100644 arch/powerpc/kexec/file_load_64.c
create mode 100644 arch/powerpc/kexec/ranges.c
create mode 100644 arch/powerpc/purgatory/purgatory_64.c
delete mode 100644 arch/powerpc/purgatory/trampoline.S
create mode 100644 arch/powerpc/purgatory/trampoline_64.S
^ permalink raw reply
* Re: [PATCH v2 0/3] Power10 basic energy management
From: Nicholas Piggin @ 2020-07-13 16:58 UTC (permalink / raw)
To: ego
Cc: ravi.bangoria, mikey, pratik.r.sampat, linux-kernel,
Pratik Rajesh Sampat, paulus, linuxppc-dev
In-Reply-To: <20200713104837.GG24866@in.ibm.com>
Excerpts from Gautham R Shenoy's message of July 13, 2020 8:48 pm:
> On Mon, Jul 13, 2020 at 03:23:21PM +1000, Nicholas Piggin wrote:
>> Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
>> > Changelog v1 --> v2:
>> > 1. Save-restore DAWR and DAWRX unconditionally as they are lost in
>> > shallow idle states too
>> > 2. Rename pnv_first_spr_loss_level to pnv_first_fullstate_loss_level to
>> > correct naming terminology
>> >
>> > Pratik Rajesh Sampat (3):
>> > powerpc/powernv/idle: Exclude mfspr on HID1,4,5 on P9 and above
>> > powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
>> > powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable
>> >
>> > arch/powerpc/platforms/powernv/idle.c | 34 +++++++++++++++++----------
>> > 1 file changed, 22 insertions(+), 12 deletions(-)
>>
>> These look okay to me, but the CPU_FTR_ARCH_300 test for
>> pnv_power9_idle_init() is actually wrong, it should be a PVR test
>> because idle is not completely architected (not even shallow stop
>> states, unfortunately).
>>
>> It doesn't look like we support POWER10 idle correctly yet, and on older
>> kernels it wouldn't work even if we fixed newer, so ideally the PVR
>> check would be backported as a fix in the front of the series.
>>
>> Sadly, we have no OPAL idle driver yet. Hopefully we will before the
>> next processor shows up :P
>
> Abhishek posted a version recently :
> https://patchwork.ozlabs.org/project/skiboot/patch/20200706043533.76539-1-huntbag@linux.vnet.ibm.com/
Yep, I saw that. Still keen to get it working, just had other priorities
in the short term. We'll need to do this OPAL v4 thing for it.
Thanks,
Nick
^ permalink raw reply
* [PATCH 12/14 v3] PCI/AER: Check the return value of pcie_capability_read_*()
From: Saheed O. Bolarinwa @ 2020-07-13 17:55 UTC (permalink / raw)
To: skhan, linux-pci, linuxppc-dev, linux-kernel-mentees,
linux-kernel
Cc: Bolarinwa Olayemi Saheed
In-Reply-To: <20200713175529.29715-1-refactormyself@gmail.com>
From: Bolarinwa Olayemi Saheed <refactormyself@gmail.com>
On failure pcie_capability_read_dword() sets its last parameter,
val, to 0.
However, with Patch 14/14, it is possible that val is set to ~0 on
failure. This would introduce a bug because (x & x) == (~0 & x).
This bug can be avoided if the return value of pcie_capability_read_word
is checked to confirm success.
Check the return value of pcie_capability_read_word() to ensure success.
Suggested-by: Bjorn Helgaas <bjorn@helgaas.com>
Signed-off-by: Bolarinwa Olayemi Saheed <refactormyself@gmail.com>
---
drivers/pci/pcie/aer.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 3acf56683915..f4beb47c622c 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -800,6 +800,7 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info)
int aer = dev->aer_cap;
u32 status, mask;
u16 reg16;
+ int ret;
/*
* When bus id is equal to 0, it might be a bad id
@@ -828,8 +829,8 @@ static bool is_error_source(struct pci_dev *dev, struct aer_err_info *e_info)
return false;
/* Check if AER is enabled */
- pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &reg16);
- if (!(reg16 & PCI_EXP_AER_FLAGS))
+ ret = pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &reg16);
+ if (ret || !(reg16 & PCI_EXP_AER_FLAGS))
return false;
if (!aer)
--
2.18.2
^ permalink raw reply related
* Re: [PATCH v2 0/3] Power10 basic energy management
From: Nicholas Piggin @ 2020-07-13 16:50 UTC (permalink / raw)
To: benh, ego, linux-kernel, linuxppc-dev, mikey, mpe, paulus,
pratik.r.sampat, Pratik Sampat, ravi.bangoria, svaidy
In-Reply-To: <bc6494c0-9a17-2416-c6cc-15612020f497@linux.ibm.com>
Excerpts from Pratik Sampat's message of July 13, 2020 8:02 pm:
> Thank you for your comments,
>
> On 13/07/20 10:53 am, Nicholas Piggin wrote:
>> Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
>>> Changelog v1 --> v2:
>>> 1. Save-restore DAWR and DAWRX unconditionally as they are lost in
>>> shallow idle states too
>>> 2. Rename pnv_first_spr_loss_level to pnv_first_fullstate_loss_level to
>>> correct naming terminology
>>>
>>> Pratik Rajesh Sampat (3):
>>> powerpc/powernv/idle: Exclude mfspr on HID1,4,5 on P9 and above
>>> powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
>>> powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable
>>>
>>> arch/powerpc/platforms/powernv/idle.c | 34 +++++++++++++++++----------
>>> 1 file changed, 22 insertions(+), 12 deletions(-)
>> These look okay to me, but the CPU_FTR_ARCH_300 test for
>> pnv_power9_idle_init() is actually wrong, it should be a PVR test
>> because idle is not completely architected (not even shallow stop
>> states, unfortunately).
>>
>> It doesn't look like we support POWER10 idle correctly yet, and on older
>> kernels it wouldn't work even if we fixed newer, so ideally the PVR
>> check would be backported as a fix in the front of the series.
>>
>> Sadly, we have no OPAL idle driver yet. Hopefully we will before the
>> next processor shows up :P
>>
>> Thanks,
>> Nick
>
> So if I understand this correctly, in powernv/idle.c where we check for
> CPU_FTR_ARCH_300, we should rather be making a pvr_version_is(PVR_POWER9)
> check instead?
>
> Of course, the P10 PVR and its relevant checks will have to be added then too.
Yes I think so, unfortunately.
Thanks,
Nick
^ permalink raw reply
* [PATCH v6] ima: move APPRAISE_BOOTPARAM dependency on ARCH_POLICY to runtime
From: Bruno Meneguele @ 2020-07-13 16:48 UTC (permalink / raw)
To: linux-kernel, x86, linuxppc-dev, linux-s390, linux-integrity
Cc: erichte, Bruno Meneguele, nayna, stable, zohar
The IMA_APPRAISE_BOOTPARAM config allows enabling different "ima_appraise="
modes - log, fix, enforce - at run time, but not when IMA architecture
specific policies are enabled. This prevents properly labeling the
filesystem on systems where secure boot is supported, but not enabled on the
platform. Only when secure boot is actually enabled should these IMA
appraise modes be disabled.
This patch removes the compile time dependency and makes it a runtime
decision, based on the secure boot state of that platform.
Test results as follows:
-> x86-64 with secure boot enabled
[ 0.015637] Kernel command line: <...> ima_policy=appraise_tcb ima_appraise=fix
[ 0.015668] ima: Secure boot enabled: ignoring ima_appraise=fix boot parameter option
-> powerpc with secure boot disabled
[ 0.000000] Kernel command line: <...> ima_policy=appraise_tcb ima_appraise=fix
[ 0.000000] Secure boot mode disabled
-> Running the system without secure boot and with both options set:
CONFIG_IMA_APPRAISE_BOOTPARAM=y
CONFIG_IMA_ARCH_POLICY=y
Audit prompts "missing-hash" but still allow execution and, consequently,
filesystem labeling:
type=INTEGRITY_DATA msg=audit(07/09/2020 12:30:27.778:1691) : pid=4976
uid=root auid=root ses=2
subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 op=appraise_data
cause=missing-hash comm=bash name=/usr/bin/evmctl dev="dm-0" ino=493150
res=no
Cc: stable@vger.kernel.org
Fixes: d958083a8f64 ("x86/ima: define arch_get_ima_policy() for x86")
Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
---
v6:
- explicitly print the bootparam being ignored to the user (Mimi)
v5:
- add pr_info() to inform user the ima_appraise= boot param is being
ignored due to secure boot enabled (Nayna)
- add some testing results to commit log
v4:
- instead of change arch_policy loading code, check secure boot state at
"ima_appraise=" parameter handler (Mimi)
v3:
- extend secure boot arch checker to also consider trusted boot
- enforce IMA appraisal when secure boot is effectively enabled (Nayna)
- fix ima_appraise flag assignment by or'ing it (Mimi)
v2:
- pr_info() message prefix correction
security/integrity/ima/Kconfig | 2 +-
security/integrity/ima/ima_appraise.c | 6 ++++++
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/security/integrity/ima/Kconfig b/security/integrity/ima/Kconfig
index edde88dbe576..62dc11a5af01 100644
--- a/security/integrity/ima/Kconfig
+++ b/security/integrity/ima/Kconfig
@@ -232,7 +232,7 @@ config IMA_APPRAISE_REQUIRE_POLICY_SIGS
config IMA_APPRAISE_BOOTPARAM
bool "ima_appraise boot parameter"
- depends on IMA_APPRAISE && !IMA_ARCH_POLICY
+ depends on IMA_APPRAISE
default y
help
This option enables the different "ima_appraise=" modes
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index a9649b04b9f1..28a59508c6bd 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -19,6 +19,12 @@
static int __init default_appraise_setup(char *str)
{
#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
+ if (arch_ima_get_secureboot()) {
+ pr_info("Secure boot enabled: ignoring ima_appraise=%s boot parameter option",
+ str);
+ return 1;
+ }
+
if (strncmp(str, "off", 3) == 0)
ima_appraise = 0;
else if (strncmp(str, "log", 3) == 0)
--
2.26.2
^ permalink raw reply related
* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-13 16:48 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <CALCETrWbD=3SUOuq9P7Syb+a1DoBjjem8hq9_HCvn7wyqETkpw@mail.gmail.com>
Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> On big systems, the mm refcount can become highly contented when doing
>> a lot of context switching with threaded applications (particularly
>> switching between the idle thread and an application thread).
>>
>> Abandoning lazy tlb slows switching down quite a bit in the important
>> user->idle->user cases, so so instead implement a non-refcounted scheme
>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>> any remaining lazy ones.
>>
>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>> with as many software threads as CPUs (so each switch will go in and
>> out of idle), upstream can achieve a rate of about 1 million context
>> switches per second. After this patch it goes up to 118 million.
>>
>
> I read the patch a couple of times, and I have a suggestion that could
> be nonsense. You are, effectively, using mm_cpumask() as a sort of
> refcount. You're saying "hey, this mm has no more references, but it
> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
> those references too." I'm wondering whether you actually need the
> IPI. What if, instead, you actually treated mm_cpumask as a refcount
> for real? Roughly, in __mmdrop(), you would only free the page tables
> if mm_cpumask() is empty. And, in the code that removes a CPU from
> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
> you just removed the last bit from mm_cpumask and potentially free the
> mm.
>
> Getting the locking right here could be a bit tricky -- you need to
> avoid two CPUs simultaneously exiting lazy TLB and thinking they
> should free the mm, and you also need to avoid an mm with mm_users
> hitting zero concurrently with the last remote CPU using it lazily
> exiting lazy TLB. Perhaps this could be resolved by having mm_count
> == 1 mean "mm_cpumask() is might contain bits and, if so, it owns the
> mm" and mm_count == 0 meaning "now it's dead" and using some careful
> cmpxchg or dec_return to make sure that only one CPU frees it.
>
> Or maybe you'd need a lock or RCU for this, but the idea would be to
> only ever take the lock after mm_users goes to zero.
I don't think it's nonsense, it could be a good way to avoid IPIs.
I haven't seen much problem here that made me too concerned about IPIs
yet, so I think the simple patch may be good enough to start with
for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
unlazying with the exit TLB flush without doing anything fancy with
ref counting, but we'll see.
Thanks,
Nick
^ permalink raw reply
* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-13 16:37 UTC (permalink / raw)
To: Andy Lutomirski, Mathieu Desnoyers
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
linux-mm, linuxppc-dev
In-Reply-To: <CALCETrUHsYp0oGAiy3N-yAauPyx2nKqp1AiETgSJWc77GwO-Sg@mail.gmail.com>
Excerpts from Andy Lutomirski's message of July 14, 2020 1:48 am:
> On Mon, Jul 13, 2020 at 7:13 AM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
>> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
>>
>> > Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>> >> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>> >>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>> >>> serialize. There are older kernels for which it does not promise to
>> >>> serialize. And I have plans to make it stop serializing in the
>> >>> nearish future.
>> >>
>> >> You mean x86's return from interrupt? Sounds fun... you'll konw where to
>> >> update the membarrier sync code, at least :)
>> >
>> > Oh, I should actually say Mathieu recently clarified a return from
>> > interrupt doesn't fundamentally need to serialize in order to support
>> > membarrier sync core.
>>
>> Clarification to your statement:
>>
>> Return from interrupt to kernel code does not need to be context serializing
>> as long as kernel serializes before returning to user-space.
>>
>> However, return from interrupt to user-space needs to be context serializing.
>>
>
> Indeed, and I figured this out on the first read through because I'm
> quite familiar with the x86 entry code. But Nick somehow missed this,
> and Nick is the one who wrote the patch.
>
> Nick, I think this helps prove my point. The code you're submitting
> may well be correct, but it's unmaintainable.
It's not. The patch I wrote for x86 is a no-op, it just moves existing
x86 hook and code that's already there to a different name.
Actually it's not quite a no-op, it changes it to use hooks that are
actually called in the right places. Because previously it was
unmaintainable from point of view of generic mm -- it was not clear at
all that the old one should have been called in other places where the
mm goes non-lazy. Now with the exit_lazy_tlb hook, it can quite easily
be spotted where it is missing.
And x86 keeps their membarrier code in x86, and uses nice well defined
lazy tlb mm hooks.
> At the very least, this
> needs a comment explaining, from the perspective of x86, *exactly*
> what exit_lazy_tlb() is promising, why it's promising it, how it
> achieves that promise, and what code cares about it. Or we could do
> something with TIF flags and make this all less magical, although that
> will probably end up very slightly slower.
It's all documented there in existing comments plus the asm-generic
exit_lazy_tlb specification added AFAIKS.
Is the membarrier comment in finish_task_switch plus these ones not
enough?
Thanks,
Nick
^ permalink raw reply
* [PATCH 1/1 V4] : PCIE PHB reset
From: wenxiong @ 2020-07-13 14:39 UTC (permalink / raw)
To: linuxppc-dev; +Cc: Wen Xiong, oohall, bobroff, brking, wenxiong
From: Wen Xiong <wenxiong@linux.vnet.ibm.com>
Several device drivers hit EEH(Extended Error handling) when triggering
kdump on Pseries PowerVM. This patch implemented a reset of the PHBs
in pci general code when triggering kdump. PHB reset stops all PCI
transactions from normal kernel. We have tested the patch in several
environments:
- direct slot adapters
- adapters under the switch
- a VF adapter in PowerVM
- a VF adapter/adapter in KVM guest.
Change History:
V4:
- Merge the code from pseries/pci.c to pseries/eeh_pseries.c
- Add 3 helper functions which are shared by eeh code and this path.
Reviewed by Michael Ellerman.
V3:
- Change the comments(Reviewed by Gustavo Romero)
V2:
- change to machine_postcore_initcall(Reviewed by Oliver Halloran)
- change the error paths(Reviewed by Sam Bobroff)
V1:
- initial version
Signed-off-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
---
arch/powerpc/platforms/pseries/eeh_pseries.c | 234 ++++++++++++++-----
1 file changed, 170 insertions(+), 64 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index ace117f99d94..a3ae8d206a86 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -24,6 +24,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/spinlock.h>
+#include <linux/crash_dump.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
@@ -80,6 +81,152 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
eeh_probe_device(pdev);
}
+
+/**
+ * pseries_eeh_get_config_addr - Retrieve config address
+ *
+ * Retrieve the associated config address. Actually, there're 2 RTAS
+ * function calls dedicated for the purpose. We need implement
+ * it through the new function and then the old one. Besides,
+ * you should make sure the config address is figured out from
+ * FDT node before calling the function.
+ *
+ * It's notable that zero'ed return value means invalid PE config
+ * address.
+ */
+static int pseries_eeh_get_config_addr(struct pci_controller *phb, int config_addr)
+{
+ int ret = 0;
+ int rets[3];
+
+ if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+ /*
+ * First of all, we need to make sure there has one PE
+ * associated with the device. Otherwise, PE address is
+ * meaningless.
+ */
+ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid), 1);
+ if (ret || (rets[0] == 0))
+ return 0;
+
+ /* Retrieve the associated PE config address */
+ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid), 0);
+ if (ret) {
+ pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+ __func__, phb->global_number, config_addr);
+ return 0;
+ }
+
+ return rets[0];
+ }
+
+ if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+ ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid), 0);
+ if (ret) {
+ pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
+ __func__, phb->global_number, config_addr);
+ return 0;
+ }
+
+ return rets[0];
+ }
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_phb_reset - Reset the specified PHB
+ * @phb: PCI controller
+ * @config_addr: the associated config address
+ * @option: reset option
+ *
+ * Reset the specified PHB/PE
+ */
+static int pseries_eeh_phb_reset(struct pci_controller *phb, int config_addr, int option)
+{
+ int ret;
+
+ /* Reset PE through RTAS call */
+ ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid), option);
+
+ /* If fundamental-reset not supported, try hot-reset */
+ if (option == EEH_RESET_FUNDAMENTAL &&
+ ret == -8) {
+ option = EEH_RESET_HOT;
+ ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid), option);
+ }
+
+ /* We need reset hold or settlement delay */
+ if (option == EEH_RESET_FUNDAMENTAL ||
+ option == EEH_RESET_HOT)
+ msleep(EEH_PE_RST_HOLD_TIME);
+ else
+ msleep(EEH_PE_RST_SETTLE_TIME);
+
+ return ret;
+}
+
+/**
+ * pseries_eeh_phb_configure_bridge - Configure PCI bridges in the indicated PE
+ * @phb: PCI controller
+ * @config_addr: the associated config address
+ *
+ * The function will be called to reconfigure the bridges included
+ * in the specified PE so that the malfunctioning PE would be recovered
+ * again.
+ */
+static int pseries_eeh_phb_configure_bridge(struct pci_controller *phb, int config_addr)
+{
+ int ret;
+ /* Waiting 0.2s maximum before skipping configuration */
+ int max_wait = 200;
+
+ while (max_wait > 0) {
+ ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+ config_addr, BUID_HI(phb->buid),
+ BUID_LO(phb->buid));
+
+ if (!ret)
+ return ret;
+ if (ret < 0)
+ break;
+
+ /*
+ * If RTAS returns a delay value that's above 100ms, cut it
+ * down to 100ms in case firmware made a mistake. For more
+ * on how these delay values work see rtas_busy_delay_time
+ */
+ if (ret > RTAS_EXTENDED_DELAY_MIN+2 &&
+ ret <= RTAS_EXTENDED_DELAY_MAX)
+ ret = RTAS_EXTENDED_DELAY_MIN+2;
+
+ max_wait -= rtas_busy_delay_time(ret);
+
+ if (max_wait < 0)
+ break;
+
+ rtas_busy_delay(ret);
+ }
+
+ pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n",
+ __func__, phb->global_number, config_addr, ret);
+ /* PAPR defines -3 as "Parameter Error" for this function: */
+ if (ret == -3)
+ return -EINVAL;
+ else
+ return -EIO;
+}
+
/*
* Buffer for reporting slot-error-detail rtas calls. Its here
* in BSS, and not dynamically alloced, so that it ends up in
@@ -96,6 +243,10 @@ static int eeh_error_buf_size;
*/
static int pseries_eeh_init(void)
{
+ struct pci_controller *phb;
+ struct pci_dn *pdn;
+ int addr, config_addr;
+
/* figure out EEH RTAS function call tokens */
ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
@@ -148,6 +299,22 @@ static int pseries_eeh_init(void)
/* Set EEH machine dependent code */
ppc_md.pcibios_bus_add_device = pseries_pcibios_bus_add_device;
+ if (is_kdump_kernel() || reset_devices) {
+ pr_info("Issue PHB reset ...\n");
+ list_for_each_entry(phb, &hose_list, list_node) {
+ pdn = list_first_entry(&PCI_DN(phb->dn)->child_list, struct pci_dn, list);
+ addr = (pdn->busno << 16) | (pdn->devfn << 8);
+ config_addr = pseries_eeh_get_config_addr(phb, addr);
+ /* invalid PE config addr */
+ if (config_addr == 0)
+ continue;
+
+ pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_FUNDAMENTAL);
+ pseries_eeh_phb_reset(phb, config_addr, EEH_RESET_DEACTIVATE);
+ pseries_eeh_phb_configure_bridge(phb, config_addr);
+ }
+ }
+
return 0;
}
@@ -569,35 +736,13 @@ static int pseries_eeh_get_state(struct eeh_pe *pe, int *delay)
static int pseries_eeh_reset(struct eeh_pe *pe, int option)
{
int config_addr;
- int ret;
/* Figure out PE address */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
-
- /* Reset PE through RTAS call */
- ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
- config_addr, BUID_HI(pe->phb->buid),
- BUID_LO(pe->phb->buid), option);
-
- /* If fundamental-reset not supported, try hot-reset */
- if (option == EEH_RESET_FUNDAMENTAL &&
- ret == -8) {
- option = EEH_RESET_HOT;
- ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
- config_addr, BUID_HI(pe->phb->buid),
- BUID_LO(pe->phb->buid), option);
- }
-
- /* We need reset hold or settlement delay */
- if (option == EEH_RESET_FUNDAMENTAL ||
- option == EEH_RESET_HOT)
- msleep(EEH_PE_RST_HOLD_TIME);
- else
- msleep(EEH_PE_RST_SETTLE_TIME);
-
- return ret;
+
+ return pseries_eeh_phb_reset(pe->phb, config_addr, option);
}
/**
@@ -641,56 +786,17 @@ static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, u
* pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE
* @pe: EEH PE
*
- * The function will be called to reconfigure the bridges included
- * in the specified PE so that the mulfunctional PE would be recovered
- * again.
*/
static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
{
int config_addr;
- int ret;
- /* Waiting 0.2s maximum before skipping configuration */
- int max_wait = 200;
/* Figure out the PE address */
config_addr = pe->config_addr;
if (pe->addr)
config_addr = pe->addr;
- while (max_wait > 0) {
- ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
- config_addr, BUID_HI(pe->phb->buid),
- BUID_LO(pe->phb->buid));
-
- if (!ret)
- return ret;
- if (ret < 0)
- break;
-
- /*
- * If RTAS returns a delay value that's above 100ms, cut it
- * down to 100ms in case firmware made a mistake. For more
- * on how these delay values work see rtas_busy_delay_time
- */
- if (ret > RTAS_EXTENDED_DELAY_MIN+2 &&
- ret <= RTAS_EXTENDED_DELAY_MAX)
- ret = RTAS_EXTENDED_DELAY_MIN+2;
-
- max_wait -= rtas_busy_delay_time(ret);
-
- if (max_wait < 0)
- break;
-
- rtas_busy_delay(ret);
- }
-
- pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n",
- __func__, pe->phb->global_number, pe->addr, ret);
- /* PAPR defines -3 as "Parameter Error" for this function: */
- if (ret == -3)
- return -EINVAL;
- else
- return -EIO;
+ return pseries_eeh_phb_configure_bridge(pe->phb, config_addr);
}
/**
--
2.18.1
^ permalink raw reply related
* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Andy Lutomirski @ 2020-07-13 15:59 UTC (permalink / raw)
To: Nicholas Piggin
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <20200710015646.2020871-8-npiggin@gmail.com>
On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> On big systems, the mm refcount can become highly contended when doing
> a lot of context switching with threaded applications (particularly
> switching between the idle thread and an application thread).
>
> Abandoning lazy tlb slows switching down quite a bit in the important
> user->idle->user cases, so instead implement a non-refcounted scheme
> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
> any remaining lazy ones.
>
> On a 16-socket 192-core POWER8 system, a context switching benchmark
> with as many software threads as CPUs (so each switch will go in and
> out of idle), upstream can achieve a rate of about 1 million context
> switches per second. After this patch it goes up to 118 million.
>
I read the patch a couple of times, and I have a suggestion that could
be nonsense. You are, effectively, using mm_cpumask() as a sort of
refcount. You're saying "hey, this mm has no more references, but it
still has nonempty mm_cpumask(), so let's send an IPI and shoot down
those references too." I'm wondering whether you actually need the
IPI. What if, instead, you actually treated mm_cpumask as a refcount
for real? Roughly, in __mmdrop(), you would only free the page tables
if mm_cpumask() is empty. And, in the code that removes a CPU from
mm_cpumask(), you would check if mm_users == 0 and, if so, check if
you just removed the last bit from mm_cpumask and potentially free the
mm.
Getting the locking right here could be a bit tricky -- you need to
avoid two CPUs simultaneously exiting lazy TLB and thinking they
should free the mm, and you also need to avoid an mm with mm_users
hitting zero concurrently with the last remote CPU using it lazily
exiting lazy TLB. Perhaps this could be resolved by having mm_count
== 1 mean "mm_cpumask() might contain bits and, if so, it owns the
mm" and mm_count == 0 meaning "now it's dead" and using some careful
cmpxchg or dec_return to make sure that only one CPU frees it.
Or maybe you'd need a lock or RCU for this, but the idea would be to
only ever take the lock after mm_users goes to zero.
--Andy
^ permalink raw reply
* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Andy Lutomirski @ 2020-07-13 15:48 UTC (permalink / raw)
To: Mathieu Desnoyers
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
Nicholas Piggin, linux-mm, Andy Lutomirski, linuxppc-dev
In-Reply-To: <284592761.9860.1594649601492.JavaMail.zimbra@efficios.com>
On Mon, Jul 13, 2020 at 7:13 AM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> ----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
>
> > Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
> >> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
> >>> Also, as it stands, I can easily see in_irq() ceasing to promise to
> >>> serialize. There are older kernels for which it does not promise to
> >>> serialize. And I have plans to make it stop serializing in the
> >>> nearish future.
> >>
> >> You mean x86's return from interrupt? Sounds fun... you'll know where to
> >> update the membarrier sync code, at least :)
> >
> > Oh, I should actually say Mathieu recently clarified a return from
> > interrupt doesn't fundamentally need to serialize in order to support
> > membarrier sync core.
>
> Clarification to your statement:
>
> Return from interrupt to kernel code does not need to be context serializing
> as long as kernel serializes before returning to user-space.
>
> However, return from interrupt to user-space needs to be context serializing.
>
Indeed, and I figured this out on the first read through because I'm
quite familiar with the x86 entry code. But Nick somehow missed this,
and Nick is the one who wrote the patch.
Nick, I think this helps prove my point. The code you're submitting
may well be correct, but it's unmaintainable. At the very least, this
needs a comment explaining, from the perspective of x86, *exactly*
what exit_lazy_tlb() is promising, why it's promising it, how it
achieves that promise, and what code cares about it. Or we could do
something with TIF flags and make this all less magical, although that
will probably end up very slightly slower.
--Andy
^ permalink raw reply
* Re: [PATCH 00/20] Documentation: eliminate duplicated words
From: Jonathan Corbet @ 2020-07-13 15:45 UTC (permalink / raw)
To: Randy Dunlap
Cc: kvm, linux-doc, David Airlie, kgdb-bugreport, linux-fpga,
Liviu Dudau, dri-devel, Douglas Anderson, Paul Cercueil, keyrings,
Paul Mackerras, linux-i2c, Pavel Machek, Srinivas Pandruvada,
Mihail Atanassov, linux-leds, linux-s390, Daniel Thompson,
linux-scsi, Masahiro Yamada, Matthew Wilcox, Halil Pasic,
Jarkko Sakkinen, James Wang, linux-input, Mali DP Maintainers,
Derek Kiernan, linux-mips, Dragan Cvetic, Wu Hao, Tony Krowiak,
linux-kbuild, James E.J. Bottomley, Jiri Kosina, Hannes Reinecke,
linux-block, Thomas Bogendoerfer, Jacek Anaszewski, linux-mm,
Dan Williams, Andrew Morton, Mimi Zohar, Jens Axboe, Michal Marek,
Martin K. Petersen, Pierre Morel, linux-kernel, Wolfram Sang,
Daniel Vetter, Jason Wessel, Paolo Bonzini, linux-integrity,
linuxppc-dev, Mike Rapoport, Dan Murphy
In-Reply-To: <20200707180414.10467-1-rdunlap@infradead.org>
On Tue, 7 Jul 2020 11:03:54 -0700
Randy Dunlap <rdunlap@infradead.org> wrote:
> Documentation/admin-guide/mm/numaperf.rst | 2 +-
> Documentation/block/pr.rst | 2 +-
> Documentation/core-api/printk-basics.rst | 2 +-
> Documentation/dev-tools/kgdb.rst | 2 +-
> Documentation/fpga/dfl.rst | 2 +-
> Documentation/gpu/drm-uapi.rst | 2 +-
> Documentation/gpu/komeda-kms.rst | 2 +-
> Documentation/hid/intel-ish-hid.rst | 2 +-
> Documentation/i2c/upgrading-clients.rst | 2 +-
> Documentation/kbuild/kconfig-language.rst | 2 +-
> Documentation/leds/ledtrig-transient.rst | 2 +-
> Documentation/maintainer/maintainer-entry-profile.rst | 2 +-
> Documentation/mips/ingenic-tcu.rst | 2 +-
> Documentation/misc-devices/xilinx_sdfec.rst | 2 +-
> Documentation/powerpc/vas-api.rst | 2 +-
> Documentation/s390/vfio-ap.rst | 2 +-
> Documentation/scsi/advansys.rst | 2 +-
> Documentation/security/keys/trusted-encrypted.rst | 2 +-
> Documentation/virt/kvm/api.rst | 2 +-
> Documentation/vm/memory-model.rst | 2 +-
> 20 files changed, 20 insertions(+), 20 deletions(-)
I've applied this set, minus #17 that was already picked up by Martin.
Thanks,
jon
^ permalink raw reply
* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Arnd Bergmann @ 2020-07-13 15:08 UTC (permalink / raw)
To: Saheed O. Bolarinwa
Cc: Rich Felker, Martin K. Petersen, Linux-sh list, linux-pci,
linux-nvme, Yicong Yang, sparclinux,
Realtek linux nic maintainers, Paul Mackerras, Linux I2C,
bcm-kernel-feedback-list, Bjorn Helgaas, rfi, Toan Le,
Greg Ungerer, Marek Vasut, Rob Herring, Stefano Stabellini,
Sagi Grimberg, Yoshinori Sato, linux-scsi, Greg Kroah-Hartman,
linux-atm-general, Russell King, Ley Foon Tan, Christoph Hellwig,
Geert Uytterhoeven, Rafał Miłecki, Chas Williams,
xen-devel, Matt Turner, open list:BROADCOM NVRAM DRIVER,
linux-kernel-mentees, Kevin Hilman, Guenter Roeck, linux-hwmon,
Jean Delvare, Andrew Donnellan, Ray Jui, James E.J. Bottomley,
Linux-Renesas, Yue Wang, Jens Axboe, Jakub Kicinski, linux-m68k,
Lorenzo Pieralisi, Ivan Kokshaysky, Michael Buesch, Shuah Khan,
bjorn, open list:ARM/Amlogic Meson SoC support, Boris Ostrovsky,
Guan Xuetao, Linux ARM, Richard Henderson, Juergen Gross,
Michal Simek, Thomas Bogendoerfer, Scott Branden, Bjorn Helgaas,
Jingoo Han, Networking, Yoshihiro Shimoda, linux-wireless,
linux-kernel@vger.kernel.org, Keith Busch, Brian King,
Philipp Zabel, alpha, Frederic Barrat, Gustavo Pimentel,
linuxppc-dev, David S. Miller, Heiner Kallweit
In-Reply-To: <20200713122247.10985-1-refactormyself@gmail.com>
On Mon, Jul 13, 2020 at 3:22 PM Saheed O. Bolarinwa
<refactormyself@gmail.com> wrote:
> The goal of this series is to move the definitions of *all* PCIBIOS* from
> include/linux/pci.h to arch/x86 and limit their use within there.
> All other tree specific definitions will be left intact. Maybe they can
> be renamed.
>
> PCIBIOS* is an x86 concept as defined by the PCI spec. The returned error
> codes of PCIBIOS* are positive values and this introduces some complexities
> which other archs need not incur.
I think the intention is good, but I find the series in its current
form very hard
to review, in particular the way you touch some functions three times with
trivial changes. Instead of
1) replace PCIBIOS_SUCCESSFUL with 0
2) drop pointless 0-comparison
3) reformat whitespace
I would suggest to combine the first two steps into one patch per
subsystem and drop the third step.
> PLAN:
>
> 1. [PATCH v0 1-36] Replace all PCIBIOS_SUCCESSFUL with 0
>
> 2a. Audit all functions returning PCIBIOS_* error values directly or
> indirectly and prevent possible bug coming in (2b)
>
> 2b. Make all functions returning PCIBIOS_* error values call
> pcibios_err_to_errno(). *This will change their behaviour, for good.*
>
> 3. Clone a pcibios_err_to_errno() into arch/x86/pci/pcbios.c as _v2.
> This handles the positive error codes directly and will not use any
> PCIBIOS* definitions. So calls to it have no outside dependence.
>
> 4. Make all x86 code that needs to convert to -E* values call the
> cloned version - pcibios_err_to_errno_v2()
>
> 5. Assign PCIBIOS_* errors values directly to generic -E* errors
>
> 6. Refactor pcibios_err_to_errno() and mark it deprecated
>
> 7. Replace all calls to pcibios_err_to_errno() with the proper -E* value
> or 0.
>
> 8. Remove all PCIBIOS* definitions in include/linux/pci.h and
> pcibios_err_to_errno() too.
>
> 9. Redefine all PCIBIOS* definitions with original values inside
> arch/x86/pci/pcbios.c
>
> 10. Redefine pcibios_err_to_errno() inside arch/x86/pci/pcbios.c
>
> 11. Replace pcibios_err_to_errno_v2() calls with pcibios_err_to_errno()
>
> 12. Remove pcibios_err_to_errno_v2()
>
> Suggested-by: Bjorn Helgaas <bjorn@helgaas.com>
> Suggested-by: Yicong Yang <yangyicong@hisilicon.com>
> Signed-off-by: "Saheed O. Bolarinwa" <refactormyself@gmail.com>
I would hope that there is a simpler procedure to get to good
code than 12 steps that rename the same things multiple times.
Maybe the work can be split up differently, with a similar end result
but fewer and easier reviewed patches. The way I'd look at the
problem, there are three main areas that can be dealt with one at
a time:
a) callers of the high-level config space accessors
pci_{write,read}_config_{byte,word,dword}, mostly in device
drivers.
b) low-level implementation of the config space accessors
through struct pci_ops
c) all other occurrences of these constants
Starting with a), my first question is whether any high-level drivers
even need to care about errors from these functions. I see 4913
callers that ignore the return code, and 576 that actually
check it, and almost none care about the specific error (as you
found as well). Unless we conclude that most PCI drivers are
wrong, could we just change the return type to 'void' and assume
they never fail for valid arguments on a valid pci_device* ?
For b), it might be nice to also change other aspects of the interface,
e.g. passing a pci_host_bridge pointer plus bus number instead of
a pci_bus pointer, or having the callback in the pci_host_bridge
structure.
> Bolarinwa Olayemi Saheed (35):
> Change PCIBIOS_SUCCESSFUL to 0
> Change PCIBIOS_SUCCESSFUL to 0
> Change PCIBIOS_SUCCESSFUL to 0
> Tidy Success/Failure checks
> Change PCIBIOS_SUCCESSFUL to 0
> Tidy Success/Failure checks
> Change PCIBIOS_SUCCESSFUL to 0
Some patches have identical subject lines including the subsystem
prefix, which you should avoid. Try to also fix the git request-pull
output to not drop that prefix here so the list makes more sense.
Arnd
^ permalink raw reply
* [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Saheed O. Bolarinwa @ 2020-07-13 12:22 UTC (permalink / raw)
To: helgaas
Cc: Rich Felker, Martin K. Petersen, linux-sh, linux-pci, linux-nvme,
Yicong Yang, Keith Busch, netdev, Paul Mackerras, linux-i2c,
bcm-kernel-feedback-list, sparclinux, rfi, Toan Le, Greg Ungerer,
Marek Vasut, Rob Herring, Stefano Stabellini, Sagi Grimberg,
Yoshinori Sato, linux-scsi, Greg Kroah-Hartman, linux-atm-general,
Russell King, Realtek linux nic maintainers, Christoph Hellwig,
Ley Foon Tan, Geert Uytterhoeven, Rafał Miłecki,
Chas Williams, xen-devel, Matt Turner, linux-mips,
linux-kernel-mentees, Kevin Hilman, Guenter Roeck, linux-hwmon,
Jean Delvare, Andrew Donnellan, Arnd Bergmann, Ray Jui,
James E.J. Bottomley, Yue Wang, Jens Axboe, Jakub Kicinski,
linux-m68k, Lorenzo Pieralisi, Ivan Kokshaysky, Michael Buesch,
skhan, bjorn, linux-amlogic, Boris Ostrovsky, Guan Xuetao,
linux-arm-kernel, Richard Henderson, Juergen Gross, Michal Simek,
Thomas Bogendoerfer, Scott Branden, Bjorn Helgaas, Jingoo Han,
Saheed O. Bolarinwa, Yoshihiro Shimoda, linux-wireless,
linux-kernel, linux-renesas-soc, Brian King, Philipp Zabel,
linux-alpha, Frederic Barrat, Gustavo Pimentel, linuxppc-dev,
David S. Miller, Heiner Kallweit
The goal of this series is to move the definitions of *all* PCIBIOS* from
include/linux/pci.h to arch/x86 and limit their use within there.
All other tree specific definitions will be left intact. Maybe they can
be renamed.
PCIBIOS* is an x86 concept as defined by the PCI spec. The returned error
codes of PCIBIOS* are positive values and this introduces some complexities
which other archs need not incur.
PLAN:
1. [PATCH v0 1-36] Replace all PCIBIOS_SUCCESSFUL with 0
2a. Audit all functions returning PCIBIOS_* error values directly or
indirectly and prevent possible bug coming in (2b)
2b. Make all functions returning PCIBIOS_* error values call
pcibios_err_to_errno(). *This will change their behaviour, for good.*
3. Clone a pcibios_err_to_errno() into arch/x86/pci/pcbios.c as _v2.
This handles the positive error codes directly and will not use any
PCIBIOS* definitions. So calls to it have no outside dependence.
4. Make all x86 code that needs to convert to -E* values call the
cloned version - pcibios_err_to_errno_v2()
5. Assign PCIBIOS_* errors values directly to generic -E* errors
6. Refactor pcibios_err_to_errno() and mark it deprecated
7. Replace all calls to pcibios_err_to_errno() with the proper -E* value
or 0.
8. Remove all PCIBIOS* definitions in include/linux/pci.h and
pcibios_err_to_errno() too.
9. Redefine all PCIBIOS* definitions with original values inside
arch/x86/pci/pcbios.c
10. Redefine pcibios_err_to_errno() inside arch/x86/pci/pcbios.c
11. Replace pcibios_err_to_errno_v2() calls with pcibios_err_to_errno()
12. Remove pcibios_err_to_errno_v2()
Suggested-by: Bjorn Helgaas <bjorn@helgaas.com>
Suggested-by: Yicong Yang <yangyicong@hisilicon.com>
Signed-off-by: "Saheed O. Bolarinwa" <refactormyself@gmail.com>
Bolarinwa Olayemi Saheed (35):
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Fix Style ERROR: assignment in if condition
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
Change PCIBIOS_SUCCESSFUL to 0
Tidy Success/Failure checks
arch/alpha/kernel/core_apecs.c | 4 +--
arch/alpha/kernel/core_cia.c | 4 +--
arch/alpha/kernel/core_irongate.c | 4 +--
arch/alpha/kernel/core_lca.c | 4 +--
arch/alpha/kernel/core_marvel.c | 4 +--
arch/alpha/kernel/core_mcpcia.c | 4 +--
arch/alpha/kernel/core_polaris.c | 4 +--
arch/alpha/kernel/core_t2.c | 4 +--
arch/alpha/kernel/core_titan.c | 4 +--
arch/alpha/kernel/core_tsunami.c | 4 +--
arch/alpha/kernel/core_wildfire.c | 4 +--
arch/alpha/kernel/sys_miata.c | 2 +-
arch/arm/common/it8152.c | 4 +--
arch/arm/mach-cns3xxx/pcie.c | 2 +-
arch/arm/mach-footbridge/dc21285.c | 4 +--
arch/arm/mach-iop32x/pci.c | 6 ++--
arch/arm/mach-ixp4xx/common-pci.c | 8 ++---
arch/arm/mach-orion5x/pci.c | 4 +--
arch/arm/plat-orion/pcie.c | 8 ++---
arch/m68k/coldfire/pci.c | 8 ++---
arch/microblaze/pci/indirect_pci.c | 4 +--
arch/mips/pci/fixup-ath79.c | 2 +-
arch/mips/pci/ops-bcm63xx.c | 14 ++++----
arch/mips/pci/ops-bonito64.c | 4 +--
arch/mips/pci/ops-gt64xxx_pci0.c | 4 +--
arch/mips/pci/ops-lantiq.c | 4 +--
arch/mips/pci/ops-loongson2.c | 4 +--
arch/mips/pci/ops-mace.c | 4 +--
arch/mips/pci/ops-msc.c | 4 +--
arch/mips/pci/ops-rc32434.c | 6 ++--
arch/mips/pci/ops-sni.c | 4 +--
arch/mips/pci/ops-tx3927.c | 2 +-
arch/mips/pci/ops-tx4927.c | 2 +-
arch/mips/pci/ops-vr41xx.c | 4 +--
arch/mips/pci/pci-alchemy.c | 6 ++--
arch/mips/pci/pci-ar2315.c | 5 ++-
arch/mips/pci/pci-ar71xx.c | 4 +--
arch/mips/pci/pci-ar724x.c | 6 ++--
arch/mips/pci/pci-bcm1480.c | 4 +--
arch/mips/pci/pci-bcm1480ht.c | 4 +--
arch/mips/pci/pci-mt7620.c | 4 +--
arch/mips/pci/pci-octeon.c | 12 +++----
arch/mips/pci/pci-rt2880.c | 4 +--
arch/mips/pci/pci-rt3883.c | 4 +--
arch/mips/pci/pci-sb1250.c | 4 +--
arch/mips/pci/pci-virtio-guest.c | 4 +--
arch/mips/pci/pci-xlp.c | 4 +--
arch/mips/pci/pci-xlr.c | 4 +--
arch/mips/pci/pci-xtalk-bridge.c | 14 ++++----
arch/mips/pci/pcie-octeon.c | 4 +--
arch/mips/txx9/generic/pci.c | 5 ++-
arch/powerpc/kernel/rtas_pci.c | 4 +--
arch/powerpc/platforms/4xx/pci.c | 4 +--
arch/powerpc/platforms/52xx/efika.c | 4 +--
arch/powerpc/platforms/52xx/mpc52xx_pci.c | 4 +--
arch/powerpc/platforms/82xx/pq2.c | 2 +-
arch/powerpc/platforms/85xx/mpc85xx_cds.c | 2 +-
arch/powerpc/platforms/85xx/mpc85xx_ds.c | 2 +-
arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 2 +-
arch/powerpc/platforms/chrp/pci.c | 8 ++---
arch/powerpc/platforms/embedded6xx/holly.c | 2 +-
.../platforms/embedded6xx/mpc7448_hpc2.c | 2 +-
arch/powerpc/platforms/fsl_uli1575.c | 2 +-
arch/powerpc/platforms/maple/pci.c | 18 +++++-----
arch/powerpc/platforms/pasemi/pci.c | 6 ++--
arch/powerpc/platforms/powermac/pci.c | 8 ++---
arch/powerpc/platforms/powernv/eeh-powernv.c | 4 +--
arch/powerpc/platforms/powernv/pci.c | 4 +--
arch/powerpc/platforms/pseries/eeh_pseries.c | 4 +--
arch/powerpc/sysdev/fsl_pci.c | 2 +-
arch/powerpc/sysdev/indirect_pci.c | 4 +--
arch/powerpc/sysdev/tsi108_pci.c | 4 +--
arch/sh/drivers/pci/common.c | 3 +-
arch/sh/drivers/pci/ops-dreamcast.c | 4 +--
arch/sh/drivers/pci/ops-sh4.c | 4 +--
arch/sh/drivers/pci/ops-sh7786.c | 8 ++---
arch/sh/drivers/pci/pci.c | 2 +-
arch/sparc/kernel/pci_common.c | 28 +++++++--------
arch/unicore32/kernel/pci.c | 4 +--
drivers/atm/iphase.c | 20 ++++++-----
drivers/atm/lanai.c | 8 ++---
drivers/bcma/driver_pci_host.c | 4 +--
drivers/hwmon/sis5595.c | 13 +++----
drivers/hwmon/via686a.c | 13 +++----
drivers/hwmon/vt8231.c | 13 +++----
drivers/i2c/busses/i2c-ali15x3.c | 5 ++-
drivers/i2c/busses/i2c-nforce2.c | 3 +-
drivers/i2c/busses/i2c-sis5595.c | 15 +++-----
drivers/misc/cxl/vphb.c | 4 +--
drivers/net/ethernet/realtek/r8169_main.c | 2 +-
drivers/nvme/host/pci.c | 2 +-
drivers/pci/access.c | 14 ++++----
drivers/pci/controller/dwc/pci-meson.c | 4 +--
.../pci/controller/dwc/pcie-designware-host.c | 2 +-
drivers/pci/controller/dwc/pcie-designware.c | 4 +--
drivers/pci/controller/dwc/pcie-hisi.c | 4 +--
drivers/pci/controller/dwc/pcie-tegra194.c | 4 +--
.../pci/controller/mobiveil/pcie-mobiveil.c | 4 +--
drivers/pci/controller/pci-aardvark.c | 4 +--
drivers/pci/controller/pci-ftpci100.c | 4 +--
drivers/pci/controller/pci-hyperv.c | 8 ++---
drivers/pci/controller/pci-mvebu.c | 4 +--
drivers/pci/controller/pci-thunder-ecam.c | 36 +++++++++----------
drivers/pci/controller/pci-thunder-pem.c | 4 +--
drivers/pci/controller/pci-xgene.c | 5 ++-
drivers/pci/controller/pcie-altera.c | 16 ++++-----
drivers/pci/controller/pcie-iproc.c | 10 +++---
drivers/pci/controller/pcie-mediatek.c | 4 +--
drivers/pci/controller/pcie-rcar-host.c | 8 ++---
drivers/pci/controller/pcie-rockchip-host.c | 10 +++---
drivers/pci/pci-bridge-emul.c | 14 ++++----
drivers/pci/pci.c | 8 ++---
drivers/pci/pcie/bw_notification.c | 4 +--
drivers/pci/probe.c | 4 +--
drivers/pci/quirks.c | 4 +--
drivers/pci/syscall.c | 8 ++---
drivers/pci/xen-pcifront.c | 2 +-
drivers/scsi/ipr.c | 16 ++++-----
drivers/scsi/pmcraid.c | 6 ++--
drivers/ssb/driver_gige.c | 4 +--
drivers/ssb/driver_pcicore.c | 4 +--
drivers/xen/xen-pciback/conf_space.c | 2 +-
122 files changed, 347 insertions(+), 369 deletions(-)
--
2.18.2
^ permalink raw reply
* [PATCH 2/2] powerpc/kvm/cma: Improve kernel log during boot
From: Aneesh Kumar K.V @ 2020-07-13 15:07 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V
In-Reply-To: <20200713150749.25245-1-aneesh.kumar@linux.ibm.com>
Current kernel gives:
[ 0.000000] cma: Reserved 26224 MiB at 0x0000007959000000
[ 0.000000] hugetlb_cma: reserve 65536 MiB, up to 16384 MiB per node
[ 0.000000] cma: Reserved 16384 MiB at 0x0000001800000000
With the fix
[ 0.000000] kvm_cma_reserve: reserving 26214 MiB for global area
[ 0.000000] cma: Reserved 26224 MiB at 0x0000007959000000
[ 0.000000] hugetlb_cma: reserve 65536 MiB, up to 16384 MiB per node
[ 0.000000] cma: Reserved 16384 MiB at 0x0000001800000000
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/kvm/book3s_hv_builtin.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 7cd3cf3d366b..073617ce83e0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -113,7 +113,7 @@ void __init kvm_cma_reserve(void)
selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
if (selected_size) {
- pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+ pr_info("%s: reserving %ld MiB for global area\n", __func__,
(unsigned long)selected_size / SZ_1M);
align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
cma_declare_contiguous(0, selected_size, 0, align_size,
--
2.26.2
^ permalink raw reply related
* [PATCH 1/2] powerpc/hugetlb/cma: Allocate gigantic hugetlb pages using CMA
From: Aneesh Kumar K.V @ 2020-07-13 15:07 UTC (permalink / raw)
To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V
commit: cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma")
added support for allocating gigantic hugepages using CMA. This patch
enables the same for powerpc
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/include/asm/hugetlb.h | 7 +++++++
arch/powerpc/kernel/setup-common.c | 3 +++
arch/powerpc/mm/hugetlbpage.c | 18 ++++++++++++++++++
3 files changed, 28 insertions(+)
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 551a9d4d3958..013165e62618 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -57,6 +57,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t pte, int dirty);
+void gigantic_hugetlb_cma_reserve(void) __init;
#include <asm-generic/hugetlb.h>
#else /* ! CONFIG_HUGETLB_PAGE */
@@ -71,6 +72,12 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned long addr,
{
return NULL;
}
+
+
+static inline void __init gigantic_hugetlb_cma_reserve(void)
+{
+}
+
#endif /* CONFIG_HUGETLB_PAGE */
#endif /* _ASM_POWERPC_HUGETLB_H */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 9d3faac53295..b198b0ff25bc 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -928,6 +928,9 @@ void __init setup_arch(char **cmdline_p)
/* Reserve large chunks of memory for use by CMA for KVM. */
kvm_cma_reserve();
+ /* Reserve large chunks of memory for use by CMA for hugetlb */
+ gigantic_hugetlb_cma_reserve();
+
klp_init_thread_info(&init_task);
init_mm.start_code = (unsigned long)_stext;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e9bfbccd975d..26292544630f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -684,3 +684,21 @@ void flush_dcache_icache_hugepage(struct page *page)
}
}
}
+
+void __init gigantic_hugetlb_cma_reserve(void)
+{
+ unsigned long order = 0;
+
+ if (radix_enabled())
+ order = PUD_SHIFT - PAGE_SHIFT;
+ else if (!firmware_has_feature(FW_FEATURE_LPAR) && mmu_psize_defs[MMU_PAGE_16G].shift)
+ /*
+ * For pseries we do use ibm,expected#pages for reserving 16G pages.
+ */
+ order = mmu_psize_to_shift(MMU_PAGE_16G) - PAGE_SHIFT;
+
+ if (order) {
+ VM_WARN_ON(order < MAX_ORDER);
+ hugetlb_cma_reserve(order);
+ }
+}
--
2.26.2
^ permalink raw reply related
* Re: [PATCH v5] ima: move APPRAISE_BOOTPARAM dependency on ARCH_POLICY to runtime
From: Bruno Meneguele @ 2020-07-13 15:03 UTC (permalink / raw)
To: Mimi Zohar
Cc: linux-s390, nayna, erichte, x86, linux-kernel, stable,
linux-integrity, linuxppc-dev
In-Reply-To: <20200710192516.GC10547@glitch>
[-- Attachment #1: Type: text/plain, Size: 5872 bytes --]
On Fri, Jul 10, 2020 at 04:25:16PM -0300, Bruno Meneguele wrote:
> On Fri, Jul 10, 2020 at 02:54:48PM -0400, Mimi Zohar wrote:
> > On Fri, 2020-07-10 at 15:34 -0300, Bruno Meneguele wrote:
> > > On Fri, Jul 10, 2020 at 03:03:38PM -0300, Bruno Meneguele wrote:
> > > > On Fri, Jul 10, 2020 at 01:23:24PM -0400, Mimi Zohar wrote:
> > > > > On Thu, 2020-07-09 at 13:46 -0300, Bruno Meneguele wrote:
> > > > > > APPRAISE_BOOTPARAM has been marked as dependent on !ARCH_POLICY in compile
> > > > > > time, enforcing the appraisal whenever the kernel had the arch policy option
> > > > > > enabled.
> > > > >
> > > > > > However it breaks systems where the option is set but the system didn't
> > > > > > boot in a "secure boot" platform. In this scenario, anytime an appraisal
> > > > > > policy (i.e. ima_policy=appraisal_tcb) is used it will be forced, without
> > > > > > giving the user the opportunity to label the filesystem, before enforcing
> > > > > > integrity.
> > > > > >
> > > > > > Considering the ARCH_POLICY is only effective when secure boot is actually
> > > > > > enabled this patch remove the compile time dependency and move it to a
> > > > > > runtime decision, based on the secure boot state of that platform.
> > > > >
> > > > > Perhaps we could simplify this patch description a bit?
> > > > >
> > > > > The IMA_APPRAISE_BOOTPARAM config allows enabling different
> > > > > "ima_appraise=" modes - log, fix, enforce - at run time, but not when
> > > > > IMA architecture specific policies are enabled. This prevents
> > > > > properly labeling the filesystem on systems where secure boot is
> > > > > supported, but not enabled on the platform. Only when secure boot is
> > > > > enabled, should these IMA appraise modes be disabled.
> > > > >
> > > > > This patch removes the compile time dependency and makes it a runtime
> > > > > decision, based on the secure boot state of that platform.
> > > > >
> > > >
> > > > Sounds good to me.
> > > >
> > > > > <snip>
> > > > >
> > > > > > diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
> > > > > > index a9649b04b9f1..884de471b38a 100644
> > > > > > --- a/security/integrity/ima/ima_appraise.c
> > > > > > +++ b/security/integrity/ima/ima_appraise.c
> > > > > > @@ -19,6 +19,11 @@
> > > > > > static int __init default_appraise_setup(c
> > > > >
> > > > > > har *str)
> > > > > > {
> > > > > > #ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
> > > > > > + if (arch_ima_get_secureboot()) {
> > > > > > + pr_info("appraise boot param ignored: secure boot enabled");
> > > > >
> > > > > Instead of a generic statement, is it possible to include the actual
> > > > > option being denied? Perhaps something like: "Secure boot enabled,
> > > > > ignoring %s boot command line option"
> > > > >
> > > > > Mimi
> > > > >
> > > >
> > > > Yes, sure.
> > > >
> > >
> > > Btw, would it make sense to first make sure we have a valid "str"
> > > option and not something random to print?
> > >
> > > diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
> > > index a9649b04b9f1..1f1175531d3e 100644
> > > --- a/security/integrity/ima/ima_appraise.c
> > > +++ b/security/integrity/ima/ima_appraise.c
> > > @@ -25,6 +25,16 @@ static int __init default_appraise_setup(char *str)
> > > ima_appraise = IMA_APPRAISE_LOG;
> > > else if (strncmp(str, "fix", 3) == 0)
> > > ima_appraise = IMA_APPRAISE_FIX;
> > > + else
> > > + pr_info("invalid \"%s\" appraise option");
> > > +
> > > + if (arch_ima_get_secureboot()) {
> > > + if (!is_ima_appraise_enabled()) {
> > > + pr_info("Secure boot enabled: ignoring ima_appraise=%s boot parameter option",
> > > + str);
> > > + ima_appraise = IMA_APPRAISE_ENFORCE;
> > > + }
> > > + }
> >
> > Providing feedback is probably a good idea. However, the
> > "arch_ima_get_secureboot" test can't come after setting
> > "ima_appraise."
> >
>
> Sorry, but I'm not sure if I got the reason to why it can't be done
> after: would it be basically to prevent any further processing about
> ima_appraise as a matter of security principle? Or maybe to keep the
> dependency between secureboot and bootparam truly strict?
>
> Or are there something else I'm missing?
>
I'm going to send a v6 with the pr_info() placed in the beginning
directly printing 'str', thus we can have the actual issue solved.
Then later I'll send further patches to handle the other cases of limiting
'str' printing and also giving the user a feedback about invalid
ima_appraise= options. So we can discuss further on that.
Thanks Mimi.
> > Mimi
> >
> > > #endif
> > > return 1;
> > > }
> > >
> > >
> > > The "else" there I think would make sense as well, at least to give the
> > > user some feedback about a possible mispelling of him (as a separate
> > > patch).
> > >
> > > And "if(!is_ima_appraise_enabled())" would avoid to print anything about
> > > "ignoring the option" to the user in case he explicitly set "enforce",
> > > which we know there isn't any real effect but is allowed and shown in
> > > kernel-parameters.txt.
> > >
> > > > Thanks!
> > > >
> > > > > > + return 1;
> > > > > > + }
> > > > > > +
> > > > > > if (strncmp(str, "off", 3) == 0)
> > > > > > ima_appraise = 0;
> > > > > > else if (strncmp(str, "log", 3) == 0)
> > > > >
> > > >
> > > > --
> > > > bmeneg
> > > > PGP Key: http://bmeneg.com/pubkey.txt
> > >
> > >
> > >
> >
>
> --
> bmeneg
> PGP Key: http://bmeneg.com/pubkey.txt
--
bmeneg
PGP Key: http://bmeneg.com/pubkey.txt
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* [PATCH v2] powerpc/perf: Add kernel support for new MSR[HV PR] bits in trace-imc
From: Madhavan Srinivasan @ 2020-07-13 14:46 UTC (permalink / raw)
To: mpe; +Cc: Anju T Sudhakar, linuxppc-dev, Madhavan Srinivasan
From: Anju T Sudhakar <anju@linux.vnet.ibm.com>
IMC trace-mode record has MSR[HV PR] bits added in the third DW.
These bits can be used to set the cpumode for the instruction pointer
captured in each sample.
Add support in kernel to use these bits to set the cpumode for
each sample.
Signed-off-by: Anju T Sudhakar <anju@linux.vnet.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
---
Changelog v1:
- Changed check to use CPU_FTR_ARCH_31
arch/powerpc/include/asm/imc-pmu.h | 5 +++++
arch/powerpc/perf/imc-pmu.c | 29 ++++++++++++++++++++++++-----
2 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h
index 4da4fcba0684..4f897993b710 100644
--- a/arch/powerpc/include/asm/imc-pmu.h
+++ b/arch/powerpc/include/asm/imc-pmu.h
@@ -99,6 +99,11 @@ struct trace_imc_data {
*/
#define IMC_TRACE_RECORD_TB1_MASK 0x3ffffffffffULL
+/*
+ * Bit 0:1 in third DW of IMC trace record
+ * specifies the MSR[HV PR] values.
+ */
+#define IMC_TRACE_RECORD_VAL_HVPR(x) ((x) >> 62)
/*
* Device tree parser code detects IMC pmu support and
diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 0edcfd0b491d..a45d694a5d5d 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -1288,11 +1288,30 @@ static int trace_imc_prepare_sample(struct trace_imc_data *mem,
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
- if (is_kernel_addr(data->ip))
- header->misc |= PERF_RECORD_MISC_KERNEL;
- else
- header->misc |= PERF_RECORD_MISC_USER;
-
+ if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+ switch (IMC_TRACE_RECORD_VAL_HVPR(mem->val)) {
+ case 0:/* when MSR HV and PR not set in the trace-record */
+ header->misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ break;
+ case 1: /* MSR HV is 0 and PR is 1 */
+ header->misc |= PERF_RECORD_MISC_GUEST_USER;
+ break;
+ case 2: /* MSR HV is 1 and PR is 0 */
+ header->misc |= PERF_RECORD_MISC_HYPERVISOR;
+ break;
+ case 3: /* MSR HV is 1 and PR is 1 */
+ header->misc |= PERF_RECORD_MISC_USER;
+ break;
+ default:
+ pr_info("IMC: Unable to set the flag based on MSR bits\n");
+ break;
+ }
+ } else {
+ if (is_kernel_addr(data->ip))
+ header->misc |= PERF_RECORD_MISC_KERNEL;
+ else
+ header->misc |= PERF_RECORD_MISC_USER;
+ }
perf_event_header__init_id(header, data, event);
return 0;
--
2.26.2
^ permalink raw reply related
* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Mathieu Desnoyers @ 2020-07-13 14:13 UTC (permalink / raw)
To: Nicholas Piggin
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, x86, linux-kernel,
linux-mm, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594647408.wmrazhwjzb.astroid@bobo.none>
----- On Jul 13, 2020, at 9:47 AM, Nicholas Piggin npiggin@gmail.com wrote:
> Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
>> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>>> serialize. There are older kernels for which it does not promise to
>>> serialize. And I have plans to make it stop serializing in the
>>> nearish future.
>>
>> You mean x86's return from interrupt? Sounds fun... you'll know where to
>> update the membarrier sync code, at least :)
>
> Oh, I should actually say Mathieu recently clarified a return from
> interrupt doesn't fundamentally need to serialize in order to support
> membarrier sync core.
Clarification to your statement:
Return from interrupt to kernel code does not need to be context serializing
as long as kernel serializes before returning to user-space.
However, return from interrupt to user-space needs to be context serializing.
Thanks,
Mathieu
>
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-July/214171.html
>
> So you may not need to do anything more if you relaxed it.
>
> Thanks,
> Nick
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
^ permalink raw reply
* Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode
From: Nicholas Piggin @ 2020-07-13 13:47 UTC (permalink / raw)
To: Andy Lutomirski
Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
Mathieu Desnoyers, linuxppc-dev
In-Reply-To: <1594613902.1wzayj0p15.astroid@bobo.none>
Excerpts from Nicholas Piggin's message of July 13, 2020 2:45 pm:
> Excerpts from Andy Lutomirski's message of July 11, 2020 3:04 am:
>> Also, as it stands, I can easily see in_irq() ceasing to promise to
>> serialize. There are older kernels for which it does not promise to
>> serialize. And I have plans to make it stop serializing in the
>> nearish future.
>
> You mean x86's return from interrupt? Sounds fun... you'll know where to
> update the membarrier sync code, at least :)
Oh, I should actually say Mathieu recently clarified a return from
interrupt doesn't fundamentally need to serialize in order to support
membarrier sync core.
https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-July/214171.html
So you may not need to do anything more if you relaxed it.
Thanks,
Nick
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox