* [PATCH v6 05/11] powerpc/drmem: make lmb walk a bit more flexible
From: Hari Bathini @ 2020-07-29 11:40 UTC (permalink / raw)
To: Michael Ellerman
Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>
Currently, numa & prom are the users of drmem lmb walk code. Loading
kdump with kexec_file also needs to walk the drmem LMBs to set up the
usable memory ranges for kdump kernel. But there are a couple of issues
in using the code as is. One, walk_drmem_lmb() code is built into the
.init section currently, while kexec_file needs it later. Two, there
is no scope to pass data to the callback function for processing and/
or erroring out on certain conditions.
Fix that by moving drmem LMB walk code out of .init section, adding
scope to pass data to the callback function and bailing out when
an error is encountered in the callback function.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
v5 -> v6:
* Unchanged.
v4 -> v5:
* Unchanged.
v3 -> v4:
* Unchanged. Added Reviewed-by tag from Thiago.
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* No changes.
arch/powerpc/include/asm/drmem.h | 9 ++--
arch/powerpc/kernel/prom.c | 13 +++---
arch/powerpc/mm/drmem.c | 87 +++++++++++++++++++++++++-------------
arch/powerpc/mm/numa.c | 13 +++---
4 files changed, 78 insertions(+), 44 deletions(-)
diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 414d209f45bb..17ccc6474ab6 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -90,13 +90,14 @@ static inline bool drmem_lmb_reserved(struct drmem_lmb *lmb)
}
u64 drmem_lmb_memory_max(void);
-void __init walk_drmem_lmbs(struct device_node *dn,
- void (*func)(struct drmem_lmb *, const __be32 **));
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *));
int drmem_update_dt(void);
#ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
- void (*func)(struct drmem_lmb *, const __be32 **));
+int __init
+walk_drmem_lmbs_early(unsigned long node, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *));
#endif
static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 9cc49f265c86..7df78de378b0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -468,8 +468,9 @@ static bool validate_mem_limit(u64 base, u64 *size)
* This contains a list of memory blocks along with NUMA affinity
* information.
*/
-static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
- const __be32 **usm)
+static int __init early_init_drmem_lmb(struct drmem_lmb *lmb,
+ const __be32 **usm,
+ void *data)
{
u64 base, size;
int is_kexec_kdump = 0, rngs;
@@ -484,7 +485,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
*/
if ((lmb->flags & DRCONF_MEM_RESERVED) ||
!(lmb->flags & DRCONF_MEM_ASSIGNED))
- return;
+ return 0;
if (*usm)
is_kexec_kdump = 1;
@@ -499,7 +500,7 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
*/
rngs = dt_mem_next_cell(dt_root_size_cells, usm);
if (!rngs) /* there are no (base, size) duple */
- return;
+ return 0;
}
do {
@@ -524,6 +525,8 @@ static void __init early_init_drmem_lmb(struct drmem_lmb *lmb,
if (lmb->flags & DRCONF_MEM_HOTREMOVABLE)
memblock_mark_hotplug(base, size);
} while (--rngs);
+
+ return 0;
}
#endif /* CONFIG_PPC_PSERIES */
@@ -534,7 +537,7 @@ static int __init early_init_dt_scan_memory_ppc(unsigned long node,
#ifdef CONFIG_PPC_PSERIES
if (depth == 1 &&
strcmp(uname, "ibm,dynamic-reconfiguration-memory") == 0) {
- walk_drmem_lmbs_early(node, early_init_drmem_lmb);
+ walk_drmem_lmbs_early(node, NULL, early_init_drmem_lmb);
return 0;
}
#endif
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 59327cefbc6a..b2eeea39684c 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -14,6 +14,8 @@
#include <asm/prom.h>
#include <asm/drmem.h>
+static int n_root_addr_cells, n_root_size_cells;
+
static struct drmem_lmb_info __drmem_info;
struct drmem_lmb_info *drmem_info = &__drmem_info;
@@ -189,12 +191,13 @@ int drmem_update_dt(void)
return rc;
}
-static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
+static void read_drconf_v1_cell(struct drmem_lmb *lmb,
const __be32 **prop)
{
const __be32 *p = *prop;
- lmb->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+ lmb->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
lmb->drc_index = of_read_number(p++, 1);
p++; /* skip reserved field */
@@ -205,29 +208,33 @@ static void __init read_drconf_v1_cell(struct drmem_lmb *lmb,
*prop = p;
}
-static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm,
- void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
struct drmem_lmb lmb;
u32 i, n_lmbs;
+ int ret = 0;
n_lmbs = of_read_number(prop++, 1);
- if (n_lmbs == 0)
- return;
-
for (i = 0; i < n_lmbs; i++) {
read_drconf_v1_cell(&lmb, &prop);
- func(&lmb, &usm);
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
}
+
+ return ret;
}
-static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
+static void read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
const __be32 **prop)
{
const __be32 *p = *prop;
dr_cell->seq_lmbs = of_read_number(p++, 1);
- dr_cell->base_addr = dt_mem_next_cell(dt_root_addr_cells, &p);
+ dr_cell->base_addr = of_read_number(p, n_root_addr_cells);
+ p += n_root_addr_cells;
dr_cell->drc_index = of_read_number(p++, 1);
dr_cell->aa_index = of_read_number(p++, 1);
dr_cell->flags = of_read_number(p++, 1);
@@ -235,17 +242,16 @@ static void __init read_drconf_v2_cell(struct of_drconf_cell_v2 *dr_cell,
*prop = p;
}
-static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
- void (*func)(struct drmem_lmb *, const __be32 **))
+static int
+__walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
struct of_drconf_cell_v2 dr_cell;
struct drmem_lmb lmb;
u32 i, j, lmb_sets;
+ int ret = 0;
lmb_sets = of_read_number(prop++, 1);
- if (lmb_sets == 0)
- return;
-
for (i = 0; i < lmb_sets; i++) {
read_drconf_v2_cell(&dr_cell, &prop);
@@ -259,21 +265,29 @@ static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm,
lmb.aa_index = dr_cell.aa_index;
lmb.flags = dr_cell.flags;
- func(&lmb, &usm);
+ ret = func(&lmb, &usm, data);
+ if (ret)
+ break;
}
}
+
+ return ret;
}
#ifdef CONFIG_PPC_PSERIES
-void __init walk_drmem_lmbs_early(unsigned long node,
- void (*func)(struct drmem_lmb *, const __be32 **))
+int __init walk_drmem_lmbs_early(unsigned long node, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
const __be32 *prop, *usm;
- int len;
+ int len, ret = -ENODEV;
prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
if (!prop || len < dt_root_size_cells * sizeof(__be32))
- return;
+ return ret;
+
+ /* Get the address & size cells */
+ n_root_addr_cells = dt_root_addr_cells;
+ n_root_size_cells = dt_root_size_cells;
drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
@@ -281,20 +295,21 @@ void __init walk_drmem_lmbs_early(unsigned long node,
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory", &len);
if (prop) {
- __walk_drmem_v1_lmbs(prop, usm, func);
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
} else {
prop = of_get_flat_dt_prop(node, "ibm,dynamic-memory-v2",
&len);
if (prop)
- __walk_drmem_v2_lmbs(prop, usm, func);
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
}
memblock_dump_all();
+ return ret;
}
#endif
-static int __init init_drmem_lmb_size(struct device_node *dn)
+static int init_drmem_lmb_size(struct device_node *dn)
{
const __be32 *prop;
int len;
@@ -303,12 +318,12 @@ static int __init init_drmem_lmb_size(struct device_node *dn)
return 0;
prop = of_get_property(dn, "ibm,lmb-size", &len);
- if (!prop || len < dt_root_size_cells * sizeof(__be32)) {
+ if (!prop || len < n_root_size_cells * sizeof(__be32)) {
pr_info("Could not determine LMB size\n");
return -1;
}
- drmem_info->lmb_size = dt_mem_next_cell(dt_root_size_cells, &prop);
+ drmem_info->lmb_size = of_read_number(prop, n_root_size_cells);
return 0;
}
@@ -329,24 +344,36 @@ static const __be32 *of_get_usable_memory(struct device_node *dn)
return prop;
}
-void __init walk_drmem_lmbs(struct device_node *dn,
- void (*func)(struct drmem_lmb *, const __be32 **))
+int walk_drmem_lmbs(struct device_node *dn, void *data,
+ int (*func)(struct drmem_lmb *, const __be32 **, void *))
{
const __be32 *prop, *usm;
+ int ret = -ENODEV;
+
+ if (!of_root)
+ return ret;
+
+ /* Get the address & size cells */
+ of_node_get(of_root);
+ n_root_addr_cells = of_n_addr_cells(of_root);
+ n_root_size_cells = of_n_size_cells(of_root);
+ of_node_put(of_root);
if (init_drmem_lmb_size(dn))
- return;
+ return ret;
usm = of_get_usable_memory(dn);
prop = of_get_property(dn, "ibm,dynamic-memory", NULL);
if (prop) {
- __walk_drmem_v1_lmbs(prop, usm, func);
+ ret = __walk_drmem_v1_lmbs(prop, usm, data, func);
} else {
prop = of_get_property(dn, "ibm,dynamic-memory-v2", NULL);
if (prop)
- __walk_drmem_v2_lmbs(prop, usm, func);
+ ret = __walk_drmem_v2_lmbs(prop, usm, data, func);
}
+
+ return ret;
}
static void __init init_drmem_v1_lmbs(const __be32 *prop)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9fcf2d195830..88eb6894418d 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -644,8 +644,9 @@ static inline int __init read_usm_ranges(const __be32 **usm)
* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
* node. This assumes n_mem_{addr,size}_cells have been set.
*/
-static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
- const __be32 **usm)
+static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+ const __be32 **usm,
+ void *data)
{
unsigned int ranges, is_kexec_kdump = 0;
unsigned long base, size, sz;
@@ -657,7 +658,7 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
*/
if ((lmb->flags & DRCONF_MEM_RESERVED)
|| !(lmb->flags & DRCONF_MEM_ASSIGNED))
- return;
+ return 0;
if (*usm)
is_kexec_kdump = 1;
@@ -669,7 +670,7 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
if (is_kexec_kdump) {
ranges = read_usm_ranges(usm);
if (!ranges) /* there are no (base, size) duple */
- return;
+ return 0;
}
do {
@@ -686,6 +687,8 @@ static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
if (sz)
memblock_set_node(base, sz, &memblock.memory, nid);
} while (--ranges);
+
+ return 0;
}
static int __init parse_numa_properties(void)
@@ -787,7 +790,7 @@ static int __init parse_numa_properties(void)
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
- walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+ walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
of_node_put(memory);
}
^ permalink raw reply related
* [PATCH v6 04/11] ppc64/kexec_file: avoid stomping memory used by special regions
From: Hari Bathini @ 2020-07-29 11:40 UTC (permalink / raw)
To: Michael Ellerman
Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
Vivek Goyal, Andrew Morton, Dave Young, Thiago Jung Bauermann,
Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>
crashkernel region could have an overlap with special memory regions
like opal, rtas, tce-table & such. These regions are referred to as
exclude memory ranges. Set up these ranges during image probe in order
to avoid them while finding the buffer for different kdump segments.
Override arch_kexec_locate_mem_hole() to locate a memory hole taking
these ranges into account.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
v5 -> v6:
* Implemented all the add_foo_mem_ranges() functions that get used while
setting up exclude memory ranges.
v4 -> v5:
* Unchanged. Added Reviewed-by tag from Thiago.
v3 -> v4:
* Dropped KDUMP_BUF_MIN & KDUMP_BUF_MAX macros and fixed off-by-one error
in arch_locate_mem_hole() helper routines.
v2 -> v3:
* If there are no exclude ranges, the right thing to do is falling
back to default kexec_locate_mem_hole() implementation instead of
returning 0. Fixed that.
v1 -> v2:
* Did arch_kexec_locate_mem_hole() override to handle special regions.
* Ensured holes in the memory are accounted for while locating mem hole.
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
the new prototype for these functions.
arch/powerpc/include/asm/kexec.h | 7 -
arch/powerpc/include/asm/kexec_ranges.h | 14 +
arch/powerpc/kexec/elf_64.c | 8 +
arch/powerpc/kexec/file_load_64.c | 337 +++++++++++++++++++++++++++++++
arch/powerpc/kexec/ranges.c | 177 ++++++++++++++++
5 files changed, 539 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index ac8fd4839171..835dc92e091c 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -100,14 +100,16 @@ void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_co
#ifdef CONFIG_KEXEC_FILE
extern const struct kexec_file_ops kexec_elf64_ops;
-#ifdef CONFIG_IMA_KEXEC
#define ARCH_HAS_KIMAGE_ARCH
struct kimage_arch {
+ struct crash_mem *exclude_ranges;
+
+#ifdef CONFIG_IMA_KEXEC
phys_addr_t ima_buffer_addr;
size_t ima_buffer_size;
-};
#endif
+};
int setup_purgatory(struct kimage *image, const void *slave_code,
const void *fdt, unsigned long kernel_load_addr,
@@ -125,6 +127,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr,
unsigned long initrd_len, const char *cmdline);
#endif /* CONFIG_PPC64 */
+
#endif /* CONFIG_KEXEC_FILE */
#else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h
index 35ae31a7a4de..7a90000f8d15 100644
--- a/arch/powerpc/include/asm/kexec_ranges.h
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -7,5 +7,19 @@
void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+int add_tce_mem_ranges(struct crash_mem **mem_ranges);
+int add_initrd_mem_range(struct crash_mem **mem_ranges);
+#ifdef CONFIG_PPC_BOOK3S_64
+int add_htab_mem_range(struct crash_mem **mem_ranges);
+#else
+static inline int add_htab_mem_range(struct crash_mem **mem_ranges)
+{
+ return 0;
+}
+#endif
+int add_kernel_mem_range(struct crash_mem **mem_ranges);
+int add_rtas_mem_range(struct crash_mem **mem_ranges);
+int add_opal_mem_range(struct crash_mem **mem_ranges);
+int add_reserved_mem_ranges(struct crash_mem **mem_ranges);
#endif /* _ASM_POWERPC_KEXEC_RANGES_H */
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 23ad04ccaf8e..64c15a5a280b 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -46,6 +46,14 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
if (ret)
goto out;
+ if (image->type == KEXEC_TYPE_CRASH) {
+ /* min & max buffer values for kdump case */
+ kbuf.buf_min = pbuf.buf_min = crashk_res.start;
+ kbuf.buf_max = pbuf.buf_max =
+ ((crashk_res.end < ppc64_rma_size) ?
+ crashk_res.end : (ppc64_rma_size - 1));
+ }
+
ret = kexec_elf_load(image, &ehdr, &elf_info, &kbuf, &kernel_load_addr);
if (ret)
goto out;
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 3e9ac5f216b0..d09c7724efa8 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -17,12 +17,262 @@
#include <linux/kexec.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
+#include <linux/memblock.h>
+#include <asm/kexec_ranges.h>
const struct kexec_file_ops * const kexec_file_loaders[] = {
&kexec_elf64_ops,
NULL
};
+/**
+ * get_exclude_memory_ranges - Get exclude memory ranges. This list includes
+ * regions like opal/rtas, tce-table, initrd,
+ * kernel, htab which should be avoided while
+ * setting up kexec load segments.
+ * @mem_ranges: Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
+{
+ int ret;
+
+ ret = add_tce_mem_ranges(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_initrd_mem_range(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_htab_mem_range(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_kernel_mem_range(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_rtas_mem_range(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_opal_mem_range(mem_ranges);
+ if (ret)
+ goto out;
+
+ ret = add_reserved_mem_ranges(mem_ranges);
+ if (ret)
+ goto out;
+
+ /* exclude memory ranges should be sorted for easy lookup */
+ sort_memory_ranges(*mem_ranges, true);
+out:
+ if (ret)
+ pr_err("Failed to setup exclude memory ranges\n");
+ return ret;
+}
+
+/**
+ * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
+ * in the memory regions between buf_min & buf_max
+ * for the buffer. If found, sets kbuf->mem.
+ * @kbuf: Buffer contents and memory parameters.
+ * @buf_min: Minimum address for the buffer.
+ * @buf_max: Maximum address for the buffer.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int __locate_mem_hole_top_down(struct kexec_buf *kbuf,
+ u64 buf_min, u64 buf_max)
+{
+ int ret = -EADDRNOTAVAIL;
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_NONE, &start, &end, NULL) {
+ /*
+ * memblock uses [start, end) convention while it is
+ * [start, end] here. Fix the off-by-one to have the
+ * same convention.
+ */
+ end -= 1;
+
+ if (start > buf_max)
+ continue;
+
+ /* Memory hole not found */
+ if (end < buf_min)
+ break;
+
+ /* Adjust memory region based on the given range */
+ if (start < buf_min)
+ start = buf_min;
+ if (end > buf_max)
+ end = buf_max;
+
+ start = ALIGN(start, kbuf->buf_align);
+ if (start < end && (end - start + 1) >= kbuf->memsz) {
+ /* Suitable memory range found. Set kbuf->mem */
+ kbuf->mem = ALIGN_DOWN(end - kbuf->memsz + 1,
+ kbuf->buf_align);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * locate_mem_hole_top_down_ppc64 - Skip special memory regions to find a
+ * suitable buffer with top down approach.
+ * @kbuf: Buffer contents and memory parameters.
+ * @buf_min: Minimum address for the buffer.
+ * @buf_max: Maximum address for the buffer.
+ * @emem: Exclude memory ranges.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int locate_mem_hole_top_down_ppc64(struct kexec_buf *kbuf,
+ u64 buf_min, u64 buf_max,
+ const struct crash_mem *emem)
+{
+ int i, ret = 0, err = -EADDRNOTAVAIL;
+ u64 start, end, tmin, tmax;
+
+ tmax = buf_max;
+ for (i = (emem->nr_ranges - 1); i >= 0; i--) {
+ start = emem->ranges[i].start;
+ end = emem->ranges[i].end;
+
+ if (start > tmax)
+ continue;
+
+ if (end < tmax) {
+ tmin = (end < buf_min ? buf_min : end + 1);
+ ret = __locate_mem_hole_top_down(kbuf, tmin, tmax);
+ if (!ret)
+ return 0;
+ }
+
+ tmax = start - 1;
+
+ if (tmax < buf_min) {
+ ret = err;
+ break;
+ }
+ ret = 0;
+ }
+
+ if (!ret) {
+ tmin = buf_min;
+ ret = __locate_mem_hole_top_down(kbuf, tmin, tmax);
+ }
+ return ret;
+}
+
+/**
+ * __locate_mem_hole_bottom_up - Looks bottom up for a large enough memory hole
+ * in the memory regions between buf_min & buf_max
+ * for the buffer. If found, sets kbuf->mem.
+ * @kbuf: Buffer contents and memory parameters.
+ * @buf_min: Minimum address for the buffer.
+ * @buf_max: Maximum address for the buffer.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int __locate_mem_hole_bottom_up(struct kexec_buf *kbuf,
+ u64 buf_min, u64 buf_max)
+{
+ int ret = -EADDRNOTAVAIL;
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+ MEMBLOCK_NONE, &start, &end, NULL) {
+ /*
+ * memblock uses [start, end) convention while it is
+ * [start, end] here. Fix the off-by-one to have the
+ * same convention.
+ */
+ end -= 1;
+
+ if (end < buf_min)
+ continue;
+
+ /* Memory hole not found */
+ if (start > buf_max)
+ break;
+
+ /* Adjust memory region based on the given range */
+ if (start < buf_min)
+ start = buf_min;
+ if (end > buf_max)
+ end = buf_max;
+
+ start = ALIGN(start, kbuf->buf_align);
+ if (start < end && (end - start + 1) >= kbuf->memsz) {
+ /* Suitable memory range found. Set kbuf->mem */
+ kbuf->mem = start;
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * locate_mem_hole_bottom_up_ppc64 - Skip special memory regions to find a
+ * suitable buffer with bottom up approach.
+ * @kbuf: Buffer contents and memory parameters.
+ * @buf_min: Minimum address for the buffer.
+ * @buf_max: Maximum address for the buffer.
+ * @emem: Exclude memory ranges.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf,
+ u64 buf_min, u64 buf_max,
+ const struct crash_mem *emem)
+{
+ int i, ret = 0, err = -EADDRNOTAVAIL;
+ u64 start, end, tmin, tmax;
+
+ tmin = buf_min;
+ for (i = 0; i < emem->nr_ranges; i++) {
+ start = emem->ranges[i].start;
+ end = emem->ranges[i].end;
+
+ if (end < tmin)
+ continue;
+
+ if (start > tmin) {
+ tmax = (start > buf_max ? buf_max : start - 1);
+ ret = __locate_mem_hole_bottom_up(kbuf, tmin, tmax);
+ if (!ret)
+ return 0;
+ }
+
+ tmin = end + 1;
+
+ if (tmin > buf_max) {
+ ret = err;
+ break;
+ }
+ ret = 0;
+ }
+
+ if (!ret) {
+ tmax = buf_max;
+ ret = __locate_mem_hole_bottom_up(kbuf, tmin, tmax);
+ }
+ return ret;
+}
+
/**
* setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
* variables and call setup_purgatory() to initialize
@@ -67,6 +317,67 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
return setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
}
+/**
+ * arch_kexec_locate_mem_hole - Skip special memory regions like rtas, opal,
+ * tce-table, reserved-ranges & such (exclude
+ * memory ranges) as they can't be used for kexec
+ * segment buffer. Sets kbuf->mem when a suitable
+ * memory hole is found.
+ * @kbuf: Buffer contents and memory parameters.
+ *
+ * Assumes minimum of PAGE_SIZE alignment for kbuf->memsz & kbuf->buf_align.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ struct crash_mem **emem;
+ u64 buf_min, buf_max;
+ int ret;
+
+ /*
+ * Use the generic kexec_locate_mem_hole for regular
+ * kexec_file_load syscall
+ */
+ if (kbuf->image->type != KEXEC_TYPE_CRASH)
+ return kexec_locate_mem_hole(kbuf);
+
+ /* Look up the exclude ranges list while locating the memory hole */
+ emem = &(kbuf->image->arch.exclude_ranges);
+ if (!(*emem) || ((*emem)->nr_ranges == 0)) {
+ pr_warn("No exclude range list. Using the default locate mem hole method\n");
+ return kexec_locate_mem_hole(kbuf);
+ }
+
+ /* Segments for kdump kernel should be within crashkernel region */
+ buf_min = (kbuf->buf_min < crashk_res.start ?
+ crashk_res.start : kbuf->buf_min);
+ buf_max = (kbuf->buf_max > crashk_res.end ?
+ crashk_res.end : kbuf->buf_max);
+
+ if (buf_min > buf_max) {
+ pr_err("Invalid buffer min and/or max values\n");
+ return -EINVAL;
+ }
+
+ if (kbuf->top_down)
+ ret = locate_mem_hole_top_down_ppc64(kbuf, buf_min, buf_max,
+ *emem);
+ else
+ ret = locate_mem_hole_bottom_up_ppc64(kbuf, buf_min, buf_max,
+ *emem);
+
+ /* Add the buffer allocated to the exclude list for the next lookup */
+ if (!ret) {
+ add_mem_range(emem, kbuf->mem, kbuf->memsz);
+ sort_memory_ranges(*emem, true);
+ } else {
+ pr_err("Failed to locate memory buffer of size %lu\n",
+ kbuf->memsz);
+ }
+ return ret;
+}
+
/**
* arch_kexec_kernel_image_probe - Does additional handling needed to setup
* kexec segments.
@@ -79,9 +390,31 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
unsigned long buf_len)
{
- /* We don't support crash kernels yet. */
- if (image->type == KEXEC_TYPE_CRASH)
+ if (image->type == KEXEC_TYPE_CRASH) {
+ int ret;
+
+ /* Get exclude memory ranges needed for setting up kdump segments */
+ ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
+ if (ret)
+ pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
+ /* Return this until all changes for panic kernel are in */
return -EOPNOTSUPP;
+ }
return kexec_image_probe_default(image, buf, buf_len);
}
+
+/**
+ * arch_kimage_file_post_load_cleanup - Frees up all the allocations done
+ * while loading the image.
+ * @image: kexec image being loaded.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ kfree(image->arch.exclude_ranges);
+ image->arch.exclude_ranges = NULL;
+
+ return kexec_image_post_load_cleanup_default(image);
+}
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
index dc3ce036f416..6b81c852feab 100644
--- a/arch/powerpc/kexec/ranges.c
+++ b/arch/powerpc/kexec/ranges.c
@@ -233,3 +233,180 @@ int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
return __add_mem_range(mem_ranges, base, size);
}
+
+/**
+ * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range(s) to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_tce_mem_ranges(struct crash_mem **mem_ranges)
+{
+ struct device_node *dn = NULL;
+ int ret = 0;
+
+ for_each_node_by_type(dn, "pci") {
+ u64 base;
+ u32 size;
+
+ ret = of_property_read_u64(dn, "linux,tce-base", &base);
+ ret |= of_property_read_u32(dn, "linux,tce-size", &size);
+ if (ret) {
+ /*
+ * It is ok to have pci nodes without tce. So, ignore
+ * property does not exist error.
+ */
+ if (ret == -EINVAL) {
+ ret = 0;
+ continue;
+ }
+ break;
+ }
+
+ ret = add_mem_range(mem_ranges, base, size);
+ if (ret)
+ break;
+ }
+
+ of_node_put(dn);
+ return ret;
+}
+
+/**
+ * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
+ * if the initrd was retained.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_initrd_mem_range(struct crash_mem **mem_ranges)
+{
+ u64 base, end;
+ int ret;
+
+ /* This range means something, only if initrd was retained */
+ if (!strstr(saved_command_line, "retain_initrd"))
+ return 0;
+
+ ret = of_property_read_u64(of_chosen, "linux,initrd-start", &base);
+ ret |= of_property_read_u64(of_chosen, "linux,initrd-end", &end);
+ if (!ret)
+ ret = add_mem_range(mem_ranges, base, end - base + 1);
+
+ return ret;
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/**
+ * add_htab_mem_range - Adds htab range to the given memory ranges list,
+ * if it exists
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_htab_mem_range(struct crash_mem **mem_ranges)
+{
+ if (!htab_address)
+ return 0;
+
+ return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
+}
+#endif
+
+/**
+ * add_kernel_mem_range - Adds kernel text region to the given
+ * memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_kernel_mem_range(struct crash_mem **mem_ranges)
+{
+ return add_mem_range(mem_ranges, 0, __pa(_end));
+}
+
+/**
+ * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_rtas_mem_range(struct crash_mem **mem_ranges)
+{
+ struct device_node *dn;
+ u32 base, size;
+ int ret = 0;
+
+ dn = of_find_node_by_path("/rtas");
+ if (!dn)
+ return 0;
+
+ ret = of_property_read_u32(dn, "linux,rtas-base", &base);
+ ret |= of_property_read_u32(dn, "rtas-size", &size);
+ if (!ret)
+ ret = add_mem_range(mem_ranges, base, size);
+
+ of_node_put(dn);
+ return ret;
+}
+
+/**
+ * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_opal_mem_range(struct crash_mem **mem_ranges)
+{
+ struct device_node *dn;
+ u64 base, size;
+ int ret;
+
+ dn = of_find_node_by_path("/ibm,opal");
+ if (!dn)
+ return 0;
+
+ ret = of_property_read_u64(dn, "opal-base-address", &base);
+ ret |= of_property_read_u64(dn, "opal-runtime-size", &size);
+ if (!ret)
+ ret = add_mem_range(mem_ranges, base, size);
+
+ of_node_put(dn);
+ return ret;
+}
+
+/**
+ * add_reserved_mem_ranges - Adds "/reserved-ranges" regions exported by f/w
+ * to the given memory ranges list.
+ * @mem_ranges: Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_reserved_mem_ranges(struct crash_mem **mem_ranges)
+{
+ int n_mem_addr_cells, n_mem_size_cells, i, len, cells, ret = 0;
+ const __be32 *prop;
+
+ prop = of_get_property(of_root, "reserved-ranges", &len);
+ if (!prop)
+ return 0;
+
+ n_mem_addr_cells = of_n_addr_cells(of_root);
+ n_mem_size_cells = of_n_size_cells(of_root);
+ cells = n_mem_addr_cells + n_mem_size_cells;
+
+ /* Each reserved range is an (address,size) pair */
+ for (i = 0; i < (len / (sizeof(u32) * cells)); i++) {
+ u64 base, size;
+
+ base = of_read_number(prop + (i * cells), n_mem_addr_cells);
+ size = of_read_number(prop + (i * cells) + n_mem_addr_cells,
+ n_mem_size_cells);
+
+ ret = add_mem_range(mem_ranges, base, size);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
^ permalink raw reply related
* [PATCH v6 03/11] powerpc/kexec_file: add helper functions for getting memory ranges
From: Hari Bathini @ 2020-07-29 11:40 UTC (permalink / raw)
To: Michael Ellerman
Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
Thiago Jung Bauermann, Andrew Morton, Dave Young, Vivek Goyal,
Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>
In kexec case, the kernel to be loaded uses the same memory layout as
the running kernel. So, passing on the DT of the running kernel would
be good enough.
But in case of kdump, different memory ranges are needed to manage
loading the kdump kernel, booting into it and exporting the elfcore
of the crashing kernel. The ranges are exclude memory ranges, usable
memory ranges, reserved memory ranges and crash memory ranges.
Exclude memory ranges specify the list of memory ranges to avoid while
loading kdump segments. Usable memory ranges list the memory ranges
that could be used for booting kdump kernel. Reserved memory ranges
list the memory regions for the loading kernel's reserve map. Crash
memory ranges list the memory ranges to be exported as the crashing
kernel's elfcore.
Add helper functions for setting up the above mentioned memory ranges.
These helpers facilitate understanding the subsequent changes better
and make it easy to set up the different memory ranges listed above, as
and when appropriate.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
v5 -> v6:
* Dropped email address from copyright header of the new file being
added: arch/powerpc/kexec/ranges.c
* Changed mrngs to mem_rngs. Using the convention mem_ranges for
'struct crash_mem **' types & mem_rngs for 'struct crash_mem *'
for easy readability.
* Updated add_opal_mem_range() & add_rtas_mem_range() functions without
goto statements.
* Moved implementation of all add_foo_mem_range(s)() functions to
patch 04/11, where they are used.
* Fixed reference count leak in add_tce_mem_ranges() function and also
updated error handling in reading tce table base & sizes.
v4 -> v5:
* Added Reviewed-by tag from Thiago.
* Added the missing "#ifdef CONFIG_PPC_BOOK3S_64" around add_htab_mem_range()
function in arch/powerpc/kexec/ranges.c file.
* add_tce_mem_ranges() function returned error when tce table is not found
in a pci node. This is wrong as pci nodes may not always have tce tables
(KVM guests, for example). Fixed it by ignoring error in reading tce
table base/size while returning from the function.
v3 -> v4:
* Updated sort_memory_ranges() function to reuse sort() from lib/sort.c
and addressed other review comments from Thiago.
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* Added an option to merge ranges while sorting to minimize reallocations
for memory ranges list.
* Dropped within_crashkernel option for add_opal_mem_range() &
add_rtas_mem_range() as it is not really needed.
arch/powerpc/include/asm/kexec_ranges.h | 11 +
arch/powerpc/kexec/Makefile | 2
arch/powerpc/kexec/ranges.c | 235 +++++++++++++++++++++++++++++++
3 files changed, 247 insertions(+), 1 deletion(-)
create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
create mode 100644 arch/powerpc/kexec/ranges.c
diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h
new file mode 100644
index 000000000000..35ae31a7a4de
--- /dev/null
+++ b/arch/powerpc/include/asm/kexec_ranges.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_KEXEC_RANGES_H
+#define _ASM_POWERPC_KEXEC_RANGES_H
+
+#define MEM_RANGE_CHUNK_SZ 2048 /* Memory ranges size chunk */
+
+void sort_memory_ranges(struct crash_mem *mrngs, bool merge);
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges);
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size);
+
+#endif /* _ASM_POWERPC_KEXEC_RANGES_H */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 67c355329457..4aff6846c772 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y += core.o crash.o core_$(BITS).o
obj-$(CONFIG_PPC32) += relocate_32.o
-obj-$(CONFIG_KEXEC_FILE) += file_load.o file_load_$(BITS).o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o
ifdef CONFIG_HAVE_IMA_KEXEC
ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
new file mode 100644
index 000000000000..dc3ce036f416
--- /dev/null
+++ b/arch/powerpc/kexec/ranges.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * powerpc code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "kexec ranges: " fmt
+
+#include <linux/sort.h>
+#include <linux/kexec.h>
+#include <linux/of_device.h>
+#include <linux/slab.h>
+#include <asm/sections.h>
+#include <asm/kexec_ranges.h>
+
+/**
+ * get_max_nr_ranges - Get the max no. of ranges crash_mem structure
+ * could hold, given the size allocated for it.
+ * @size: Allocation size of crash_mem structure.
+ *
+ * Returns the maximum no. of ranges.
+ */
+static inline unsigned int get_max_nr_ranges(size_t size)
+{
+ return ((size - sizeof(struct crash_mem)) /
+ sizeof(struct crash_mem_range));
+}
+
+/**
+ * get_mem_rngs_size - Get the allocated size of mem_rngs based on
+ * max_nr_ranges and chunk size.
+ * @mem_rngs: Memory ranges.
+ *
+ * Returns the maximum size of @mem_rngs.
+ */
+static inline size_t get_mem_rngs_size(struct crash_mem *mem_rngs)
+{
+ size_t size;
+
+ if (!mem_rngs)
+ return 0;
+
+ size = (sizeof(struct crash_mem) +
+ (mem_rngs->max_nr_ranges * sizeof(struct crash_mem_range)));
+
+ /*
+ * Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
+ * So, align to get the actual length.
+ */
+ return ALIGN(size, MEM_RANGE_CHUNK_SZ);
+}
+
+/**
+ * __add_mem_range - add a memory range to memory ranges list.
+ * @mem_ranges: Range list to add the memory range to.
+ * @base: Base address of the range to add.
+ * @size: Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+ struct crash_mem *mem_rngs = *mem_ranges;
+
+ if (!mem_rngs || (mem_rngs->nr_ranges == mem_rngs->max_nr_ranges)) {
+ mem_rngs = realloc_mem_ranges(mem_ranges);
+ if (!mem_rngs)
+ return -ENOMEM;
+ }
+
+ mem_rngs->ranges[mem_rngs->nr_ranges].start = base;
+ mem_rngs->ranges[mem_rngs->nr_ranges].end = base + size - 1;
+ pr_debug("Added memory range [%#016llx - %#016llx] at index %d\n",
+ base, base + size - 1, mem_rngs->nr_ranges);
+ mem_rngs->nr_ranges++;
+ return 0;
+}
+
+/**
+ * __merge_memory_ranges - Merges the given memory ranges list.
+ * @mem_rngs: Range list to merge.
+ *
+ * Assumes a sorted range list.
+ *
+ * Returns nothing.
+ */
+static void __merge_memory_ranges(struct crash_mem *mem_rngs)
+{
+ struct crash_mem_range *ranges;
+ int i, idx;
+
+ if (!mem_rngs)
+ return;
+
+ idx = 0;
+ ranges = &(mem_rngs->ranges[0]);
+ for (i = 1; i < mem_rngs->nr_ranges; i++) {
+ if (ranges[i].start <= (ranges[i-1].end + 1))
+ ranges[idx].end = ranges[i].end;
+ else {
+ idx++;
+ if (i == idx)
+ continue;
+
+ ranges[idx] = ranges[i];
+ }
+ }
+ mem_rngs->nr_ranges = idx + 1;
+}
+
+/* cmp_func_t callback to sort ranges with sort() */
+static int rngcmp(const void *_x, const void *_y)
+{
+ const struct crash_mem_range *x = _x, *y = _y;
+
+ if (x->start > y->start)
+ return 1;
+ if (x->start < y->start)
+ return -1;
+ return 0;
+}
+
+/**
+ * sort_memory_ranges - Sorts the given memory ranges list.
+ * @mem_rngs: Range list to sort.
+ * @merge: If true, merge the list after sorting.
+ *
+ * Returns nothing.
+ */
+void sort_memory_ranges(struct crash_mem *mem_rngs, bool merge)
+{
+ int i;
+
+ if (!mem_rngs)
+ return;
+
+ /* Sort the ranges in-place */
+ sort(&(mem_rngs->ranges[0]), mem_rngs->nr_ranges,
+ sizeof(mem_rngs->ranges[0]), rngcmp, NULL);
+
+ if (merge)
+ __merge_memory_ranges(mem_rngs);
+
+ /* For debugging purpose */
+ pr_debug("Memory ranges:\n");
+ for (i = 0; i < mem_rngs->nr_ranges; i++) {
+ pr_debug("\t[%03d][%#016llx - %#016llx]\n", i,
+ mem_rngs->ranges[i].start,
+ mem_rngs->ranges[i].end);
+ }
+}
+
+/**
+ * realloc_mem_ranges - reallocate mem_ranges with size incremented
+ * by MEM_RANGE_CHUNK_SZ. Frees up the old memory,
+ * if memory allocation fails.
+ * @mem_ranges: Memory ranges to reallocate.
+ *
+ * Returns pointer to reallocated memory on success, NULL otherwise.
+ */
+struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges)
+{
+ struct crash_mem *mem_rngs = *mem_ranges;
+ unsigned int nr_ranges;
+ size_t size;
+
+ size = get_mem_rngs_size(mem_rngs);
+ nr_ranges = mem_rngs ? mem_rngs->nr_ranges : 0;
+
+ size += MEM_RANGE_CHUNK_SZ;
+ mem_rngs = krealloc(*mem_ranges, size, GFP_KERNEL);
+ if (!mem_rngs) {
+ kfree(*mem_ranges);
+ *mem_ranges = NULL;
+ return NULL;
+ }
+
+ mem_rngs->nr_ranges = nr_ranges;
+ mem_rngs->max_nr_ranges = get_max_nr_ranges(size);
+ *mem_ranges = mem_rngs;
+
+ return mem_rngs;
+}
+
+/**
+ * add_mem_range - Updates existing memory range, if there is an overlap.
+ * Else, adds a new memory range.
+ * @mem_ranges: Range list to add the memory range to.
+ * @base: Base address of the range to add.
+ * @size: Size of the memory range to add.
+ *
+ * (Re)allocates memory, if needed.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
+{
+ struct crash_mem *mem_rngs = *mem_ranges;
+ u64 mstart, mend, end;
+ unsigned int i;
+
+ if (!size)
+ return 0;
+
+ end = base + size - 1;
+
+ if (!mem_rngs || !(mem_rngs->nr_ranges))
+ return __add_mem_range(mem_ranges, base, size);
+
+ for (i = 0; i < mem_rngs->nr_ranges; i++) {
+ mstart = mem_rngs->ranges[i].start;
+ mend = mem_rngs->ranges[i].end;
+ if (base < mend && end > mstart) {
+ if (base < mstart)
+ mem_rngs->ranges[i].start = base;
+ if (end > mend)
+ mem_rngs->ranges[i].end = end;
+ return 0;
+ }
+ }
+
+ return __add_mem_range(mem_ranges, base, size);
+}
^ permalink raw reply related
* [PATCH v6 02/11] powerpc/kexec_file: mark PPC64 specific code
From: Hari Bathini @ 2020-07-29 11:39 UTC (permalink / raw)
To: Michael Ellerman
Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
Thiago Jung Bauermann, Andrew Morton, Laurent Dufour, Dave Young,
Vivek Goyal, Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>
Some of the kexec_file_load code isn't PPC64 specific. Move PPC64
specific code from kexec/file_load.c to kexec/file_load_64.c. Also,
rename purgatory/trampoline.S to purgatory/trampoline_64.S in the
same spirit. No functional changes.
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
v5 -> v6:
* Dropped email address from copyright header of the new file being
added: arch/powerpc/kexec/file_load_64.c
v4 -> v5:
* Unchanged.
v3 -> v4:
* Moved common code back to set_new_fdt() from setup_new_fdt_ppc64()
function. Added Reviewed-by tags from Laurent & Thiago.
v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.
v1 -> v2:
* No changes.
arch/powerpc/include/asm/kexec.h | 9 ++
arch/powerpc/kexec/Makefile | 2 -
arch/powerpc/kexec/elf_64.c | 7 +-
arch/powerpc/kexec/file_load.c | 19 +----
arch/powerpc/kexec/file_load_64.c | 87 ++++++++++++++++++++++++
arch/powerpc/purgatory/Makefile | 4 +
arch/powerpc/purgatory/trampoline.S | 117 --------------------------------
arch/powerpc/purgatory/trampoline_64.S | 117 ++++++++++++++++++++++++++++++++
8 files changed, 222 insertions(+), 140 deletions(-)
create mode 100644 arch/powerpc/kexec/file_load_64.c
delete mode 100644 arch/powerpc/purgatory/trampoline.S
create mode 100644 arch/powerpc/purgatory/trampoline_64.S
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index c68476818753..ac8fd4839171 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -116,6 +116,15 @@ int setup_new_fdt(const struct kimage *image, void *fdt,
unsigned long initrd_load_addr, unsigned long initrd_len,
const char *cmdline);
int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
+
+#ifdef CONFIG_PPC64
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+ const void *fdt, unsigned long kernel_load_addr,
+ unsigned long fdt_load_addr);
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr,
+ unsigned long initrd_len, const char *cmdline);
+#endif /* CONFIG_PPC64 */
#endif /* CONFIG_KEXEC_FILE */
#else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile
index 86380c69f5ce..67c355329457 100644
--- a/arch/powerpc/kexec/Makefile
+++ b/arch/powerpc/kexec/Makefile
@@ -7,7 +7,7 @@ obj-y += core.o crash.o core_$(BITS).o
obj-$(CONFIG_PPC32) += relocate_32.o
-obj-$(CONFIG_KEXEC_FILE) += file_load.o elf_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE) += file_load.o file_load_$(BITS).o elf_$(BITS).o
ifdef CONFIG_HAVE_IMA_KEXEC
ifdef CONFIG_IMA
diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c
index 3072fd6dbe94..23ad04ccaf8e 100644
--- a/arch/powerpc/kexec/elf_64.c
+++ b/arch/powerpc/kexec/elf_64.c
@@ -88,7 +88,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
goto out;
}
- ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+ ret = setup_new_fdt_ppc64(image, fdt, initrd_load_addr,
+ initrd_len, cmdline);
if (ret)
goto out;
@@ -107,8 +108,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
- ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
- fdt_load_addr);
+ ret = setup_purgatory_ppc64(image, slave_code, fdt, kernel_load_addr,
+ fdt_load_addr);
if (ret)
pr_err("Error setting up the purgatory.\n");
diff --git a/arch/powerpc/kexec/file_load.c b/arch/powerpc/kexec/file_load.c
index 143c91724617..38439aba27d7 100644
--- a/arch/powerpc/kexec/file_load.c
+++ b/arch/powerpc/kexec/file_load.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * ppc64 code to implement the kexec_file_load syscall
+ * powerpc code to implement the kexec_file_load syscall
*
* Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
* Copyright (C) 2004 IBM Corp.
@@ -20,22 +20,7 @@
#include <linux/libfdt.h>
#include <asm/ima.h>
-#define SLAVE_CODE_SIZE 256
-
-const struct kexec_file_ops * const kexec_file_loaders[] = {
- &kexec_elf64_ops,
- NULL
-};
-
-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- /* We don't support crash kernels yet. */
- if (image->type == KEXEC_TYPE_CRASH)
- return -EOPNOTSUPP;
-
- return kexec_image_probe_default(image, buf, buf_len);
-}
+#define SLAVE_CODE_SIZE 256 /* First 0x100 bytes */
/**
* setup_purgatory - initialize the purgatory's global variables
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
new file mode 100644
index 000000000000..3e9ac5f216b0
--- /dev/null
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ppc64 code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2004,2005 Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2020 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-ppc64.c, kexec-elf-rel-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Hari Bathini, IBM Corporation.
+ */
+
+#include <linux/kexec.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+
+const struct kexec_file_ops * const kexec_file_loaders[] = {
+ &kexec_elf64_ops,
+ NULL
+};
+
+/**
+ * setup_purgatory_ppc64 - initialize PPC64 specific purgatory's global
+ * variables and call setup_purgatory() to initialize
+ * common global variable.
+ * @image: kexec image.
+ * @slave_code: Slave code for the purgatory.
+ * @fdt: Flattened device tree for the next kernel.
+ * @kernel_load_addr: Address where the kernel is loaded.
+ * @fdt_load_addr: Address where the flattened device tree is loaded.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+ const void *fdt, unsigned long kernel_load_addr,
+ unsigned long fdt_load_addr)
+{
+ int ret;
+
+ ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
+ fdt_load_addr);
+ if (ret)
+ pr_err("Failed to setup purgatory symbols");
+ return ret;
+}
+
+/**
+ * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel
+ * being loaded.
+ * @image: kexec image being loaded.
+ * @fdt: Flattened device tree for the next kernel.
+ * @initrd_load_addr: Address where the next initrd will be loaded.
+ * @initrd_len: Size of the next initrd, or 0 if there will be none.
+ * @cmdline: Command line for the next kernel, or NULL if there will
+ * be none.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
+ unsigned long initrd_load_addr,
+ unsigned long initrd_len, const char *cmdline)
+{
+ return setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
+}
+
+/**
+ * arch_kexec_kernel_image_probe - Does additional handling needed to setup
+ * kexec segments.
+ * @image: kexec image being loaded.
+ * @buf: Buffer pointing to elf data.
+ * @buf_len: Length of the buffer.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ /* We don't support crash kernels yet. */
+ if (image->type == KEXEC_TYPE_CRASH)
+ return -EOPNOTSUPP;
+
+ return kexec_image_probe_default(image, buf, buf_len);
+}
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index 7c6d8b14f440..348f59581052 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -2,11 +2,11 @@
KASAN_SANITIZE := n
-targets += trampoline.o purgatory.ro kexec-purgatory.c
+targets += trampoline_$(BITS).o purgatory.ro kexec-purgatory.c
LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
-$(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
+$(obj)/purgatory.ro: $(obj)/trampoline_$(BITS).o FORCE
$(call if_changed,ld)
quiet_cmd_bin2c = BIN2C $@
diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
deleted file mode 100644
index a5a83c3f53e6..000000000000
--- a/arch/powerpc/purgatory/trampoline.S
+++ /dev/null
@@ -1,117 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * kexec trampoline
- *
- * Based on code taken from kexec-tools and kexec-lite.
- *
- * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
- * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
- * Copyright (C) 2013, Anton Blanchard, IBM Corporation
- */
-
-#include <asm/asm-compat.h>
-
- .machine ppc64
- .balign 256
- .globl purgatory_start
-purgatory_start:
- b master
-
- /* ABI: possible run_at_load flag at 0x5c */
- .org purgatory_start + 0x5c
- .globl run_at_load
-run_at_load:
- .long 0
- .size run_at_load, . - run_at_load
-
- /* ABI: slaves start at 60 with r3=phys */
- .org purgatory_start + 0x60
-slave:
- b .
- /* ABI: end of copied region */
- .org purgatory_start + 0x100
- .size purgatory_start, . - purgatory_start
-
-/*
- * The above 0x100 bytes at purgatory_start are replaced with the
- * code from the kernel (or next stage) by setup_purgatory().
- */
-
-master:
- or %r1,%r1,%r1 /* low priority to let other threads catchup */
- isync
- mr %r17,%r3 /* save cpu id to r17 */
- mr %r15,%r4 /* save physical address in reg15 */
-
- or %r3,%r3,%r3 /* ok now to high priority, lets boot */
- lis %r6,0x1
- mtctr %r6 /* delay a bit for slaves to catch up */
- bdnz . /* before we overwrite 0-100 again */
-
- bl 0f /* Work out where we're running */
-0: mflr %r18
-
- /* load device-tree address */
- ld %r3, (dt_offset - 0b)(%r18)
- mr %r16,%r3 /* save dt address in reg16 */
- li %r4,20
- LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */
- cmpwi %cr0,%r6,2 /* v2 or later? */
- blt 1f
- li %r4,28
- STWX_BE %r17,%r3,%r4 /* Store my cpu as __be32 at byte 28 */
-1:
- /* load the kernel address */
- ld %r4,(kernel - 0b)(%r18)
-
- /* load the run_at_load flag */
- /* possibly patched by kexec */
- ld %r6,(run_at_load - 0b)(%r18)
- /* and patch it into the kernel */
- stw %r6,(0x5c)(%r4)
-
- mr %r3,%r16 /* restore dt address */
-
- li %r5,0 /* r5 will be 0 for kernel */
-
- mfmsr %r11
- andi. %r10,%r11,1 /* test MSR_LE */
- bne .Little_endian
-
- mtctr %r4 /* prepare branch to */
- bctr /* start kernel */
-
-.Little_endian:
- mtsrr0 %r4 /* prepare branch to */
-
- clrrdi %r11,%r11,1 /* clear MSR_LE */
- mtsrr1 %r11
-
- rfid /* update MSR and start kernel */
-
-
- .balign 8
- .globl kernel
-kernel:
- .8byte 0x0
- .size kernel, . - kernel
-
- .balign 8
- .globl dt_offset
-dt_offset:
- .8byte 0x0
- .size dt_offset, . - dt_offset
-
-
- .data
- .balign 8
-.globl purgatory_sha256_digest
-purgatory_sha256_digest:
- .skip 32
- .size purgatory_sha256_digest, . - purgatory_sha256_digest
-
- .balign 8
-.globl purgatory_sha_regions
-purgatory_sha_regions:
- .skip 8 * 2 * 16
- .size purgatory_sha_regions, . - purgatory_sha_regions
diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
new file mode 100644
index 000000000000..a5a83c3f53e6
--- /dev/null
+++ b/arch/powerpc/purgatory/trampoline_64.S
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * kexec trampoline
+ *
+ * Based on code taken from kexec-tools and kexec-lite.
+ *
+ * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
+ * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
+ * Copyright (C) 2013, Anton Blanchard, IBM Corporation
+ */
+
+#include <asm/asm-compat.h>
+
+ .machine ppc64
+ .balign 256
+ .globl purgatory_start
+purgatory_start:
+ b master
+
+ /* ABI: possible run_at_load flag at 0x5c */
+ .org purgatory_start + 0x5c
+ .globl run_at_load
+run_at_load:
+ .long 0
+ .size run_at_load, . - run_at_load
+
+ /* ABI: slaves start at 60 with r3=phys */
+ .org purgatory_start + 0x60
+slave:
+ b .
+ /* ABI: end of copied region */
+ .org purgatory_start + 0x100
+ .size purgatory_start, . - purgatory_start
+
+/*
+ * The above 0x100 bytes at purgatory_start are replaced with the
+ * code from the kernel (or next stage) by setup_purgatory().
+ */
+
+master:
+ or %r1,%r1,%r1 /* low priority to let other threads catchup */
+ isync
+ mr %r17,%r3 /* save cpu id to r17 */
+ mr %r15,%r4 /* save physical address in reg15 */
+
+ or %r3,%r3,%r3 /* ok now to high priority, lets boot */
+ lis %r6,0x1
+ mtctr %r6 /* delay a bit for slaves to catch up */
+ bdnz . /* before we overwrite 0-100 again */
+
+ bl 0f /* Work out where we're running */
+0: mflr %r18
+
+ /* load device-tree address */
+ ld %r3, (dt_offset - 0b)(%r18)
+ mr %r16,%r3 /* save dt address in reg16 */
+ li %r4,20
+ LWZX_BE %r6,%r3,%r4 /* fetch __be32 version number at byte 20 */
+ cmpwi %cr0,%r6,2 /* v2 or later? */
+ blt 1f
+ li %r4,28
+ STWX_BE %r17,%r3,%r4 /* Store my cpu as __be32 at byte 28 */
+1:
+ /* load the kernel address */
+ ld %r4,(kernel - 0b)(%r18)
+
+ /* load the run_at_load flag */
+ /* possibly patched by kexec */
+ ld %r6,(run_at_load - 0b)(%r18)
+ /* and patch it into the kernel */
+ stw %r6,(0x5c)(%r4)
+
+ mr %r3,%r16 /* restore dt address */
+
+ li %r5,0 /* r5 will be 0 for kernel */
+
+ mfmsr %r11
+ andi. %r10,%r11,1 /* test MSR_LE */
+ bne .Little_endian
+
+ mtctr %r4 /* prepare branch to */
+ bctr /* start kernel */
+
+.Little_endian:
+ mtsrr0 %r4 /* prepare branch to */
+
+ clrrdi %r11,%r11,1 /* clear MSR_LE */
+ mtsrr1 %r11
+
+ rfid /* update MSR and start kernel */
+
+
+ .balign 8
+ .globl kernel
+kernel:
+ .8byte 0x0
+ .size kernel, . - kernel
+
+ .balign 8
+ .globl dt_offset
+dt_offset:
+ .8byte 0x0
+ .size dt_offset, . - dt_offset
+
+
+ .data
+ .balign 8
+.globl purgatory_sha256_digest
+purgatory_sha256_digest:
+ .skip 32
+ .size purgatory_sha256_digest, . - purgatory_sha256_digest
+
+ .balign 8
+.globl purgatory_sha_regions
+purgatory_sha_regions:
+ .skip 8 * 2 * 16
+ .size purgatory_sha_regions, . - purgatory_sha_regions
^ permalink raw reply related
* [PATCH v6 01/11] kexec_file: allow archs to handle special regions while locating memory hole
From: Hari Bathini @ 2020-07-29 11:39 UTC (permalink / raw)
To: Michael Ellerman
Cc: kernel test robot, Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain,
Petr Tesarik, Mahesh J Salgaonkar, Sourabh Jain, lkml,
linuxppc-dev, Vivek Goyal, Andrew Morton, Dave Young,
Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159602259854.575379.16910915605574571585.stgit@hbathini>
Some architectures may have special memory regions, within the given
memory range, which can't be used for the buffer in a kexec segment.
Implement weak arch_kexec_locate_mem_hole() definition which arch code
may override, to take care of special regions, while trying to locate
a memory hole.
Also, add the missing declarations for arch overridable functions and
drop the __weak descriptors in the declarations to avoid non-weak
definitions from becoming weak.
Reported-by: kernel test robot <lkp@intel.com>
[lkp: In v1, arch_kimage_file_post_load_cleanup() declaration was missing]
Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com>
Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
---
v5 -> v6:
* Unchanged.
v4 -> v5:
* Unchanged.
v3 -> v4:
* Unchanged. Added Reviewed-by tag from Thiago.
v2 -> v3:
* Unchanged. Added Acked-by & Tested-by tags from Dave & Pingfan.
v1 -> v2:
* Introduced arch_kexec_locate_mem_hole() for override and dropped
weak arch_kexec_add_buffer().
* Dropped __weak identifier for arch overridable functions.
* Fixed the missing declaration for arch_kimage_file_post_load_cleanup()
reported by lkp. lkp report for reference:
- https://lore.kernel.org/patchwork/patch/1264418/
include/linux/kexec.h | 29 ++++++++++++++++++-----------
kernel/kexec_file.c | 16 ++++++++++++++--
2 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ea67910ae6b7..9e93bef52968 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -183,17 +183,24 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
bool get_value);
void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len);
-void * __weak arch_kexec_kernel_image_load(struct kimage *image);
-int __weak arch_kexec_apply_relocations_add(struct purgatory_info *pi,
- Elf_Shdr *section,
- const Elf_Shdr *relsec,
- const Elf_Shdr *symtab);
-int __weak arch_kexec_apply_relocations(struct purgatory_info *pi,
- Elf_Shdr *section,
- const Elf_Shdr *relsec,
- const Elf_Shdr *symtab);
+/* Architectures may override the below functions */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len);
+void *arch_kexec_kernel_image_load(struct kimage *image);
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+int arch_kexec_apply_relocations(struct purgatory_info *pi,
+ Elf_Shdr *section,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+int arch_kimage_file_post_load_cleanup(struct kimage *image);
+#ifdef CONFIG_KEXEC_SIG
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len);
+#endif
+int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
extern int kexec_add_buffer(struct kexec_buf *kbuf);
int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 09cc78df53c6..e89912d33a27 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -635,6 +635,19 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
return ret == 1 ? 0 : -EADDRNOTAVAIL;
}
+/**
+ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+ * @kbuf: Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+ return kexec_locate_mem_hole(kbuf);
+}
+
/**
* kexec_add_buffer - place a buffer in a kexec segment
* @kbuf: Buffer contents and memory parameters.
@@ -647,7 +660,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
*/
int kexec_add_buffer(struct kexec_buf *kbuf)
{
-
struct kexec_segment *ksegment;
int ret;
@@ -675,7 +687,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf)
kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
/* Walk the RAM ranges and allocate a suitable range for the buffer */
- ret = kexec_locate_mem_hole(kbuf);
+ ret = arch_kexec_locate_mem_hole(kbuf);
if (ret)
return ret;
^ permalink raw reply related
* [PATCH v6 00/11] ppc64: enable kdump support for kexec_file_load syscall
From: Hari Bathini @ 2020-07-29 11:38 UTC (permalink / raw)
To: Michael Ellerman
Cc: kernel test robot, Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain,
Petr Tesarik, Mahesh J Salgaonkar, Sourabh Jain, lkml,
linuxppc-dev, Vivek Goyal, Andrew Morton, Laurent Dufour,
Dave Young, Thiago Jung Bauermann, Eric Biederman
Sorry! There was a gateway issue on my system while posting v5, due to
which some patches did not make it through. Resending...
This patch series enables kdump support for kexec_file_load system
call (kexec -s -p) on PPC64. The changes are inspired from kexec-tools
code but heavily modified for kernel consumption.
The first patch adds a weak arch_kexec_locate_mem_hole() function that
archs can override to make the locate-memory-hole logic suit their needs. There are some
special regions in ppc64 which should be avoided while loading buffer
& there are multiple callers to kexec_add_buffer making it complicated
to maintain range sanity and using generic lookup at the same time.
The second patch marks ppc64 specific code within arch/powerpc/kexec
and arch/powerpc/purgatory to make the subsequent code changes easy
to understand.
The next patch adds helper functions to set up the different memory ranges
needed for loading kdump kernel, booting into it and exporting the
crashing kernel's elfcore.
The fourth patch overrides arch_kexec_locate_mem_hole() function to
locate memory hole for kdump segments by accounting for the special
memory regions, referred to as excluded memory ranges, and sets
kbuf->mem when a suitable memory region is found.
The fifth patch moves walk_drmem_lmbs() out of .init section with
a few changes to reuse it for setting up kdump kernel's usable memory
ranges. The next patch uses walk_drmem_lmbs() to look up the LMBs
and set linux,drconf-usable-memory & linux,usable-memory properties
in order to restrict kdump kernel's memory usage.
The next patch sets up the backup region as a kexec segment while
loading kdump kernel and teaches purgatory to copy data from source
to destination.
Patch 09 builds the elfcore header for the running kernel & passes
the info to kdump kernel via "elfcorehdr=" parameter to export as
/proc/vmcore file. The next patch sets up the memory reserve map
for the kexec kernel and also claims kdump support for kexec_file_load, as
all the necessary changes are added by then.
The next patch fixes a lookup issue for `kexec -l -s` case when
memory is reserved for crashkernel.
The last patch updates purgatory to setup r8 & r9 with opal base
and opal entry addresses respectively to aid kernels built with
CONFIG_PPC_EARLY_DEBUG_OPAL enabled.
Tested the changes successfully on P8, P9 lpars, a couple of OpenPOWER
boxes, one with secureboot enabled, KVM guest and a simulator.
v5 -> v6:
* Fixed reference count leak in add_tce_mem_ranges() function and also
updated error handling in reading tce table base & sizes.
* Instead of trying to reinvent the wheel with get_node_path() &
get_node_path_size() functions, used %pOF format as suggested by mpe.
* Moved patch 07/11 to end of the series for mpe to take a call on
whether to have it or not.
v4 -> v5:
* Dropped patches 07/12 & 08/12 and updated purgatory to do everything
in assembly.
* Added a new patch (which was part of patch 08/12 in v4) to update
r8 & r9 registers with opal base & opal entry addresses as it is
expected on kernels built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled.
* Fixed kexec load issue on KVM guest.
v3 -> v4:
* Updated get_node_path() function to be iterative instead of a recursive one.
* Added comment explaining why low memory is added to kdump kernel's usable
memory ranges though it doesn't fall in crashkernel region.
* Fixed stack_buf to be quadword aligned in accordance with ABI.
* Added missing of_node_put() in setup_purgatory_ppc64().
* Added a FIXME tag to indicate issue in adding opal/rtas regions to
core image.
v2 -> v3:
* Fixed TOC pointer calculation for purgatory by using section info
that has relocations applied.
* Fixed arch_kexec_locate_mem_hole() function to fall back to the generic
kexec_locate_mem_hole() lookup if exclude ranges list is empty.
* Dropped check for backup_start in trampoline_64.S as purgatory()
function takes care of it anyway.
v1 -> v2:
* Introduced arch_kexec_locate_mem_hole() for override and dropped
weak arch_kexec_add_buffer().
* Addressed warnings reported by lkp.
* Added patch to address kexec load issue when memory is reserved
for crashkernel.
* Used the appropriate license header for the new files added.
* Added an option to merge ranges to minimize reallocations while
adding memory ranges.
* Dropped within_crashkernel parameter for add_opal_mem_range() &
add_rtas_mem_range() functions as it is not really needed.
---
Hari Bathini (11):
kexec_file: allow archs to handle special regions while locating memory hole
powerpc/kexec_file: mark PPC64 specific code
powerpc/kexec_file: add helper functions for getting memory ranges
ppc64/kexec_file: avoid stomping memory used by special regions
powerpc/drmem: make lmb walk a bit more flexible
ppc64/kexec_file: restrict memory usage of kdump kernel
ppc64/kexec_file: setup backup region for kdump kernel
ppc64/kexec_file: prepare elfcore header for crashing kernel
ppc64/kexec_file: add appropriate regions for memory reserve map
ppc64/kexec_file: fix kexec load failure with lack of memory hole
ppc64/kexec_file: enable early kernel's OPAL calls
arch/powerpc/include/asm/crashdump-ppc64.h | 19
arch/powerpc/include/asm/drmem.h | 9
arch/powerpc/include/asm/kexec.h | 29 +
arch/powerpc/include/asm/kexec_ranges.h | 25 +
arch/powerpc/kernel/prom.c | 13
arch/powerpc/kexec/Makefile | 2
arch/powerpc/kexec/elf_64.c | 36 +
arch/powerpc/kexec/file_load.c | 60 +-
arch/powerpc/kexec/file_load_64.c | 1119 ++++++++++++++++++++++++++++
arch/powerpc/kexec/ranges.c | 412 ++++++++++
arch/powerpc/mm/drmem.c | 87 +-
arch/powerpc/mm/numa.c | 13
arch/powerpc/purgatory/Makefile | 4
arch/powerpc/purgatory/trampoline.S | 117 ---
arch/powerpc/purgatory/trampoline_64.S | 163 ++++
include/linux/kexec.h | 29 -
kernel/kexec_file.c | 16
17 files changed, 1958 insertions(+), 195 deletions(-)
create mode 100644 arch/powerpc/include/asm/crashdump-ppc64.h
create mode 100644 arch/powerpc/include/asm/kexec_ranges.h
create mode 100644 arch/powerpc/kexec/file_load_64.c
create mode 100644 arch/powerpc/kexec/ranges.c
delete mode 100644 arch/powerpc/purgatory/trampoline.S
create mode 100644 arch/powerpc/purgatory/trampoline_64.S
^ permalink raw reply
* Re: [PATCH] powerpc: Fix MMCRA_BHRB_DISABLE define to work with binutils version < 2.28
From: Madhavan Srinivasan @ 2020-07-29 10:13 UTC (permalink / raw)
To: Athira Rajeev, mpe; +Cc: maddy, linuxppc-dev
In-Reply-To: <1595996214-5833-1-git-send-email-atrajeev@linux.vnet.ibm.com>
On 7/29/20 9:46 AM, Athira Rajeev wrote:
> commit 9908c826d5ed ("powerpc/perf: Add Power10 PMU feature to
> DT CPU features") defines MMCRA_BHRB_DISABLE as `0x2000000000UL`.
> Binutils version less than 2.28 doesn't support UL suffix.
>
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S: Assembler messages:
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
> linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: operand out of range (0x0000002000000000 is not between 0xffffffffffff8000 and 0x000000000000ffff)
>
> Fix this by wrapping it around `_UL` macro.
Looks fine to me.
Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
>
> Fixes: 9908c826d5ed ("Add Power10 PMU feature to DT CPU features")
> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
> arch/powerpc/include/asm/reg.h | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index ae71027..41419f1 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -12,6 +12,7 @@
> #ifdef __KERNEL__
>
> #include <linux/stringify.h>
> +#include <linux/const.h>
> #include <asm/cputable.h>
> #include <asm/asm-const.h>
> #include <asm/feature-fixups.h>
> @@ -888,7 +889,7 @@
> #define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */
> #define MMCRA_SLOT_SHIFT 24
> #define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
> -#define MMCRA_BHRB_DISABLE 0x2000000000UL // BHRB disable bit for ISA v3.1
> +#define MMCRA_BHRB_DISABLE _UL(0x2000000000) // BHRB disable bit for ISA v3.1
> #define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */
> #define POWER6_MMCRA_SIHV 0x0000040000000000ULL
> #define POWER6_MMCRA_SIPR 0x0000020000000000ULL
^ permalink raw reply
* Re: [PATCH 04/15] arm64: numa: simplify dummy_numa_init()
From: Jonathan Cameron @ 2020-07-29 8:30 UTC (permalink / raw)
To: Mike Rapoport
Cc: linux-sh, Peter Zijlstra, Catalin Marinas, Dave Hansen,
linux-kernel, Max Filippov, Paul Mackerras, sparclinux,
linux-riscv, Will Deacon, Thomas Gleixner, linux-s390,
linux-c6x-dev, Yoshinori Sato, x86, Russell King, Mike Rapoport,
clang-built-linux, Ingo Molnar, Christoph Hellwig,
uclinux-h8-devel, linux-xtensa, openrisc, Borislav Petkov,
Andy Lutomirski, Paul Walmsley, Stafford Horne, linux-arm-kernel,
Michal Simek, linux-mm, linux-mips, iommu, Palmer Dabbelt,
Andrew Morton, linuxppc-dev
In-Reply-To: <20200728051153.1590-5-rppt@kernel.org>
On Tue, 28 Jul 2020 08:11:42 +0300
Mike Rapoport <rppt@kernel.org> wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> dummy_numa_init() loops over memblock.memory and passes nid=0 to
> numa_add_memblk() which essentially wraps memblock_set_node(). However,
> memblock_set_node() can cope with entire memory span itself, so the loop
> over memblock.memory regions is redundant.
>
> Replace the loop with a single call to memblock_set_node() to the entire
> memory.
Hi Mike,
I had a similar patch I was going to post shortly so can add a bit more
on the advantages of this one.
Beyond cleaning up, it also fixes an issue with a buggy ACPI firmware in which the SRAT
table covers some but not all of the memory in the EFI memory map. Stealing bits
from the draft cover letter I had for that...
> This issue can be easily triggered by having an SRAT table which fails
> to cover all elements of the EFI memory map.
>
> This firmware error is detected and a warning printed. e.g.
> "NUMA: Warning: invalid memblk node 64 [mem 0x240000000-0x27fffffff]"
> At that point we fall back to dummy_numa_init().
>
> However, the failed ACPI init has left us with our memblocks all broken
> up as we split them when trying to assign them to NUMA nodes.
>
> We then iterate over the memblocks and add them to node 0.
>
> for_each_memblock(memory, mblk) {
> ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
> if (!ret)
> continue;
> pr_err("NUMA init failed\n");
> return ret;
> }
>
> numa_add_memblk() calls memblock_set_node() which merges regions that
> were previously split up during the earlier attempt to add them to different
> nodes during parsing of SRAT.
>
> This means elements are moved in the memblock array and we can end up
> in a different memblock after the call to numa_add_memblk().
> Result is:
>
> Unable to handle kernel paging request at virtual address 0000000000003a40
> Mem abort info:
> ESR = 0x96000004
> EC = 0x25: DABT (current EL), IL = 32 bits
> SET = 0, FnV = 0
> EA = 0, S1PTW = 0
> Data abort info:
> ISV = 0, ISS = 0x00000004
> CM = 0, WnR = 0
> [0000000000003a40] user address but active_mm is swapper
> Internal error: Oops: 96000004 [#1] PREEMPT SMP
>
> ...
>
> Call trace:
> sparse_init_nid+0x5c/0x2b0
> sparse_init+0x138/0x170
> bootmem_init+0x80/0xe0
> setup_arch+0x2a0/0x5fc
> start_kernel+0x8c/0x648
>
> As an illustrative example:
> EFI table has one block of memory.
> memblks[0] = [0...0x2f] so we start with a single memblock.
>
> SRAT has
> [0x00...0x0f] in node 0
> [0x10...0x1f] in node 1
> but no entry covering
> [0x20...0x2f].
>
> Whilst parsing SRAT the single memblock is broken into 3.
> memblks[0] = [0x00...0x0f] in node 0
> memblks[1] = [0x10...0x1f] in node 1
> memblks[2] = [0x20...0x2f] in node MAX_NUM_NODES (invalid value)
>
> A sanity check parse then detects the invalid section and acpi_numa_init
> fails. We then fall back to the dummy path.
>
> That iterates over the memblocks. We'll use i as an index into the array of memblocks
>
> i = 0;
> memblks[0] = [0x00...0x0f] set to node0.
> merge doesn't do anything because the neighbouring memblock is still in node1.
>
> i = 1
> memblks[1] = [0x10...0x1f] set to node 0.
> merge combines memblock 0 and 1 to give a new set of memblocks.
>
> memblks[0] = [0x00..0x1f] in node 0
> memblks[1] = [0x20..0x2f] in node MAX_NUM_NODES.
>
> i = 2 off the end of the now reduced array of memblocks, so exit the loop.
> (if we restart the loop here everything will be fine).
>
> Later sparse_init_nid tries to use the node of the second memblock to index
> something and boom.
>
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> ---
> arch/arm64/mm/numa.c | 13 +++++--------
> 1 file changed, 5 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
> index aafcee3e3f7e..0cbdbcc885fb 100644
> --- a/arch/arm64/mm/numa.c
> +++ b/arch/arm64/mm/numa.c
> @@ -423,19 +423,16 @@ static int __init numa_init(int (*init_func)(void))
> */
> static int __init dummy_numa_init(void)
> {
> + phys_addr_t start = memblock_start_of_DRAM();
> + phys_addr_t end = memblock_end_of_DRAM();
> int ret;
> - struct memblock_region *mblk;
>
> if (numa_off)
> pr_info("NUMA disabled\n"); /* Forced off on command line. */
> - pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n",
> - memblock_start_of_DRAM(), memblock_end_of_DRAM() - 1);
> -
> - for_each_memblock(memory, mblk) {
> - ret = numa_add_memblk(0, mblk->base, mblk->base + mblk->size);
> - if (!ret)
> - continue;
> + pr_info("Faking a node at [mem %#018Lx-%#018Lx]\n", start, end - 1);
>
> + ret = numa_add_memblk(0, start, end);
> + if (ret) {
> pr_err("NUMA init failed\n");
> return ret;
> }
^ permalink raw reply
* [PATCH v2 3/3] cpuidle-pseries : Fixup exit latency for CEDE(0)
From: Gautham R. Shenoy @ 2020-07-29 6:47 UTC (permalink / raw)
To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
Michael Neuling, Vaidyanathan Srinivasan
Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
In-Reply-To: <1596005254-25753-1-git-send-email-ego@linux.vnet.ibm.com>
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
We are currently assuming that CEDE(0) has exit latency 10us, since
there is no way for us to query from the platform. However, if the
wakeup latency of an Extended CEDE state is smaller than 10us, then we
can be sure that the exit latency of CEDE(0) cannot be more than that.
In this patch, we fix the exit latency of CEDE(0) if we discover an
Extended CEDE state with wakeup latency smaller than 10us.
Benchmark results:
ebizzy:
2 ebizzy threads bound to the same big-core. 25% improvement in the
avg records/s with patch.
x without_patch
+ with_patch
N Min Max Median Avg Stddev
x 10 2491089 5834307 5398375 4244335 1596244.9
+ 10 2893813 5834474 5832448 5327281.3 1055941.4
context_switch2 :
There is no major regression observed with this patch as seen from the
context_switch2 benchmark.
context_switch2 across CPU0 CPU1 (Both belong to same big-core, but different
small cores). We observe a minor 0.14% regression in the number of
context-switches (higher is better).
x without_patch
+ with_patch
N Min Max Median Avg Stddev
x 500 348872 362236 354712 354745.69 2711.827
+ 500 349422 361452 353942 354215.4 2576.9258
Difference at 99.0% confidence
-530.288 +/- 430.963
-0.149484% +/- 0.121485%
(Student's t, pooled s = 2645.24)
context_switch2 across CPU0 CPU8 (Different big-cores). We observe a 0.37%
improvement in the number of context-switches (higher is better).
x without_patch
+ with_patch
N Min Max Median Avg Stddev
x 500 287956 294940 288896 288977.23 646.59295
+ 500 288300 294646 289582 290064.76 1161.9992
Difference at 99.0% confidence
1087.53 +/- 153.194
0.376337% +/- 0.0530125%
(Student's t, pooled s = 940.299)
schbench:
No major difference could be seen until the 99.9th percentile.
Without-patch
Latency percentiles (usec)
50.0th: 29
75.0th: 39
90.0th: 49
95.0th: 59
*99.0th: 13104
99.5th: 14672
99.9th: 15824
min=0, max=17993
With-patch:
Latency percentiles (usec)
50.0th: 29
75.0th: 40
90.0th: 50
95.0th: 61
*99.0th: 13648
99.5th: 14768
99.9th: 15664
min=0, max=29812
Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
drivers/cpuidle/cpuidle-pseries.c | 34 ++++++++++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index b1dc24d..0b2f115 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -334,12 +334,42 @@ static int pseries_cpuidle_driver_init(void)
static int add_pseries_idle_states(void)
{
int nr_states = 2; /* By default we have snooze, CEDE */
+ int i;
+ u64 min_latency_us = dedicated_states[1].exit_latency; /* CEDE latency */
if (parse_cede_parameters())
return nr_states;
- pr_info("cpuidle : Skipping the %d Extended CEDE idle states\n",
- nr_xcede_records);
+ for (i = 0; i < nr_xcede_records; i++) {
+ u64 latency_tb = xcede_records[i].wakeup_latency_tb_ticks;
+ u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC;
+
+ if (latency_us < min_latency_us)
+ min_latency_us = latency_us;
+ }
+
+ /*
+ * We are currently assuming that CEDE(0) has exit latency
+ * 10us, since there is no way for us to query from the
+ * platform.
+ *
+ * However, if the wakeup latency of an Extended CEDE state is
+ * smaller than 10us, then we can be sure that CEDE(0)
+ * requires no more than that.
+ *
+ * Perform the fix-up.
+ */
+ if (min_latency_us < dedicated_states[1].exit_latency) {
+ u64 cede0_latency = min_latency_us - 1;
+
+ if (cede0_latency <= 0)
+ cede0_latency = min_latency_us;
+
+ dedicated_states[1].exit_latency = cede0_latency;
+ dedicated_states[1].target_residency = 10 * (cede0_latency);
+ pr_info("cpuidle : Fixed up CEDE exit latency to %llu us\n",
+ cede0_latency);
+ }
return nr_states;
}
--
1.9.4
^ permalink raw reply related
* [PATCH v2 0/3] cpuidle-pseries: Parse extended CEDE information for idle.
From: Gautham R. Shenoy @ 2020-07-29 6:47 UTC (permalink / raw)
To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
Michael Neuling, Vaidyanathan Srinivasan
Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Hi,
This is a v2 of the patch series to parse the extended CEDE
information in the pseries-cpuidle driver.
The v1 of this patchset can be found here :
https://lore.kernel.org/linuxppc-dev/1594120299-31389-1-git-send-email-ego@linux.vnet.ibm.com/
The change from v1 --> v2 :
* Dropped Patches 4 and 5 which would expose extended idle-states,
that wake up on external interrupts, to cpuidle framework. These
were RFC patches in v1. Dropped them because currently the only
extended CEDE state that wakes up on external interrupts is CEDE(1)
which adds no significant value over CEDE(0).
* Rebased the patches onto powerpc/merge.
* No changes in code for Patches 1-3.
Motivation:
===========
On pseries Dedicated Linux LPARs, apart from the polling snooze idle
state, we currently have the CEDE idle state which cedes the CPU to
the hypervisor with latency-hint = 0.
However, the PowerVM hypervisor supports additional extended CEDE
states, which can be queried through the "ibm,get-system-parameter"
rtas-call with the CEDE_LATENCY_TOKEN. The hypervisor maps these
extended CEDE states to appropriate platform idle-states in order to
provide energy-savings as well as shifting power to the active
units. On existing pseries LPARs today we have extended CEDE with
latency-hints {1,2} supported.
The patches in this patchset add code to parse the CEDE latency
records provided by the hypervisor. We use this information to
determine the wakeup latency of the regular CEDE (which we have been
so far hardcoding to 10us while experimentally it is much less, ~
1us), by looking at the wakeup latency provided by the hypervisor for
Extended CEDE states. Since the platform currently advertises Extended
CEDE 1 to have wakeup latency of 2us, we can be sure that the wakeup
latency of the regular CEDE is no more than this.
With Patches 1-3, we see an improvement in the single-threaded
performance on ebizzy.
2 ebizzy threads bound to the same big-core. 25% improvement in the
avg records/s (higher the better) with patches 1-3.
x without_patches
* with_patches
N Min Max Median Avg Stddev
x 10 2491089 5834307 5398375 4244335 1596244.9
* 10 2893813 5834474 5832448 5327281.3 1055941.4
We do not observe any major regression in either the context_switch2
benchmark or the schbench benchmark
context_switch2 across CPU0 CPU1 (Both belong to same big-core, but different
small cores). We observe a minor 0.14% regression in the number of
context-switches (higher is better).
x without_patch
* with_patch
N Min Max Median Avg Stddev
x 500 348872 362236 354712 354745.69 2711.827
* 500 349422 361452 353942 354215.4 2576.9258
context_switch2 across CPU0 CPU8 (Different big-cores). We observe a 0.37%
improvement in the number of context-switches (higher is better).
x without_patch
* with_patch
N Min Max Median Avg Stddev
x 500 287956 294940 288896 288977.23 646.59295
* 500 288300 294646 289582 290064.76 1161.9992
schbench:
No major difference could be seen until the 99.9th percentile.
Without-patch
Latency percentiles (usec)
50.0th: 29
75.0th: 39
90.0th: 49
95.0th: 59
*99.0th: 13104
99.5th: 14672
99.9th: 15824
min=0, max=17993
With-patch:
Latency percentiles (usec)
50.0th: 29
75.0th: 40
90.0th: 50
95.0th: 61
*99.0th: 13648
99.5th: 14768
99.9th: 15664
min=0, max=29812
Gautham R. Shenoy (3):
cpuidle-pseries: Set the latency-hint before entering CEDE
cpuidle-pseries: Add function to parse extended CEDE records
cpuidle-pseries : Fixup exit latency for CEDE(0)
drivers/cpuidle/cpuidle-pseries.c | 167 +++++++++++++++++++++++++++++++++++++-
1 file changed, 165 insertions(+), 2 deletions(-)
--
1.9.4
^ permalink raw reply
* [PATCH v2 1/3] cpuidle-pseries: Set the latency-hint before entering CEDE
From: Gautham R. Shenoy @ 2020-07-29 6:47 UTC (permalink / raw)
To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
Michael Neuling, Vaidyanathan Srinivasan
Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
In-Reply-To: <1596005254-25753-1-git-send-email-ego@linux.vnet.ibm.com>
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
As per the PAPR, each H_CEDE call is associated with a latency-hint to
be passed in the VPA field "cede_latency_hint". The CEDE state that
we were implicitly entering so far is CEDE with latency-hint = 0.
This patch explicitly sets the latency hint corresponding to the CEDE
state that we are currently entering. While at it, we save the
previous hint, to be restored once we wakeup from CEDE. This will be
required in the future when we expose extended-cede states through the
cpuidle framework, where each of them will have a different
cede-latency hint.
Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
drivers/cpuidle/cpuidle-pseries.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 3e058ad2..88e71c3 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -86,19 +86,27 @@ static void check_and_cede_processor(void)
}
}
+#define NR_CEDE_STATES 1 /* CEDE with latency-hint 0 */
+#define NR_DEDICATED_STATES (NR_CEDE_STATES + 1) /* Includes snooze */
+
+u8 cede_latency_hint[NR_DEDICATED_STATES];
static int dedicated_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
+ u8 old_latency_hint;
pseries_idle_prolog();
get_lppaca()->donate_dedicated_cpu = 1;
+ old_latency_hint = get_lppaca()->cede_latency_hint;
+ get_lppaca()->cede_latency_hint = cede_latency_hint[index];
HMT_medium();
check_and_cede_processor();
local_irq_disable();
get_lppaca()->donate_dedicated_cpu = 0;
+ get_lppaca()->cede_latency_hint = old_latency_hint;
pseries_idle_epilog();
@@ -130,7 +138,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
/*
* States for dedicated partition case.
*/
-static struct cpuidle_state dedicated_states[] = {
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
--
1.9.4
^ permalink raw reply related
* [PATCH v2 2/3] cpuidle-pseries: Add function to parse extended CEDE records
From: Gautham R. Shenoy @ 2020-07-29 6:47 UTC (permalink / raw)
To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
Michael Neuling, Vaidyanathan Srinivasan
Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
In-Reply-To: <1596005254-25753-1-git-send-email-ego@linux.vnet.ibm.com>
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Currently we use CEDE with latency-hint 0 as the only other idle state
on a dedicated LPAR apart from the polling "snooze" state.
The platform might support additional extended CEDE idle states, which
can be discovered through the "ibm,get-system-parameter" rtas-call
made with CEDE_LATENCY_TOKEN.
This patch adds a function to obtain information about the extended
CEDE idle states from the platform and parse the contents to populate
an array of extended CEDE states. These idle states thus discovered
will be added to the cpuidle framework in the next patch.
dmesg on a POWER9 LPAR, demonstrating the output of parsing the
extended CEDE latency parameters.
[ 5.913180] xcede : xcede_record_size = 10
[ 5.913183] xcede : Record 0 : hint = 1, latency =0x400 tb-ticks, Wake-on-irq = 1
[ 5.913188] xcede : Record 1 : hint = 2, latency =0x3e8000 tb-ticks, Wake-on-irq = 0
[ 5.913193] cpuidle : Skipping the 2 Extended CEDE idle states
Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
drivers/cpuidle/cpuidle-pseries.c | 129 +++++++++++++++++++++++++++++++++++++-
1 file changed, 127 insertions(+), 2 deletions(-)
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 88e71c3..b1dc24d 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -21,6 +21,7 @@
#include <asm/runlatch.h>
#include <asm/idle.h>
#include <asm/plpar_wrappers.h>
+#include <asm/rtas.h>
static struct cpuidle_driver pseries_idle_driver = {
.name = "pseries_idle",
@@ -86,9 +87,120 @@ static void check_and_cede_processor(void)
}
}
-#define NR_CEDE_STATES 1 /* CEDE with latency-hint 0 */
+struct xcede_latency_records {
+ u8 latency_hint;
+ u64 wakeup_latency_tb_ticks;
+ u8 responsive_to_irqs;
+};
+
+/*
+ * XCEDE : Extended CEDE states discovered through the
+ * "ibm,get-system-parameter" rtas-call with the token
+ * CEDE_LATENCY_TOKEN
+ */
+#define MAX_XCEDE_STATES 4
+#define XCEDE_LATENCY_RECORD_SIZE 10
+#define XCEDE_LATENCY_PARAM_MAX_LENGTH (2 + 2 + \
+ (MAX_XCEDE_STATES * XCEDE_LATENCY_RECORD_SIZE))
+
+#define CEDE_LATENCY_TOKEN 45
+
+#define NR_CEDE_STATES (MAX_XCEDE_STATES + 1) /* CEDE with latency-hint 0 */
#define NR_DEDICATED_STATES (NR_CEDE_STATES + 1) /* Includes snooze */
+struct xcede_latency_records xcede_records[MAX_XCEDE_STATES];
+unsigned int nr_xcede_records;
+char xcede_parameters[XCEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+ int ret = -1, i;
+ u16 payload_length;
+ u8 xcede_record_size;
+ u32 total_xcede_records_size;
+ char *payload;
+
+ memset(xcede_parameters, 0, XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+ ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+ NULL, CEDE_LATENCY_TOKEN, __pa(xcede_parameters),
+ XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+ if (ret) {
+ pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
+ return ret;
+ }
+
+ payload_length = be16_to_cpu(*(__be16 *)(&xcede_parameters[0]));
+ payload = &xcede_parameters[2];
+
+ /*
+ * If the platform supports the cede latency settings
+ * information system parameter it must provide the following
+ * information in the NULL terminated parameter string:
+ *
+ * a. The first byte is the length “N” of each cede
+ * latency setting record minus one (zero indicates a length
+ * of 1 byte).
+ *
+ * b. For each supported cede latency setting a cede latency
+ * setting record consisting of the first “N” bytes as per
+ * the following table.
+ *
+ * -----------------------------
+ * | Field | Field |
+ * | Name | Length |
+ * -----------------------------
+ * | Cede Latency | 1 Byte |
+ * | Specifier Value | |
+ * -----------------------------
+ * | Maximum wakeup | |
+ * | latency in | 8 Bytes|
+ * | tb-ticks | |
+ * -----------------------------
+ * | Responsive to | |
+ * | external | 1 Byte |
+ * | interrupts | |
+ * -----------------------------
+ *
+ * This version has cede latency record size = 10.
+ */
+ xcede_record_size = (u8)payload[0] + 1;
+
+ if (xcede_record_size != XCEDE_LATENCY_RECORD_SIZE) {
+ pr_err("xcede : Expected record-size %d. Observed size %d.\n",
+ XCEDE_LATENCY_RECORD_SIZE, xcede_record_size);
+ return -EINVAL;
+ }
+
+ pr_info("xcede : xcede_record_size = %d\n", xcede_record_size);
+
+ /*
+ * Since the payload_length includes the last NULL byte and
+ * the xcede_record_size, the remaining bytes correspond to
+ * array of all cede_latency settings.
+ */
+ total_xcede_records_size = payload_length - 2;
+ nr_xcede_records = total_xcede_records_size / xcede_record_size;
+
+ payload++;
+ for (i = 0; i < nr_xcede_records; i++) {
+ struct xcede_latency_records *record = &xcede_records[i];
+
+ record->latency_hint = (u8)payload[0];
+ record->wakeup_latency_tb_ticks =
+ be64_to_cpu(*(__be64 *)(&payload[1]));
+ record->responsive_to_irqs = (u8)payload[9];
+ payload += xcede_record_size;
+ pr_info("xcede : Record %d : hint = %u, latency =0x%llx tb-ticks, Wake-on-irq = %u\n",
+ i, record->latency_hint,
+ record->wakeup_latency_tb_ticks,
+ record->responsive_to_irqs);
+ }
+
+ return 0;
+}
+
u8 cede_latency_hint[NR_DEDICATED_STATES];
static int dedicated_cede_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
@@ -219,6 +331,19 @@ static int pseries_cpuidle_driver_init(void)
return 0;
}
+static int add_pseries_idle_states(void)
+{
+ int nr_states = 2; /* By default we have snooze, CEDE */
+
+ if (parse_cede_parameters())
+ return nr_states;
+
+ pr_info("cpuidle : Skipping the %d Extended CEDE idle states\n",
+ nr_xcede_records);
+
+ return nr_states;
+}
+
/*
* pseries_idle_probe()
* Choose state table for shared versus dedicated partition
@@ -241,7 +366,7 @@ static int pseries_idle_probe(void)
max_idle_state = ARRAY_SIZE(shared_states);
} else {
cpuidle_state_table = dedicated_states;
- max_idle_state = ARRAY_SIZE(dedicated_states);
+ max_idle_state = add_pseries_idle_states();
}
} else
return -ENODEV;
--
1.9.4
^ permalink raw reply related
* Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain
From: Srikar Dronamraju @ 2020-07-29 6:13 UTC (permalink / raw)
To: Valentin Schneider
Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
LKML, Nicholas Piggin, Oliver O'Halloran, Jordan Niethe,
linuxppc-dev, Ingo Molnar
In-Reply-To: <jhjr1sviswg.mognet@arm.com>
* Valentin Schneider <valentin.schneider@arm.com> [2020-07-28 16:03:11]:
Hi Valentin,
Thanks for looking into the patches.
> On 27/07/20 06:32, Srikar Dronamraju wrote:
> > Add percpu coregroup maps and masks to create coregroup domain.
> > If a coregroup doesn't exist, the coregroup domain will be degenerated
> > in favour of SMT/CACHE domain.
> >
>
> So there's at least one arm64 platform out there with the same "pairs of
> cores share L2" thing (Ampere eMAG), and that lives quite happily with the
> default scheduler topology (SMT/MC/DIE). Each pair of core gets its MC
> domain, and the whole system is covered by DIE.
>
> Now arguably it's not a perfect representation; DIE doesn't have
> SD_SHARE_PKG_RESOURCES so the highest level sd_llc can point to is MC. That
> will impact all callsites using cpus_share_cache(): in the eMAG case, only
> pairs of cores will be seen as sharing cache, even though *all* cores share
> the same L3.
>
Okay, it's good to know that we have a chip which is similar to P9 in
topology.
> I'm trying to paint a picture of what the P9 topology looks like (the one
> you showcase in your cover letter) to see if there are any similarities;
> from what I gather in [1], wikichips and your cover letter, with P9 you can
> have something like this in a single DIE (somewhat unsure about L3 setup;
> it looks to be distributed?)
>
> +---------------------------------------------------------------------+
> | L3 |
> +---------------+-+---------------+-+---------------+-+---------------+
> | L2 | | L2 | | L2 | | L2 |
> +------+-+------+ +------+-+------+ +------+-+------+ +------+-+------+
> | L1 | | L1 | | L1 | | L1 | | L1 | | L1 | | L1 | | L1 |
> +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+
> |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs|
> +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+
>
> Which would lead to (ignoring the whole SMT CPU numbering shenanigans)
>
> NUMA [ ...
> DIE [ ]
> MC [ ] [ ] [ ] [ ]
> BIGCORE [ ] [ ] [ ] [ ]
> SMT [ ] [ ] [ ] [ ] [ ] [ ] [ ] [ ]
> 00-03 04-07 08-11 12-15 16-19 20-23 24-27 28-31 <other node here>
>
What you have summed up is exactly what a P9 topology looks like. I don't
think I could have explained it better than this.
> This however has MC == BIGCORE; what makes it you can have different spans
> for these two domains? If it's not too much to ask, I'd love to have a P9
> topology diagram.
>
> [1]: 20200722081822.GG9290@linux.vnet.ibm.com
At this time the current topology would be good enough, i.e. BIGCORE would
always be equal to MC. However, in future we could have chips that have a
smaller/larger number of CPUs in the LLC than in a BIGCORE, or we could have
granular or split L3 caches within a DIE. In such a case BIGCORE != MC.
Also in the current P9 itself, two neighbouring core-pairs form a quad.
Cache latency within a quad is better than a latency to a distant core-pair.
Cache latency within a core pair is way better than latency within a quad.
So if we have only 4 threads running on a DIE all of them accessing the same
cache-lines, then we could probably benefit if all the tasks were to run
within the quad aka MC/Coregroup.
I have found that some latency-sensitive benchmarks benefit from having a
grouping at quad level (using kernel hacks and not backed by firmware
changes). Gautham also found similar results in his experiments, but he
only used binding within the stock kernel.
I am not setting SD_SHARE_PKG_RESOURCES in the MC/Coregroup sd_flags, as the
MC domain need not be the LLC domain for Power.
--
Thanks and Regards
Srikar Dronamraju
^ permalink raw reply
* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Athira Rajeev @ 2020-07-29 4:18 UTC (permalink / raw)
To: Michael Ellerman; +Cc: Aneesh Kumar K . V, linuxppc-dev, Nicholas Piggin
In-Reply-To: <87h7ts79iv.fsf@mpe.ellerman.id.au>
> On 28-Jul-2020, at 6:14 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>> On 27-Jul-2020, at 6:05 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>>
>>> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>>>> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>>
>>>>> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>>>> caller") removed the local_irq_disable from hash_preload, but it was
>>>>> required for more than just the page table walk: the hash pte busy bit is
>>>>> effectively a lock which may be taken in interrupt context, and the local
>>>>> update flag test must not be preempted before it's used.
>>>>>
>>>>> This solves apparent lockups with perf interrupting __hash_page_64K. If
>>>>> get_perf_callchain then also takes a hash fault on the same page while it
>>>>> is already locked, it will loop forever taking hash faults, which looks like
>>>>> this:
>>>>>
>>>>> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
>>>>> pc: c000000000072dc8: hash_page_mm+0x8/0x800
>>>>> lr: c00000000000c5a4: do_hash_page+0x24/0x38
>>>>> sp: c0002ac1cc69ac70
>>>>> msr: 8000000000081033
>>>>> current = 0xc0002ac1cc602e00
>>>>> paca = 0xc00000001de1f280 irqmask: 0x03 irq_happened: 0x01
>>>>> pid = 20118, comm = pread2_processe
>>>>> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
>>>>> 49e:mon> t
>>>>> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
>>>>> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
>>>>> [link register ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
>>>>> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
>>>>> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
>>>>> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
>>>>> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
>>>>> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
>>>>> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
>>>>> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
>>>>> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
>>>>> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
>>>>> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
>>>>> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
>>>>> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
>>>>> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
>>>>> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
>>>>> [link register ] c0000000000846f0 __hash_page_64K+0x210/0x540
>>>>> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
>>>>> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
>>>>> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
>>>>> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
>>>>> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
>>>>> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
>>>>> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
>>>>> --- Exception: 300 (Data Access) at 00007fff8c861fe8
>>>>> SP (7ffff6b19660) is in userspace
>>>>>
>>>>> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>>>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>>>>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>>>> caller")
>>>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>>>
>>>>
>>>> Hi,
>>>>
>>>> Tested with the patch and it fixes the lockups I was seeing with my test run.
>>>> Thanks for the fix.
>>>>
>>>> Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>>
>>> Thanks for testing.
>>>
>>> What test are you running?
>>
>> Hi Michael
>>
>> I was running “perf record” and Unixbench tests ( https://github.com/kdlucas/byte-unixbench ) in parallel where we were getting soft lockups
>>
>> 1. Perf command run:
>> # perf record -a -g -c 10000000 -o <data_file> sleep 60
>>
>> 2. Unixbench tests
>> # Run -q -c <nr_threads> spawn
>
> Thanks, I can reproduce it with that.
Sure Michael
>
> cheers
^ permalink raw reply
* [PATCH] powerpc: Fix MMCRA_BHRB_DISABLE define to work with binutils version < 2.28
From: Athira Rajeev @ 2020-07-29 4:16 UTC (permalink / raw)
To: mpe; +Cc: maddy, linuxppc-dev
commit 9908c826d5ed ("powerpc/perf: Add Power10 PMU feature to
DT CPU features") defines MMCRA_BHRB_DISABLE as `0x2000000000UL`.
Binutils versions older than 2.28 don't support the UL suffix.
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S: Assembler messages:
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: operand out of range (0x0000002000000000 is not between 0xffffffffffff8000 and 0x000000000000ffff)
Fix this by wrapping the value in the `_UL` macro.
Fixes: 9908c826d5ed ("powerpc/perf: Add Power10 PMU feature to DT CPU features")
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
---
arch/powerpc/include/asm/reg.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ae71027..41419f1 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -12,6 +12,7 @@
#ifdef __KERNEL__
#include <linux/stringify.h>
+#include <linux/const.h>
#include <asm/cputable.h>
#include <asm/asm-const.h>
#include <asm/feature-fixups.h>
@@ -888,7 +889,7 @@
#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */
#define MMCRA_SLOT_SHIFT 24
#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
-#define MMCRA_BHRB_DISABLE 0x2000000000UL // BHRB disable bit for ISA v3.1
+#define MMCRA_BHRB_DISABLE _UL(0x2000000000) // BHRB disable bit for ISA v3.1
#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */
#define POWER6_MMCRA_SIHV 0x0000040000000000ULL
#define POWER6_MMCRA_SIPR 0x0000020000000000ULL
--
1.8.3.1
^ permalink raw reply related
* [PATCH] powerpc/configs: Add BLK_DEV_NVME to pseries_defconfig
From: Anton Blanchard @ 2020-07-29 4:08 UTC (permalink / raw)
To: benh, paulus, mpe; +Cc: linuxppc-dev
I've forgotten to manually enable NVME when building pseries kernels
for machines with NVME adapters. Since it's a reasonably common
configuration, enable it by default.
Signed-off-by: Anton Blanchard <anton@ozlabs.org>
---
arch/powerpc/configs/pseries_defconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index dfa4a726333b..358642d6f46d 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -94,6 +94,7 @@ CONFIG_BLK_DEV_NBD=m
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=65536
CONFIG_VIRTIO_BLK=m
+CONFIG_BLK_DEV_NVME=y
CONFIG_BLK_DEV_SD=y
CONFIG_CHR_DEV_ST=m
CONFIG_BLK_DEV_SR=y
--
2.26.2
^ permalink raw reply related
* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Michael Ellerman @ 2020-07-29 1:15 UTC (permalink / raw)
To: Hari Bathini, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <23baef6a-6ddc-572a-82c5-21a7fa441485@linux.ibm.com>
Hari Bathini <hbathini@linux.ibm.com> writes:
> On 28/07/20 7:16 pm, Michael Ellerman wrote:
>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
>>> to be filled with OPAL base & entry addresses respectively. Setting
>>> these registers allows the kernel to perform OPAL calls before the
>>> device tree is parsed.
>>
>> I'm not convinced we want to do this.
>>
>> If we do it becomes part of the kexec ABI and we have to honour it into
>> the future.
>>
>> And in practice there are no non-development kernels built with OPAL early
>> debugging enabled, so it's not clear it actually helps anyone other than
>> developers.
>>
>
> Hmmm.. kexec-tools does it since commit d58ad564852c ("kexec/ppc64
> Enable early kernel's OPAL calls") for kexec_load syscall. So, we would
> be breaking kexec ABI either way, I guess.
Ugh, OK.
> Let me put this patch at the end of the series in the respin to let you
> decide whether to have it or not..
Thanks.
cheers
^ permalink raw reply
* Re: [PATCH 13/15] arch, drivers: replace for_each_membock() with for_each_mem_range()
From: Emil Renner Berthing @ 2020-07-28 15:02 UTC (permalink / raw)
To: Mike Rapoport
Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-kernel, Max Filippov,
Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
Thomas Gleixner, Marek Szyprowski, linux-s390, linux-c6x-dev,
Yoshinori Sato, x86, Russell King, Mike Rapoport,
clang-built-linux, Ingo Molnar, Christoph Hellwig,
Catalin Marinas, uclinux-h8-devel, linux-xtensa, openrisc,
Borislav Petkov, Andy Lutomirski, Paul Walmsley, Stafford Horne,
linux-arm-kernel, Michal Simek, linux-mm, linux-mips, iommu,
Palmer Dabbelt, Andrew Morton, linuxppc-dev
In-Reply-To: <20200728051153.1590-14-rppt@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 44043 bytes --]
On Tue, 28 Jul 2020, 07:16 Mike Rapoport, <rppt@kernel.org> wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> There are several occurrences of the following pattern:
>
> for_each_memblock(memory, reg) {
> start = __pfn_to_phys(memblock_region_memory_base_pfn(reg);
> end = __pfn_to_phys(memblock_region_memory_end_pfn(reg));
>
> /* do something with start and end */
> }
>
> Using for_each_mem_range() iterator is more appropriate in such cases and
> allows simpler and cleaner code.
>
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
> arch/arm/kernel/setup.c | 18 +++++++----
> arch/arm/mm/mmu.c | 39 ++++++++----------------
> arch/arm/mm/pmsa-v7.c | 20 ++++++------
> arch/arm/mm/pmsa-v8.c | 17 +++++------
> arch/arm/xen/mm.c | 7 +++--
> arch/arm64/mm/kasan_init.c | 8 ++---
> arch/arm64/mm/mmu.c | 11 ++-----
> arch/c6x/kernel/setup.c | 9 +++---
> arch/microblaze/mm/init.c | 9 +++---
> arch/mips/cavium-octeon/dma-octeon.c | 12 ++++----
> arch/mips/kernel/setup.c | 31 +++++++++----------
> arch/openrisc/mm/init.c | 8 +++--
> arch/powerpc/kernel/fadump.c | 27 +++++++---------
> arch/powerpc/mm/book3s64/hash_utils.c | 16 +++++-----
> arch/powerpc/mm/book3s64/radix_pgtable.c | 11 +++----
> arch/powerpc/mm/kasan/kasan_init_32.c | 8 ++---
> arch/powerpc/mm/mem.c | 16 ++++++----
> arch/powerpc/mm/pgtable_32.c | 8 ++---
> arch/riscv/mm/init.c | 24 ++++++---------
> arch/riscv/mm/kasan_init.c | 10 +++---
> arch/s390/kernel/setup.c | 27 ++++++++++------
> arch/s390/mm/vmem.c | 16 +++++-----
> arch/sparc/mm/init_64.c | 12 +++-----
> drivers/bus/mvebu-mbus.c | 12 ++++----
> drivers/s390/char/zcore.c | 9 +++---
> 25 files changed, 187 insertions(+), 198 deletions(-)
>
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index d8e18cdd96d3..3f65d0ac9f63 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -843,19 +843,25 @@ early_param("mem", early_mem);
>
> static void __init request_standard_resources(const struct machine_desc
> *mdesc)
> {
> - struct memblock_region *region;
> + phys_addr_t start, end, res_end;
> struct resource *res;
> + u64 i;
>
> kernel_code.start = virt_to_phys(_text);
> kernel_code.end = virt_to_phys(__init_begin - 1);
> kernel_data.start = virt_to_phys(_sdata);
> kernel_data.end = virt_to_phys(_end - 1);
>
> - for_each_memblock(memory, region) {
> - phys_addr_t start =
> __pfn_to_phys(memblock_region_memory_base_pfn(region));
> - phys_addr_t end =
> __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
> + for_each_mem_range(i, &start, &end) {
> unsigned long boot_alias_start;
>
> + /*
> + * In memblock, end points to the first byte after the
> + * range while in resources, end points to the last byte in
> + * the range.
> + */
> + res_end = end - 1;
> +
> /*
> * Some systems have a special memory alias which is only
> * used for booting. We need to advertise this region to
> @@ -869,7 +875,7 @@ static void __init request_standard_resources(const
> struct machine_desc *mdesc)
> __func__, sizeof(*res));
> res->name = "System RAM (boot alias)";
> res->start = boot_alias_start;
> - res->end = phys_to_idmap(end);
> + res->end = phys_to_idmap(res_end);
> res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> request_resource(&iomem_resource, res);
> }
> @@ -880,7 +886,7 @@ static void __init request_standard_resources(const
> struct machine_desc *mdesc)
> sizeof(*res));
> res->name = "System RAM";
> res->start = start;
> - res->end = end;
> + res->end = res_end;
> res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>
> request_resource(&iomem_resource, res);
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index 628028bfbb92..a149d9cb4fdb 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1155,9 +1155,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
>
> void __init adjust_lowmem_bounds(void)
> {
> - phys_addr_t memblock_limit = 0;
> - u64 vmalloc_limit;
> - struct memblock_region *reg;
> + phys_addr_t block_start, block_end, memblock_limit = 0;
> + u64 vmalloc_limit, i;
> phys_addr_t lowmem_limit = 0;
>
> /*
> @@ -1173,26 +1172,18 @@ void __init adjust_lowmem_bounds(void)
> * The first usable region must be PMD aligned. Mark its start
> * as MEMBLOCK_NOMAP if it isn't
> */
> - for_each_memblock(memory, reg) {
> - if (!memblock_is_nomap(reg)) {
> - if (!IS_ALIGNED(reg->base, PMD_SIZE)) {
> - phys_addr_t len;
> + for_each_mem_range(i, &block_start, &block_end) {
> + if (!IS_ALIGNED(block_start, PMD_SIZE)) {
> + phys_addr_t len;
>
> - len = round_up(reg->base, PMD_SIZE) -
> reg->base;
> - memblock_mark_nomap(reg->base, len);
> - }
> - break;
> + len = round_up(block_start, PMD_SIZE) -
> block_start;
> + memblock_mark_nomap(block_start, len);
> }
> + break;
> }
>
> - for_each_memblock(memory, reg) {
> - phys_addr_t block_start = reg->base;
> - phys_addr_t block_end = reg->base + reg->size;
> -
> - if (memblock_is_nomap(reg))
> - continue;
> -
> - if (reg->base < vmalloc_limit) {
> + for_each_mem_range(i, &block_start, &block_end) {
> + if (block_start < vmalloc_limit) {
> if (block_end > lowmem_limit)
> /*
> * Compare as u64 to ensure vmalloc_limit
> does
> @@ -1441,19 +1432,15 @@ static void __init kmap_init(void)
>
> static void __init map_lowmem(void)
> {
> - struct memblock_region *reg;
> phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START),
> SECTION_SIZE);
> phys_addr_t kernel_x_end = round_up(__pa(__init_end),
> SECTION_SIZE);
> + phys_addr_t start, end;
> + u64 i;
>
> /* Map all the lowmem memory banks. */
> - for_each_memblock(memory, reg) {
> - phys_addr_t start = reg->base;
> - phys_addr_t end = start + reg->size;
> + for_each_mem_range(i, &start, &end) {
> struct map_desc map;
>
> - if (memblock_is_nomap(reg))
> - continue;
> -
> if (end > arm_lowmem_limit)
> end = arm_lowmem_limit;
> if (start >= end)
> diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c
> index 699fa2e88725..44b7644a4237 100644
> --- a/arch/arm/mm/pmsa-v7.c
> +++ b/arch/arm/mm/pmsa-v7.c
> @@ -231,10 +231,9 @@ static int __init allocate_region(phys_addr_t base,
> phys_addr_t size,
> void __init pmsav7_adjust_lowmem_bounds(void)
> {
> phys_addr_t specified_mem_size = 0, total_mem_size = 0;
> - struct memblock_region *reg;
> - bool first = true;
> phys_addr_t mem_start;
> phys_addr_t mem_end;
> + phys_addr_t reg_start, reg_end;
> unsigned int mem_max_regions;
> int num, i;
>
> @@ -262,20 +261,19 @@ void __init pmsav7_adjust_lowmem_bounds(void)
> mem_max_regions -= num;
> #endif
>
> - for_each_memblock(memory, reg) {
> - if (first) {
> - for_each_mem_range(i, &reg_start, &reg_end) {
> + if (i == 0) {
> phys_addr_t phys_offset = PHYS_OFFSET;
>
> /*
> * Initially only use memory continuous from
> * PHYS_OFFSET */
> - if (reg->base != phys_offset)
> + if (reg_start != phys_offset)
> panic("First memory bank must be
> contiguous from PHYS_OFFSET");
>
> - mem_start = reg->base;
> - mem_end = reg->base + reg->size;
> - specified_mem_size = reg->size;
> - first = false;
> + mem_start = reg_start;
> + mem_end = reg_end
> + specified_mem_size = mem_end - mem_start;
> } else {
> /*
> * memblock auto merges contiguous blocks, remove
> @@ -283,8 +281,8 @@ void __init pmsav7_adjust_lowmem_bounds(void)
> * blocks separately while iterating)
> */
> pr_notice("Ignoring RAM after %pa, memory at %pa
> ignored\n",
> - &mem_end, &reg->base);
> - memblock_remove(reg->base, 0 - reg->base);
> + &mem_end, &reg_start);
> + memblock_remove(reg_start, 0 - reg_start);
> break;
> }
> }
> diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c
> index 0d7d5fb59247..b39e74b48437 100644
> --- a/arch/arm/mm/pmsa-v8.c
> +++ b/arch/arm/mm/pmsa-v8.c
> @@ -94,20 +94,19 @@ static __init bool is_region_fixed(int number)
> void __init pmsav8_adjust_lowmem_bounds(void)
> {
> phys_addr_t mem_end;
> - struct memblock_region *reg;
> - bool first = true;
> + phys_addr_t reg_start, reg_end;
> + int i;
>
> - for_each_memblock(memory, reg) {
> - if (first) {
> + for_each_mem_range(i, &reg_start, &reg_end) {
> + if (i == 0) {
> phys_addr_t phys_offset = PHYS_OFFSET;
>
> /*
> * Initially only use memory continuous from
> * PHYS_OFFSET */
> - if (reg->base != phys_offset)
> + if (reg_start != phys_offset)
> panic("First memory bank must be
> contiguous from PHYS_OFFSET");
> - mem_end = reg->base + reg->size;
> - first = false;
> + mem_end = reg_end;
> } else {
> /*
> * memblock auto merges contiguous blocks, remove
> @@ -115,8 +114,8 @@ void __init pmsav8_adjust_lowmem_bounds(void)
> * blocks separately while iterating)
> */
> pr_notice("Ignoring RAM after %pa, memory at %pa
> ignored\n",
> - &mem_end, &reg->base);
> - memblock_remove(reg->base, 0 - reg->base);
> + &mem_end, &reg_start);
> + memblock_remove(reg_start, 0 - reg_start);
> break;
> }
> }
> diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
> index d40e9e5fc52b..05f24ff41e36 100644
> --- a/arch/arm/xen/mm.c
> +++ b/arch/arm/xen/mm.c
> @@ -24,11 +24,12 @@
>
> unsigned long xen_get_swiotlb_free_pages(unsigned int order)
> {
> - struct memblock_region *reg;
> + phys_addr_t base;
> gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
> + u64 i;
>
> - for_each_memblock(memory, reg) {
> - if (reg->base < (phys_addr_t)0xffffffff) {
> + for_each_mem_range(i, &base, NULL) {
> + if (base < (phys_addr_t)0xffffffff) {
> if (IS_ENABLED(CONFIG_ZONE_DMA32))
> flags |= __GFP_DMA32;
> else
> diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
> index 7291b26ce788..1faa086f9193 100644
> --- a/arch/arm64/mm/kasan_init.c
> +++ b/arch/arm64/mm/kasan_init.c
> @@ -212,7 +212,7 @@ void __init kasan_init(void)
> {
> u64 kimg_shadow_start, kimg_shadow_end;
> u64 mod_shadow_start, mod_shadow_end;
> - struct memblock_region *reg;
> + phys_addr_t _start, _end;
> int i;
>
> kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
> @@ -246,9 +246,9 @@ void __init kasan_init(void)
> kasan_populate_early_shadow((void *)mod_shadow_end,
> (void *)kimg_shadow_start);
>
> - for_each_memblock(memory, reg) {
> - void *start = (void *)__phys_to_virt(reg->base);
> - void *end = (void *)__phys_to_virt(reg->base + reg->size);
> + for_each_mem_range(i, &start, &end) {
> + void *_start = (void *)__phys_to_virt(_start);
> + void *end = (void *)__phys_to_virt(_end);
>
> if (start >= end)
> break;
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 1df25f26571d..327264fb83fb 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -461,8 +461,9 @@ static void __init map_mem(pgd_t *pgdp)
> {
> phys_addr_t kernel_start = __pa_symbol(_text);
> phys_addr_t kernel_end = __pa_symbol(__init_begin);
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> int flags = 0;
> + u64 i;
>
> if (rodata_full || debug_pagealloc_enabled())
> flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -481,15 +482,9 @@ static void __init map_mem(pgd_t *pgdp)
> #endif
>
> /* map all the memory banks */
> - for_each_memblock(memory, reg) {
> - phys_addr_t start = reg->base;
> - phys_addr_t end = start + reg->size;
> -
> + for_each_mem_range(i, &start, &end) {
> if (start >= end)
> break;
> - if (memblock_is_nomap(reg))
> - continue;
> -
> __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
> }
>
> diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c
> index 8ef35131f999..9254c3b794a5 100644
> --- a/arch/c6x/kernel/setup.c
> +++ b/arch/c6x/kernel/setup.c
> @@ -287,7 +287,8 @@ notrace void __init machine_init(unsigned long dt_ptr)
>
> void __init setup_arch(char **cmdline_p)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> + u64 i;
>
> printk(KERN_INFO "Initializing kernel\n");
>
> @@ -351,9 +352,9 @@ void __init setup_arch(char **cmdline_p)
> disable_caching(ram_start, ram_end - 1);
>
> /* Set caching of external RAM used by Linux */
> - for_each_memblock(memory, reg)
> - enable_caching(CACHE_REGION_START(reg->base),
> - CACHE_REGION_START(reg->base + reg->size -
> 1));
> + for_each_mem_range(i, &start, &end)
> + enable_caching(CACHE_REGION_START(start),
> + CACHE_REGION_START(end - 1));
>
> #ifdef CONFIG_BLK_DEV_INITRD
> /*
> diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
> index 49e0c241f9b1..15403b5adfcf 100644
> --- a/arch/microblaze/mm/init.c
> +++ b/arch/microblaze/mm/init.c
> @@ -106,13 +106,14 @@ static void __init paging_init(void)
> void __init setup_memory(void)
> {
> #ifndef CONFIG_MMU
> - struct memblock_region *reg;
> u32 kernel_align_start, kernel_align_size;
> + phys_addr_t start, end;
> + u64 i;
>
> /* Find main memory where is the kernel */
> - for_each_memblock(memory, reg) {
> - memory_start = (u32)reg->base;
> - lowmem_size = reg->size;
> + for_each_mem_range(i, &start, &end) {
> + memory_start = start;
> + lowmem_size = end - start;
> if ((memory_start <= (u32)_text) &&
> ((u32)_text <= (memory_start + lowmem_size - 1))) {
> memory_size = lowmem_size;
> diff --git a/arch/mips/cavium-octeon/dma-octeon.c
> b/arch/mips/cavium-octeon/dma-octeon.c
> index 14ea680d180e..d938c1f7c1e1 100644
> --- a/arch/mips/cavium-octeon/dma-octeon.c
> +++ b/arch/mips/cavium-octeon/dma-octeon.c
> @@ -190,25 +190,25 @@ char *octeon_swiotlb;
>
> void __init plat_swiotlb_setup(void)
> {
> - struct memblock_region *mem;
> + phys_addr_t start, end;
> phys_addr_t max_addr;
> phys_addr_t addr_size;
> size_t swiotlbsize;
> unsigned long swiotlb_nslabs;
> + u64 i;
>
> max_addr = 0;
> addr_size = 0;
>
> - for_each_memblock(memory, mem) {
> + for_each_mem_range(i, &start, &end) {
> /* These addresses map low for PCI. */
> if (mem->base > 0x410000000ull && !OCTEON_IS_OCTEON2())
> continue;
>
> - addr_size += mem->size;
> -
> - if (max_addr < mem->base + mem->size)
> - max_addr = mem->base + mem->size;
> + addr_size += (end - start);
>
> + if (max_addr < end)
> + max_addr = end;
> }
>
> swiotlbsize = PAGE_SIZE;
> diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
> index 7b537fa2035d..eaac1b66026d 100644
> --- a/arch/mips/kernel/setup.c
> +++ b/arch/mips/kernel/setup.c
> @@ -300,8 +300,9 @@ static void __init bootmem_init(void)
>
> static void __init bootmem_init(void)
> {
> - struct memblock_region *mem;
> phys_addr_t ramstart, ramend;
> + phys_addr_t start, end;
> + u64 i;
>
> ramstart = memblock_start_of_DRAM();
> ramend = memblock_end_of_DRAM();
> @@ -338,18 +339,13 @@ static void __init bootmem_init(void)
>
> min_low_pfn = ARCH_PFN_OFFSET;
> max_pfn = PFN_DOWN(ramend);
> - for_each_memblock(memory, mem) {
> - unsigned long start = memblock_region_memory_base_pfn(mem);
> - unsigned long end = memblock_region_memory_end_pfn(mem);
> -
> + for_each_mem_range(i, &start, &end) {
> /*
> * Skip highmem here so we get an accurate max_low_pfn if
> low
> * memory stops short of high memory.
> * If the region overlaps HIGHMEM_START, end is clipped so
> * max_pfn excludes the highmem portion.
> */
> - if (memblock_is_nomap(mem))
> - continue;
> if (start >= PFN_DOWN(HIGHMEM_START))
> continue;
> if (end > PFN_DOWN(HIGHMEM_START))
> @@ -458,13 +454,12 @@ early_param("memmap", early_parse_memmap);
> unsigned long setup_elfcorehdr, setup_elfcorehdr_size;
> static int __init early_parse_elfcorehdr(char *p)
> {
> - struct memblock_region *mem;
> + phys_addr_t start, end;
> + u64 i;
>
> setup_elfcorehdr = memparse(p, &p);
>
> - for_each_memblock(memory, mem) {
> - unsigned long start = mem->base;
> - unsigned long end = start + mem->size;
> + for_each_mem_range(i, &start, &end) {
> if (setup_elfcorehdr >= start && setup_elfcorehdr < end) {
> /*
> * Reserve from the elf core header to the end of
> @@ -728,7 +723,8 @@ static void __init arch_mem_init(char **cmdline_p)
>
> static void __init resource_init(void)
> {
> - struct memblock_region *region;
> + phys_addr_t start, end;
> + u64 i;
>
> if (UNCAC_BASE != IO_BASE)
> return;
> @@ -740,9 +736,7 @@ static void __init resource_init(void)
> bss_resource.start = __pa_symbol(&__bss_start);
> bss_resource.end = __pa_symbol(&__bss_stop) - 1;
>
> - for_each_memblock(memory, region) {
> - phys_addr_t start =
> PFN_PHYS(memblock_region_memory_base_pfn(region));
> - phys_addr_t end =
> PFN_PHYS(memblock_region_memory_end_pfn(region)) - 1;
> + for_each_mem_range(i, &start, &end) {
> struct resource *res;
>
> res = memblock_alloc(sizeof(struct resource),
> SMP_CACHE_BYTES);
> @@ -751,7 +745,12 @@ static void __init resource_init(void)
> sizeof(struct resource));
>
> res->start = start;
> - res->end = end;
> + /*
> + * In memblock, end points to the first byte after the
> + * range while in resources, end points to the last byte in
> + * the range.
> + */
> + res->end = end - 1;
> res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> res->name = "System RAM";
>
> diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
> index 3d7c79c7745d..8348feaaf46e 100644
> --- a/arch/openrisc/mm/init.c
> +++ b/arch/openrisc/mm/init.c
> @@ -64,6 +64,7 @@ extern const char _s_kernel_ro[], _e_kernel_ro[];
> */
> static void __init map_ram(void)
> {
> + phys_addr_t start, end;
> unsigned long v, p, e;
> pgprot_t prot;
> pgd_t *pge;
> @@ -71,6 +72,7 @@ static void __init map_ram(void)
> pud_t *pue;
> pmd_t *pme;
> pte_t *pte;
> + u64 i;
> /* These mark extents of read-only kernel pages...
> * ...from vmlinux.lds.S
> */
> @@ -78,9 +80,9 @@ static void __init map_ram(void)
>
> v = PAGE_OFFSET;
>
> - for_each_memblock(memory, region) {
> - p = (u32) region->base & PAGE_MASK;
> - e = p + (u32) region->size;
> + for_each_mem_range(i, &start, &end) {
> + p = (u32) start & PAGE_MASK;
> + e = (u32) end;
>
> v = (u32) __va(p);
> pge = pgd_offset_k(v);
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index fdbafe417139..435b98d069eb 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -180,13 +180,13 @@ int is_fadump_active(void)
> */
> static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
> {
> - struct memblock_region *reg;
> + phys_addr_t reg_start, reg_end;
> bool ret = false;
> - u64 start, end;
> + u64 i, start, end;
>
> - for_each_memblock(memory, reg) {
> - start = max_t(u64, d_start, reg->base);
> - end = min_t(u64, d_end, (reg->base + reg->size));
> + for_each_mem_range(i, &reg_start, &reg_end) {
> + start = max_t(u64, d_start, reg_start);
> + end = min_t(u64, d_end, reg_end));
> if (d_start < end) {
> /* Memory hole from d_start to start */
> if (start > d_start)
> @@ -413,7 +413,7 @@ static int __init fadump_get_boot_mem_regions(void)
> {
> unsigned long base, size, cur_size, hole_size, last_end;
> unsigned long mem_size = fw_dump.boot_memory_size;
> - struct memblock_region *reg;
> + phys_addr_t reg_start, reg_end;
> int ret = 1;
>
> fw_dump.boot_mem_regs_cnt = 0;
> @@ -421,9 +421,8 @@ static int __init fadump_get_boot_mem_regions(void)
> last_end = 0;
> hole_size = 0;
> cur_size = 0;
> - for_each_memblock(memory, reg) {
> - base = reg->base;
> - size = reg->size;
> + for_each_mem_range(i, &reg_start, &reg_end) {
> + size = reg_end - reg_start;
> hole_size += (base - last_end);
>
> if ((cur_size + size) >= mem_size) {
> @@ -959,9 +958,8 @@ static int fadump_init_elfcore_header(char *bufp)
> */
> static int fadump_setup_crash_memory_ranges(void)
> {
> - struct memblock_region *reg;
> - u64 start, end;
> - int i, ret;
> + u64 i, start, end;
> + int ret;
>
> pr_debug("Setup crash memory ranges.\n");
> crash_mrange_info.mem_range_cnt = 0;
> @@ -979,10 +977,7 @@ static int fadump_setup_crash_memory_ranges(void)
> return ret;
> }
>
> - for_each_memblock(memory, reg) {
> - start = (u64)reg->base;
> - end = start + (u64)reg->size;
> -
> + for_each_mem_range(i, &start, end) {
>
I don't know anything about this code, but from pure pattern matching it
looks like you missed a & here.
/*
> * skip the memory chunk that is already added
> * (0 through boot_memory_top).
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 468169e33c86..9ba76b075b11 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -7,7 +7,7 @@
> *
> * SMP scalability work:
> * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
> - *
> + *
> * Module name: htab.c
> *
> * Description:
> @@ -862,8 +862,8 @@ static void __init htab_initialize(void)
> unsigned long table;
> unsigned long pteg_count;
> unsigned long prot;
> - unsigned long base = 0, size = 0;
> - struct memblock_region *reg;
> + phys_addr_t base = 0, size = 0, end;
> + u64 i;
>
> DBG(" -> htab_initialize()\n");
>
> @@ -879,7 +879,7 @@ static void __init htab_initialize(void)
> /*
> * Calculate the required size of the htab. We want the number of
> * PTEGs to equal one half the number of real pages.
> - */
> + */
> htab_size_bytes = htab_get_table_size();
> pteg_count = htab_size_bytes >> 7;
>
> @@ -889,7 +889,7 @@ static void __init htab_initialize(void)
> firmware_has_feature(FW_FEATURE_PS3_LV1)) {
> /* Using a hypervisor which owns the htab */
> htab_address = NULL;
> - _SDR1 = 0;
> + _SDR1 = 0;
> #ifdef CONFIG_FA_DUMP
> /*
> * If firmware assisted dump is active firmware preserves
> @@ -955,9 +955,9 @@ static void __init htab_initialize(void)
> #endif /* CONFIG_DEBUG_PAGEALLOC */
>
> /* create bolted the linear mapping in the hash table */
> - for_each_memblock(memory, reg) {
> - base = (unsigned long)__va(reg->base);
> - size = reg->size;
> + for_each_mem_range(i, &base, &end) {
> + size = end - base;
> + base = (unsigned long)__va(base);
>
> DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
> base, size, prot);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index bb00e0cba119..65657b920847 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -318,28 +318,27 @@ static int __meminit
> create_physical_mapping(unsigned long start,
> static void __init radix_init_pgtable(void)
> {
> unsigned long rts_field;
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> + u64 i;
>
> /* We don't support slb for radix */
> mmu_slb_size = 0;
> /*
> * Create the linear mapping, using standard page size for now
> */
> - for_each_memblock(memory, reg) {
> + for_each_mem_range(i, &start, &end) {
> /*
> * The memblock allocator is up at this point, so the
> * page tables will be allocated within the range. No
> * need or a node (which we don't have yet).
> */
>
> - if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
> + if (end >= RADIX_VMALLOC_START) {
> pr_warn("Outside the supported range\n");
> continue;
> }
>
> - WARN_ON(create_physical_mapping(reg->base,
> - reg->base + reg->size,
> - -1, PAGE_KERNEL));
> + WARN_ON(create_physical_mapping(start, end, -1,
> PAGE_KERNEL));
> }
>
> /* Find out how many PID bits are supported */
> diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c
> b/arch/powerpc/mm/kasan/kasan_init_32.c
> index 0760e1e754e4..6e73434e4e41 100644
> --- a/arch/powerpc/mm/kasan/kasan_init_32.c
> +++ b/arch/powerpc/mm/kasan/kasan_init_32.c
> @@ -120,11 +120,11 @@ static void __init
> kasan_unmap_early_shadow_vmalloc(void)
> static void __init kasan_mmu_init(void)
> {
> int ret;
> - struct memblock_region *reg;
> + phys_addr_t base, end;
> + u64 i;
>
> - for_each_memblock(memory, reg) {
> - phys_addr_t base = reg->base;
> - phys_addr_t top = min(base + reg->size, total_lowmem);
> + for_each_mem_range(i, &base, &end) {
> + phys_addr_t top = min(end, total_lowmem);
>
> if (base >= top)
> continue;
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 38d1acd7c8ef..0248b6d58fcd 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -593,20 +593,24 @@ void flush_icache_user_page(struct vm_area_struct
> *vma, struct page *page,
> */
> static int __init add_system_ram_resources(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> + u64 i;
>
> - for_each_memblock(memory, reg) {
> + for_each_mem_range(i, &start, &end) {
> struct resource *res;
> - unsigned long base = reg->base;
> - unsigned long size = reg->size;
>
> res = kzalloc(sizeof(struct resource), GFP_KERNEL);
> WARN_ON(!res);
>
> if (res) {
> res->name = "System RAM";
> - res->start = base;
> - res->end = base + size - 1;
> + res->start = start;
> + /*
> + * In memblock, end points to the first byte after
> + * the range while in resourses, end points to the
> + * last byte in the range.
> + */
> + res->end = end - 1;
> res->flags = IORESOURCE_SYSTEM_RAM |
> IORESOURCE_BUSY;
> WARN_ON(request_resource(&iomem_resource, res) <
> 0);
> }
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 6eb4eab79385..079159e97bca 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -123,11 +123,11 @@ static void __init __mapin_ram_chunk(unsigned long
> offset, unsigned long top)
>
> void __init mapin_ram(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t base, end;
> + u64 i;
>
> - for_each_memblock(memory, reg) {
> - phys_addr_t base = reg->base;
> - phys_addr_t top = min(base + reg->size, total_lowmem);
> + for_each_mem_range(i, &base, &end) {
> + phys_addr_t top = min(end, total_lowmem);
>
> if (base >= top)
> continue;
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 7440ba2cdaaa..2abe1165fe56 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -145,21 +145,22 @@ static phys_addr_t dtb_early_pa __initdata;
>
> void __init setup_bootmem(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> phys_addr_t mem_size = 0;
> phys_addr_t total_mem = 0;
> phys_addr_t mem_start, end = 0;
> phys_addr_t vmlinux_end = __pa_symbol(&_end);
> phys_addr_t vmlinux_start = __pa_symbol(&_start);
> + u64 i;
>
> /* Find the memory region containing the kernel */
> - for_each_memblock(memory, reg) {
> - end = reg->base + reg->size;
> + for_each_mem_range(i, &start, &end) {
> + phys_addr_t size = end - start;
> if (!total_mem)
> - mem_start = reg->base;
> - if (reg->base <= vmlinux_start && vmlinux_end <= end)
> - BUG_ON(reg->size == 0);
> - total_mem = total_mem + reg->size;
> + mem_start = start;
> + if (start <= vmlinux_start && vmlinux_end <= end)
> + BUG_ON(size == 0);
> + total_mem = total_mem + size;
> }
>
> /*
> @@ -456,7 +457,7 @@ static void __init setup_vm_final(void)
> {
> uintptr_t va, map_size;
> phys_addr_t pa, start, end;
> - struct memblock_region *reg;
> + u64 i;
>
> /* Set mmu_enabled flag */
> mmu_enabled = true;
> @@ -467,14 +468,9 @@ static void __init setup_vm_final(void)
> PGDIR_SIZE, PAGE_TABLE);
>
> /* Map all memory banks */
> - for_each_memblock(memory, reg) {
> - start = reg->base;
> - end = start + reg->size;
> -
> + for_each_mem_range(i, &start, &end) {
> if (start >= end)
> break;
> - if (memblock_is_nomap(reg))
> - continue;
> if (start <= __pa(PAGE_OFFSET) &&
> __pa(PAGE_OFFSET) < end)
> start = __pa(PAGE_OFFSET);
> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> index 87b4ab3d3c77..12ddd1f6bf70 100644
> --- a/arch/riscv/mm/kasan_init.c
> +++ b/arch/riscv/mm/kasan_init.c
> @@ -85,16 +85,16 @@ static void __init populate(void *start, void *end)
>
> void __init kasan_init(void)
> {
> - struct memblock_region *reg;
> - unsigned long i;
> + phys_addr_t _start, _end;
> + u64 i;
>
> kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
> (void *)kasan_mem_to_shadow((void *)
>
> VMALLOC_END));
>
> - for_each_memblock(memory, reg) {
> - void *start = (void *)__va(reg->base);
> - void *end = (void *)__va(reg->base + reg->size);
> + for_each_mem_range(i, &_start, &_end) {
> + void *start = (void *)_start;
> + void *end = (void *)_end;
>
> if (start >= end)
> break;
> diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
> index 8b284cf6e199..b6c4a0c5ff86 100644
> --- a/arch/s390/kernel/setup.c
> +++ b/arch/s390/kernel/setup.c
> @@ -198,7 +198,7 @@ static void __init conmode_default(void)
> cpcmd("QUERY TERM", query_buffer, 1024, NULL);
> ptr = strstr(query_buffer, "CONMODE");
> /*
> - * Set the conmode to 3215 so that the device recognition
> + * Set the conmode to 3215 so that the device recognition
> * will set the cu_type of the console to 3215. If the
> * conmode is 3270 and we don't set it back then both
> * 3215 and the 3270 driver will try to access the console
> @@ -258,7 +258,7 @@ static inline void setup_zfcpdump(void) {}
>
> /*
> * Reboot, halt and power_off stubs. They just call _machine_restart,
> - * _machine_halt or _machine_power_off.
> + * _machine_halt or _machine_power_off.
> */
>
> void machine_restart(char *command)
> @@ -484,8 +484,9 @@ static struct resource __initdata
> *standard_resources[] = {
> static void __init setup_resources(void)
> {
> struct resource *res, *std_res, *sub_res;
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> int j;
> + u64 i;
>
> code_resource.start = (unsigned long) _text;
> code_resource.end = (unsigned long) _etext - 1;
> @@ -494,7 +495,7 @@ static void __init setup_resources(void)
> bss_resource.start = (unsigned long) __bss_start;
> bss_resource.end = (unsigned long) __bss_stop - 1;
>
> - for_each_memblock(memory, reg) {
> + for_each_mem_range(i, &start, &end) {
> res = memblock_alloc(sizeof(*res), 8);
> if (!res)
> panic("%s: Failed to allocate %zu bytes
> align=0x%x\n",
> @@ -502,8 +503,13 @@ static void __init setup_resources(void)
> res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
>
> res->name = "System RAM";
> - res->start = reg->base;
> - res->end = reg->base + reg->size - 1;
> + res->start = start;
> + /*
> + * In memblock, end points to the first byte after the
> + * range while in resourses, end points to the last byte in
> + * the range.
> + */
> + res->end = end - 1;
> request_resource(&iomem_resource, res);
>
> for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
> @@ -819,14 +825,15 @@ static void __init reserve_kernel(void)
>
> static void __init setup_memory(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> + u64 i;
>
> /*
> * Init storage key for present memory
> */
> - for_each_memblock(memory, reg) {
> - storage_key_init_range(reg->base, reg->base + reg->size);
> - }
> + for_each_mem_range(i, &start, &end)
> + storage_key_init_range(start, end);
> +
> psw_set_key(PAGE_DEFAULT_KEY);
>
> /* Only cosmetics */
> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
> index 8b6282cf7d13..30076ecc3eb7 100644
> --- a/arch/s390/mm/vmem.c
> +++ b/arch/s390/mm/vmem.c
> @@ -399,10 +399,11 @@ int vmem_add_mapping(unsigned long start, unsigned
> long size)
> */
> void __init vmem_map_init(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> + u64 i;
>
> - for_each_memblock(memory, reg)
> - vmem_add_mem(reg->base, reg->size);
> + for_each_mem_range(i, &start, &end)
> + vmem_add_mem(start, end - start);
> __set_memory((unsigned long)_stext,
> (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
> SET_MEMORY_RO | SET_MEMORY_X);
> @@ -428,16 +429,17 @@ void __init vmem_map_init(void)
> */
> static int __init vmem_convert_memory_chunk(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> struct memory_segment *seg;
> + u64 i;
>
> mutex_lock(&vmem_mutex);
> - for_each_memblock(memory, reg) {
> + for_each_mem_range(i, &start, &end) {
> seg = kzalloc(sizeof(*seg), GFP_KERNEL);
> if (!seg)
> panic("Out of memory...\n");
> - seg->start = reg->base;
> - seg->size = reg->size;
> + seg->start = start;
> + seg->size = end - start;
> insert_memory_segment(seg);
> }
> mutex_unlock(&vmem_mutex);
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 02e6e5e0f106..de63c002638e 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -1192,18 +1192,14 @@ int of_node_to_nid(struct device_node *dp)
>
> static void __init add_node_ranges(void)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> unsigned long prev_max;
> + u64 i;
>
> memblock_resized:
> prev_max = memblock.memory.max;
>
> - for_each_memblock(memory, reg) {
> - unsigned long size = reg->size;
> - unsigned long start, end;
> -
> - start = reg->base;
> - end = start + size;
> + for_each_mem_range(i, &start, &end) {
> while (start < end) {
> unsigned long this_end;
> int nid;
> @@ -1211,7 +1207,7 @@ static void __init add_node_ranges(void)
> this_end = memblock_nid_range(start, end, &nid);
>
> numadbg("Setting memblock NUMA node nid[%d] "
> - "start[%lx] end[%lx]\n",
> + "start[%llx] end[%lx]\n",
> nid, start, this_end);
>
> memblock_set_node(start, this_end - start,
> diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
> index 5b2a11a88951..2519ceede64b 100644
> --- a/drivers/bus/mvebu-mbus.c
> +++ b/drivers/bus/mvebu-mbus.c
> @@ -610,23 +610,23 @@ static unsigned int
> armada_xp_mbus_win_remap_offset(int win)
> static void __init
> mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
> {
> - struct memblock_region *r;
> - uint64_t s = 0;
> + phys_addr_t reg_start, reg_end;
> + uint64_t i, s = 0;
>
> - for_each_memblock(memory, r) {
> + for_each_mem_range(i, &reg_start, &reg_end) {
> /*
> * This part of the memory is above 4 GB, so we don't
> * care for the MBus bridge hole.
> */
> - if (r->base >= 0x100000000ULL)
> + if (reg_start >= 0x100000000ULL)
> continue;
>
> /*
> * The MBus bridge hole is at the end of the RAM under
> * the 4 GB limit.
> */
> - if (r->base + r->size > s)
> - s = r->base + r->size;
> + if (reg_end > s)
> + s = reg_end;
> }
>
> *start = s;
> diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
> index 08f812475f5e..484b1ec9a1bc 100644
> --- a/drivers/s390/char/zcore.c
> +++ b/drivers/s390/char/zcore.c
> @@ -148,18 +148,19 @@ static ssize_t zcore_memmap_read(struct file *filp,
> char __user *buf,
>
> static int zcore_memmap_open(struct inode *inode, struct file *filp)
> {
> - struct memblock_region *reg;
> + phys_addr_t start, end;
> char *buf;
> int i = 0;
> + u64 r;
>
> buf = kcalloc(memblock.memory.cnt, CHUNK_INFO_SIZE, GFP_KERNEL);
> if (!buf) {
> return -ENOMEM;
> }
> - for_each_memblock(memory, reg) {
> + for_each_mem_range(r, &start, &end) {
> sprintf(buf + (i++ * CHUNK_INFO_SIZE), "%016llx %016llx ",
> - (unsigned long long) reg->base,
> - (unsigned long long) reg->size);
> + (unsigned long long) start,
> + (unsigned long long) (end - start));
> }
> filp->private_data = buf;
> return nonseekable_open(inode, filp);
> --
> 2.26.2
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
>
[-- Attachment #2: Type: text/html, Size: 54937 bytes --]
^ permalink raw reply
* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Nicholas Piggin @ 2020-07-28 22:53 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-arch, Hillf Danton, mm-commits, Catalin Marinas,
Hugh Dickins, Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
Johannes Weiner, Yu Xu, Andrew Morton, linuxppc-dev, Yang Shi,
Kirill A . Shutemov
In-Reply-To: <CAHk-=wgrgRqeEo-YUgec7yQNkN+_+sHBP-NtCnfktCFEuPHTDQ@mail.gmail.com>
Excerpts from Linus Torvalds's message of July 29, 2020 5:02 am:
> On Tue, Jul 28, 2020 at 3:53 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> The quirk is a problem with coprocessor where it's supposed to
>> invalidate the translation after a fault but it doesn't, so we can get a
>> read-only TLB stuck after something else does a RO->RW upgrade on the
>> TLB. Something like that IIRC. Coprocessors have their own MMU which
>> lives in the nest not the core, so you need a global TLB flush to
>> invalidate that thing.
>
> So I assumed, but it does seem confused.
>
> Why? Because if there are stale translations on the co-processor,
> there's no guarantee that one of the CPU's will have them and take a
> fault.
>
> So I'm not seeing why a core CPU doing spurious TLB invalidation would
> follow from "stale TLB in the Nest".
If the nest MMU access faults, it sends an interrupt to the CPU and
the driver tries to handle the page fault for it (I think that's how
it works).
> If anything, I think "we have a coprocessor that needs to never have
> stale TLB entries" would impact the _regular_ TLB invalidates (by
> update_mmu_cache()) and perhaps make those more aggressive, exactly
> because the coprocessor may not handle the fault as gracefully.
It could be done that way... Hmm although we do have something similar
also in radix__ptep_set_access_flags for the relaxing permissions case
so maybe this is required for not-present faults as well? I'm not
actually sure now.
But it's a bit weird and awkward because it's working around quirks in
the hardware which aren't regular, not because we're _completely_
confused (I hope).
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH] powerpc/powernv/pci: Fix build of pci-ioda.o
From: Oliver O'Halloran @ 2020-07-28 22:50 UTC (permalink / raw)
To: Gustavo Romero; +Cc: linuxppc-dev
In-Reply-To: <20200728223337.40447-1-gromero@linux.ibm.com>
On Wed, Jul 29, 2020 at 8:35 AM Gustavo Romero <gromero@linux.ibm.com> wrote:
>
> Currently pnv_ioda_setup_bus_dma() is outside of a CONFIG_IOMMU_API guard
> and if CONFIG_IOMMU_API=n the build can fail if the compiler sets
> -Werror=unused-function, because pnv_ioda_setup_bus_dma() is only used in
> functions guarded by a CONFIG_IOMMU_API guard.
>
> That issue can be easily reproduced using the skiroot_defconfig. For other
> configs, like powernv_defconfig, that issue is hidden by the fact that
> if CONFIG_IOMMU_SUPPORT is enabled plus other common IOMMU options, like
> CONFIG_OF_IOMMU, by default CONFIG_IOMMU_API is enabled as well. Hence, for
> powernv_defconfig, it's necessary to set CONFIG_IOMMU_SUPPORT=n to make the
> build fail, because CONFIG_PCI=y and pci-ioda.c is included in the build,
> but since CONFIG_IOMMU_SUPPORT=n the CONFIG_IOMMU_API is disabled, breaking
> the build.
>
> This commit fixes that build issue by moving the pnv_ioda_setup_bus_dma()
> inside a CONFIG_IOMMU_API guard, so when CONFIG_IOMMU_API is disabled that
> function is not defined.
I think a fix for this is already in -next.
^ permalink raw reply
* [PATCH] powerpc/powernv/pci: Fix build of pci-ioda.o
From: Gustavo Romero @ 2020-07-28 22:33 UTC (permalink / raw)
To: linuxppc-dev; +Cc: oohall, gromero
Currently pnv_ioda_setup_bus_dma() is outside of a CONFIG_IOMMU_API guard
and if CONFIG_IOMMU_API=n the build can fail if the compiler sets
-Werror=unused-function, because pnv_ioda_setup_bus_dma() is only used in
functions guarded by a CONFIG_IOMMU_API guard.
That issue can be easily reproduced using the skiroot_defconfig. For other
configs, like powernv_defconfig, that issue is hidden by the fact that
if CONFIG_IOMMU_SUPPORT is enabled plus other common IOMMU options, like
CONFIG_OF_IOMMU, by default CONFIG_IOMMU_API is enabled as well. Hence, for
powernv_defconfig, it's necessary to set CONFIG_IOMMU_SUPPORT=n to make the
build fail, because CONFIG_PCI=y and pci-ioda.c is included in the build,
but since CONFIG_IOMMU_SUPPORT=n the CONFIG_IOMMU_API is disabled, breaking
the build.
This commit fixes that build issue by moving the pnv_ioda_setup_bus_dma()
inside a CONFIG_IOMMU_API guard, so when CONFIG_IOMMU_API is disabled that
function is not defined.
Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
---
arch/powerpc/platforms/powernv/pci-ioda.c | 26 +++++++++++------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 73a63efcf855..743d840712da 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1885,19 +1885,6 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
return false;
}
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
- dev->dev.archdata.dma_offset = pe->tce_bypass_base;
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_dma(pe, dev->subordinate);
- }
-}
-
static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
bool real_mode)
{
@@ -2501,6 +2488,19 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
#endif
#ifdef CONFIG_IOMMU_API
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+ dev->dev.archdata.dma_offset = pe->tce_bypass_base;
+
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ }
+}
+
unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels)
{
--
2.17.1
^ permalink raw reply related
* Re: [RESEND PATCH v5 06/11] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Hari Bathini @ 2020-07-28 19:34 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <875za77o05.fsf@mpe.ellerman.id.au>
On 28/07/20 7:14 pm, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
>> index 2df6f4273ddd..8df085a22fd7 100644
>> --- a/arch/powerpc/kexec/file_load_64.c
>> +++ b/arch/powerpc/kexec/file_load_64.c
>> @@ -17,9 +17,21 @@
>> #include <linux/kexec.h>
>> #include <linux/of_fdt.h>
>> #include <linux/libfdt.h>
>> +#include <linux/of_device.h>
>> #include <linux/memblock.h>
>> +#include <linux/slab.h>
>> +#include <asm/drmem.h>
>> #include <asm/kexec_ranges.h>
>>
>> +struct umem_info {
>> + uint64_t *buf; /* data buffer for usable-memory property */
>> + uint32_t idx; /* current index */
>> + uint32_t size; /* size allocated for the data buffer */
>
> Use kernel types please, u64, u32.
>
>> + /* usable memory ranges to look up */
>> + const struct crash_mem *umrngs;
>
> "umrngs".
>
> Given it's part of the umem_info struct could it just be "ranges"?
True. Actually, having crash_mem_range *ranges + u32 nr_ranges and
populating them seems better. Will do that..
>> + return NULL;
>> + }
>
> um_info->size = new_size;
>
>> +
>> + memset(tbuf + um_info->idx, 0, MEM_RANGE_CHUNK_SZ);
>
> Just pass __GFP_ZERO to krealloc?
There are patches submitted to stable fixing a few modules that use
krealloc with __GFP_ZERO. Also, this zeroing is not really needed.
I will drop the memset instead..
Thanks
Hari
^ permalink raw reply
* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Hari Bathini @ 2020-07-28 19:24 UTC (permalink / raw)
To: Michael Ellerman, Andrew Morton
Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <87365b7nx4.fsf@mpe.ellerman.id.au>
On 28/07/20 7:16 pm, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
>> to be filled with OPAL base & entry addresses respectively. Setting
>> these registers allows the kernel to perform OPAL calls before the
>> device tree is parsed.
>
> I'm not convinced we want to do this.
>
> If we do it becomes part of the kexec ABI and we have to honour it into
> the future.
>
> And in practice there are no non-development kernels built with OPAL early
> debugging enabled, so it's not clear it actually helps anyone other than
> developers.
>
Hmmm.. kexec-tools does it since commit d58ad564852c ("kexec/ppc64
Enable early kernel's OPAL calls") for kexec_load syscall. So, we would
be breaking kexec ABI either way, I guess.
Let me put this patch at the end of the series in the respin to let you
decide whether to have it or not..
Thanks
Hari
^ permalink raw reply
* Re: [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
From: Nathan Lynch @ 2020-07-28 19:19 UTC (permalink / raw)
To: Laurent Dufour; +Cc: tyreld, cheloha, linuxppc-dev
In-Reply-To: <bd9225f2-40c9-0460-ba45-c29c920b5f91@linux.ibm.com>
Hi Laurent,
Laurent Dufour <ldufour@linux.ibm.com> writes:
> Le 28/07/2020 à 19:37, Nathan Lynch a écrit :
>> The drmem lmb list can have hundreds of thousands of entries, and
>> unfortunately lookups take the form of linear searches. As long as
>> this is the case, traversals have the potential to monopolize the CPU
>> and provoke lockup reports, workqueue stalls, and the like unless
>> they explicitly yield.
>>
>> Rather than placing cond_resched() calls within various
>> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
>> expression of the loop macro itself so users can't omit it.
>
> Is that not too much to call cond_resched() on every LMB?
>
> Could that be less frequent, every 10, or 100, I don't really know ?
Everything done within for_each_drmem_lmb is relatively heavyweight
already. E.g. calling dlpar_remove_lmb()/dlpar_add_lmb() can take dozens
of milliseconds. I don't think cond_resched() is an expensive check in
this context.
^ permalink raw reply
* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Linus Torvalds @ 2020-07-28 19:02 UTC (permalink / raw)
To: Nicholas Piggin
Cc: linux-arch, Hillf Danton, Yang Shi, Yu Xu, Catalin Marinas,
Hugh Dickins, Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
Johannes Weiner, mm-commits, Andrew Morton, linuxppc-dev,
Kirill A . Shutemov
In-Reply-To: <1595932767.wga6c4yy6a.astroid@bobo.none>
On Tue, Jul 28, 2020 at 3:53 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> The quirk is a problem with coprocessor where it's supposed to
> invalidate the translation after a fault but it doesn't, so we can get a
> read-only TLB stuck after something else does a RO->RW upgrade on the
> TLB. Something like that IIRC. Coprocessors have their own MMU which
> lives in the nest not the core, so you need a global TLB flush to
> invalidate that thing.
So I assumed, but it does seem confused.
Why? Because if there are stale translations on the co-processor,
there's no guarantee that one of the CPU's will have them and take a
fault.
So I'm not seeing why a core CPU doing spurious TLB invalidation would
follow from "stale TLB in the Nest".
If anything, I think "we have a coprocessor that needs to never have
stale TLB entries" would impact the _regular_ TLB invalidates (by
update_mmu_cache()) and perhaps make those more aggressive, exactly
because the coprocessor may not handle the fault as gracefully.
I dunno. I don't know the coprocessor side well enough to judge, I'm
just looking at it from a conceptual standpoint.
Linus
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox