LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 12/15] arch, mm: replace for_each_memblock() with for_each_mem_pfn_range()
From: Mike Rapoport @ 2020-07-28  5:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Christoph Hellwig, Mike Rapoport
In-Reply-To: <20200728051153.1590-1-rppt@kernel.org>

From: Mike Rapoport <rppt@linux.ibm.com>

There are several occurrences of the following pattern:

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		/* do something with start_pfn and end_pfn */
	}

Rather than iterate over all memblock.memory regions and query their start
and end PFNs each time, use the for_each_mem_pfn_range() iterator to get
simpler and clearer code.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/arm/mm/init.c           | 11 ++++-------
 arch/arm64/mm/init.c         | 11 ++++-------
 arch/powerpc/kernel/fadump.c | 11 ++++++-----
 arch/powerpc/mm/mem.c        | 15 ++++++++-------
 arch/powerpc/mm/numa.c       |  7 ++-----
 arch/s390/mm/page-states.c   |  6 ++----
 arch/sh/mm/init.c            |  9 +++------
 mm/memblock.c                |  6 ++----
 mm/sparse.c                  | 10 ++++------
 9 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 626af348eb8f..bb56668b4f54 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -304,16 +304,14 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
  */
 static void __init free_unused_memmap(void)
 {
-	unsigned long start, prev_end = 0;
-	struct memblock_region *reg;
+	unsigned long start, end, prev_end = 0;
+	int i;
 
 	/*
 	 * This relies on each bank being in address order.
 	 * The banks are sorted previously in bootmem_init().
 	 */
-	for_each_memblock(memory, reg) {
-		start = memblock_region_memory_base_pfn(reg);
-
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 #ifdef CONFIG_SPARSEMEM
 		/*
 		 * Take care not to free memmap entries that don't exist
@@ -341,8 +339,7 @@ static void __init free_unused_memmap(void)
 		 * memmap entries are valid from the bank end aligned to
 		 * MAX_ORDER_NR_PAGES.
 		 */
-		prev_end = ALIGN(memblock_region_memory_end_pfn(reg),
-				 MAX_ORDER_NR_PAGES);
+		prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
 	}
 
 #ifdef CONFIG_SPARSEMEM
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1e93cfc7c47a..271a8ea32482 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -473,12 +473,10 @@ static inline void free_memmap(unsigned long start_pfn, unsigned long end_pfn)
  */
 static void __init free_unused_memmap(void)
 {
-	unsigned long start, prev_end = 0;
-	struct memblock_region *reg;
-
-	for_each_memblock(memory, reg) {
-		start = __phys_to_pfn(reg->base);
+	unsigned long start, end, prev_end = 0;
+	int i;
 
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 #ifdef CONFIG_SPARSEMEM
 		/*
 		 * Take care not to free memmap entries that don't exist due
@@ -498,8 +496,7 @@ static void __init free_unused_memmap(void)
 		 * memmap entries are valid from the bank end aligned to
 		 * MAX_ORDER_NR_PAGES.
 		 */
-		prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
-				 MAX_ORDER_NR_PAGES);
+		prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
 	}
 
 #ifdef CONFIG_SPARSEMEM
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 2446a61e3c25..fdbafe417139 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1216,14 +1216,15 @@ static void fadump_free_reserved_memory(unsigned long start_pfn,
  */
 static void fadump_release_reserved_area(u64 start, u64 end)
 {
-	u64 tstart, tend, spfn, epfn;
-	struct memblock_region *reg;
+	u64 tstart, tend, spfn, epfn, reg_spfn, reg_epfn, i;
 
 	spfn = PHYS_PFN(start);
 	epfn = PHYS_PFN(end);
-	for_each_memblock(memory, reg) {
-		tstart = max_t(u64, spfn, memblock_region_memory_base_pfn(reg));
-		tend   = min_t(u64, epfn, memblock_region_memory_end_pfn(reg));
+
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &reg_spfn, &reg_epfn, NULL) {
+		tstart = max_t(u64, spfn, reg_spfn);
+		tend   = min_t(u64, epfn, reg_epfn);
+
 		if (tstart < tend) {
 			fadump_free_reserved_memory(tstart, tend);
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c2c11eb8dcfc..38d1acd7c8ef 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -192,15 +192,16 @@ void __init initmem_init(void)
 /* mark pages that don't exist as nosave */
 static int __init mark_nonram_nosave(void)
 {
-	struct memblock_region *reg, *prev = NULL;
+	unsigned long spfn, epfn, prev = 0;
+	int i;
 
-	for_each_memblock(memory, reg) {
-		if (prev &&
-		    memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg))
-			register_nosave_region(memblock_region_memory_end_pfn(prev),
-					       memblock_region_memory_base_pfn(reg));
-		prev = reg;
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &spfn, &epfn, NULL) {
+		if (prev && prev < spfn)
+			register_nosave_region(prev, spfn);
+
+		prev = epfn;
 	}
+
 	return 0;
 }
 #else /* CONFIG_NEED_MULTIPLE_NODES */
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9fcf2d195830..53254afae725 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -800,17 +800,14 @@ static void __init setup_nonnuma(void)
 	unsigned long total_ram = memblock_phys_mem_size();
 	unsigned long start_pfn, end_pfn;
 	unsigned int nid = 0;
-	struct memblock_region *reg;
+	int i;
 
 	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
 	       top_of_ram, total_ram);
 	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
-	for_each_memblock(memory, reg) {
-		start_pfn = memblock_region_memory_base_pfn(reg);
-		end_pfn = memblock_region_memory_end_pfn(reg);
-
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, NULL) {
 		fake_numa_create_new_node(end_pfn, &nid);
 		memblock_set_node(PFN_PHYS(start_pfn),
 				  PFN_PHYS(end_pfn - start_pfn),
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc141893d028..8909f7b7b053 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -183,9 +183,9 @@ static void mark_kernel_pgd(void)
 
 void __init cmma_init_nodat(void)
 {
-	struct memblock_region *reg;
 	struct page *page;
 	unsigned long start, end, ix;
+	int i;
 
 	if (cmma_flag < 2)
 		return;
@@ -193,9 +193,7 @@ void __init cmma_init_nodat(void)
 	mark_kernel_pgd();
 
 	/* Set all kernel pages not used for page tables to stable/no-dat */
-	for_each_memblock(memory, reg) {
-		start = memblock_region_memory_base_pfn(reg);
-		end = memblock_region_memory_end_pfn(reg);
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 		page = pfn_to_page(start);
 		for (ix = start; ix < end; ix++, page++) {
 			if (__test_and_clear_bit(PG_arch_1, &page->flags))
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 62b8f03ffc80..398ee363e3e3 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -224,15 +224,12 @@ void __init allocate_pgdat(unsigned int nid)
 
 static void __init do_init_bootmem(void)
 {
-	struct memblock_region *reg;
+	unsigned long start_pfn, end_pfn;
+	int i;
 
 	/* Add active regions with valid PFNs. */
-	for_each_memblock(memory, reg) {
-		unsigned long start_pfn, end_pfn;
-		start_pfn = memblock_region_memory_base_pfn(reg);
-		end_pfn = memblock_region_memory_end_pfn(reg);
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, NULL)
 		__add_active_range(0, start_pfn, end_pfn);
-	}
 
 	/* All of system RAM sits in node 0 for the non-NUMA case */
 	allocate_pgdat(0);
diff --git a/mm/memblock.c b/mm/memblock.c
index 824938849f6d..2ad5e6e47215 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1659,12 +1659,10 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
 phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
 {
 	unsigned long pages = 0;
-	struct memblock_region *r;
 	unsigned long start_pfn, end_pfn;
+	int i;
 
-	for_each_memblock(memory, r) {
-		start_pfn = memblock_region_memory_base_pfn(r);
-		end_pfn = memblock_region_memory_end_pfn(r);
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, NULL) {
 		start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
 		end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
 		pages += end_pfn - start_pfn;
diff --git a/mm/sparse.c b/mm/sparse.c
index b2b9a3e34696..8bdaddb40453 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -292,13 +292,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
  */
 void __init memblocks_present(void)
 {
-	struct memblock_region *reg;
+	unsigned long start, end;
+	int i, nid;
 
-	for_each_memblock(memory, reg) {
-		memory_present(memblock_get_region_node(reg),
-			       memblock_region_memory_base_pfn(reg),
-			       memblock_region_memory_end_pfn(reg));
-	}
+	for_each_mem_pfn_range(i, NUMA_NO_NODE, &start, &end, &nid)
+		memory_present(nid, start, end);
 }
 
 /*
-- 
2.26.2


^ permalink raw reply related

* [PATCH 13/15] arch, drivers: replace for_each_memblock() with for_each_mem_range()
From: Mike Rapoport @ 2020-07-28  5:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Christoph Hellwig, Mike Rapoport
In-Reply-To: <20200728051153.1590-1-rppt@kernel.org>

From: Mike Rapoport <rppt@linux.ibm.com>

There are several occurrences of the following pattern:

	for_each_memblock(memory, reg) {
		start = __pfn_to_phys(memblock_region_memory_base_pfn(reg));
		end = __pfn_to_phys(memblock_region_memory_end_pfn(reg));

		/* do something with start and end */
	}

Using the for_each_mem_range() iterator is more appropriate in such cases
and allows simpler and cleaner code.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/arm/kernel/setup.c                  | 18 +++++++----
 arch/arm/mm/mmu.c                        | 39 ++++++++----------------
 arch/arm/mm/pmsa-v7.c                    | 20 ++++++------
 arch/arm/mm/pmsa-v8.c                    | 17 +++++------
 arch/arm/xen/mm.c                        |  7 +++--
 arch/arm64/mm/kasan_init.c               |  8 ++---
 arch/arm64/mm/mmu.c                      | 11 ++-----
 arch/c6x/kernel/setup.c                  |  9 +++---
 arch/microblaze/mm/init.c                |  9 +++---
 arch/mips/cavium-octeon/dma-octeon.c     | 12 ++++----
 arch/mips/kernel/setup.c                 | 31 +++++++++----------
 arch/openrisc/mm/init.c                  |  8 +++--
 arch/powerpc/kernel/fadump.c             | 27 +++++++---------
 arch/powerpc/mm/book3s64/hash_utils.c    | 16 +++++-----
 arch/powerpc/mm/book3s64/radix_pgtable.c | 11 +++----
 arch/powerpc/mm/kasan/kasan_init_32.c    |  8 ++---
 arch/powerpc/mm/mem.c                    | 16 ++++++----
 arch/powerpc/mm/pgtable_32.c             |  8 ++---
 arch/riscv/mm/init.c                     | 24 ++++++---------
 arch/riscv/mm/kasan_init.c               | 10 +++---
 arch/s390/kernel/setup.c                 | 27 ++++++++++------
 arch/s390/mm/vmem.c                      | 16 +++++-----
 arch/sparc/mm/init_64.c                  | 12 +++-----
 drivers/bus/mvebu-mbus.c                 | 12 ++++----
 drivers/s390/char/zcore.c                |  9 +++---
 25 files changed, 187 insertions(+), 198 deletions(-)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index d8e18cdd96d3..3f65d0ac9f63 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -843,19 +843,25 @@ early_param("mem", early_mem);
 
 static void __init request_standard_resources(const struct machine_desc *mdesc)
 {
-	struct memblock_region *region;
+	phys_addr_t start, end, res_end;
 	struct resource *res;
+	u64 i;
 
 	kernel_code.start   = virt_to_phys(_text);
 	kernel_code.end     = virt_to_phys(__init_begin - 1);
 	kernel_data.start   = virt_to_phys(_sdata);
 	kernel_data.end     = virt_to_phys(_end - 1);
 
-	for_each_memblock(memory, region) {
-		phys_addr_t start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
-		phys_addr_t end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
+	for_each_mem_range(i, &start, &end) {
 		unsigned long boot_alias_start;
 
+		/*
+		 * In memblock, end points to the first byte after the
+		 * range while in resources, end points to the last byte in
+		 * the range.
+		 */
+		res_end = end - 1;
+
 		/*
 		 * Some systems have a special memory alias which is only
 		 * used for booting.  We need to advertise this region to
@@ -869,7 +875,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
 				      __func__, sizeof(*res));
 			res->name = "System RAM (boot alias)";
 			res->start = boot_alias_start;
-			res->end = phys_to_idmap(end);
+			res->end = phys_to_idmap(res_end);
 			res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 			request_resource(&iomem_resource, res);
 		}
@@ -880,7 +886,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
 			      sizeof(*res));
 		res->name  = "System RAM";
 		res->start = start;
-		res->end = end;
+		res->end = res_end;
 		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
 		request_resource(&iomem_resource, res);
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 628028bfbb92..a149d9cb4fdb 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1155,9 +1155,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
 
 void __init adjust_lowmem_bounds(void)
 {
-	phys_addr_t memblock_limit = 0;
-	u64 vmalloc_limit;
-	struct memblock_region *reg;
+	phys_addr_t block_start, block_end, memblock_limit = 0;
+	u64 vmalloc_limit, i;
 	phys_addr_t lowmem_limit = 0;
 
 	/*
@@ -1173,26 +1172,18 @@ void __init adjust_lowmem_bounds(void)
 	 * The first usable region must be PMD aligned. Mark its start
 	 * as MEMBLOCK_NOMAP if it isn't
 	 */
-	for_each_memblock(memory, reg) {
-		if (!memblock_is_nomap(reg)) {
-			if (!IS_ALIGNED(reg->base, PMD_SIZE)) {
-				phys_addr_t len;
+	for_each_mem_range(i, &block_start, &block_end) {
+		if (!IS_ALIGNED(block_start, PMD_SIZE)) {
+			phys_addr_t len;
 
-				len = round_up(reg->base, PMD_SIZE) - reg->base;
-				memblock_mark_nomap(reg->base, len);
-			}
-			break;
+			len = round_up(block_start, PMD_SIZE) - block_start;
+			memblock_mark_nomap(block_start, len);
 		}
+		break;
 	}
 
-	for_each_memblock(memory, reg) {
-		phys_addr_t block_start = reg->base;
-		phys_addr_t block_end = reg->base + reg->size;
-
-		if (memblock_is_nomap(reg))
-			continue;
-
-		if (reg->base < vmalloc_limit) {
+	for_each_mem_range(i, &block_start, &block_end) {
+		if (block_start < vmalloc_limit) {
 			if (block_end > lowmem_limit)
 				/*
 				 * Compare as u64 to ensure vmalloc_limit does
@@ -1441,19 +1432,15 @@ static void __init kmap_init(void)
 
 static void __init map_lowmem(void)
 {
-	struct memblock_region *reg;
 	phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START), SECTION_SIZE);
 	phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
+	phys_addr_t start, end;
+	u64 i;
 
 	/* Map all the lowmem memory banks. */
-	for_each_memblock(memory, reg) {
-		phys_addr_t start = reg->base;
-		phys_addr_t end = start + reg->size;
+	for_each_mem_range(i, &start, &end) {
 		struct map_desc map;
 
-		if (memblock_is_nomap(reg))
-			continue;
-
 		if (end > arm_lowmem_limit)
 			end = arm_lowmem_limit;
 		if (start >= end)
diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c
index 699fa2e88725..44b7644a4237 100644
--- a/arch/arm/mm/pmsa-v7.c
+++ b/arch/arm/mm/pmsa-v7.c
@@ -231,10 +231,9 @@ static int __init allocate_region(phys_addr_t base, phys_addr_t size,
 void __init pmsav7_adjust_lowmem_bounds(void)
 {
 	phys_addr_t  specified_mem_size = 0, total_mem_size = 0;
-	struct memblock_region *reg;
-	bool first = true;
 	phys_addr_t mem_start;
 	phys_addr_t mem_end;
+	phys_addr_t reg_start, reg_end;
 	unsigned int mem_max_regions;
 	int num, i;
 
@@ -262,20 +261,19 @@ void __init pmsav7_adjust_lowmem_bounds(void)
 	mem_max_regions -= num;
 #endif
 
-	for_each_memblock(memory, reg) {
-		if (first) {
+	for_each_mem_range(i, &reg_start, &reg_end) {
+		if (i == 0) {
 			phys_addr_t phys_offset = PHYS_OFFSET;
 
 			/*
 			 * Initially only use memory continuous from
 			 * PHYS_OFFSET */
-			if (reg->base != phys_offset)
+			if (reg_start != phys_offset)
 				panic("First memory bank must be contiguous from PHYS_OFFSET");
 
-			mem_start = reg->base;
-			mem_end = reg->base + reg->size;
-			specified_mem_size = reg->size;
-			first = false;
+			mem_start = reg_start;
+			mem_end = reg_end
+			specified_mem_size = mem_end - mem_start;
 		} else {
 			/*
 			 * memblock auto merges contiguous blocks, remove
@@ -283,8 +281,8 @@ void __init pmsav7_adjust_lowmem_bounds(void)
 			 * blocks separately while iterating)
 			 */
 			pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n",
-				  &mem_end, &reg->base);
-			memblock_remove(reg->base, 0 - reg->base);
+				  &mem_end, &reg_start);
+			memblock_remove(reg_start, 0 - reg_start);
 			break;
 		}
 	}
diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c
index 0d7d5fb59247..b39e74b48437 100644
--- a/arch/arm/mm/pmsa-v8.c
+++ b/arch/arm/mm/pmsa-v8.c
@@ -94,20 +94,19 @@ static __init bool is_region_fixed(int number)
 void __init pmsav8_adjust_lowmem_bounds(void)
 {
 	phys_addr_t mem_end;
-	struct memblock_region *reg;
-	bool first = true;
+	phys_addr_t reg_start, reg_end;
+	int i;
 
-	for_each_memblock(memory, reg) {
-		if (first) {
+	for_each_mem_range(i, &reg_start, &reg_end) {
+		if (i == 0) {
 			phys_addr_t phys_offset = PHYS_OFFSET;
 
 			/*
 			 * Initially only use memory continuous from
 			 * PHYS_OFFSET */
-			if (reg->base != phys_offset)
+			if (reg_start != phys_offset)
 				panic("First memory bank must be contiguous from PHYS_OFFSET");
-			mem_end = reg->base + reg->size;
-			first = false;
+			mem_end = reg_end;
 		} else {
 			/*
 			 * memblock auto merges contiguous blocks, remove
@@ -115,8 +114,8 @@ void __init pmsav8_adjust_lowmem_bounds(void)
 			 * blocks separately while iterating)
 			 */
 			pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n",
-				  &mem_end, &reg->base);
-			memblock_remove(reg->base, 0 - reg->base);
+				  &mem_end, &reg_start);
+			memblock_remove(reg_start, 0 - reg_start);
 			break;
 		}
 	}
diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
index d40e9e5fc52b..05f24ff41e36 100644
--- a/arch/arm/xen/mm.c
+++ b/arch/arm/xen/mm.c
@@ -24,11 +24,12 @@
 
 unsigned long xen_get_swiotlb_free_pages(unsigned int order)
 {
-	struct memblock_region *reg;
+	phys_addr_t base;
 	gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
+	u64 i;
 
-	for_each_memblock(memory, reg) {
-		if (reg->base < (phys_addr_t)0xffffffff) {
+	for_each_mem_range(i, &base, NULL) {
+		if (base < (phys_addr_t)0xffffffff) {
 			if (IS_ENABLED(CONFIG_ZONE_DMA32))
 				flags |= __GFP_DMA32;
 			else
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 7291b26ce788..1faa086f9193 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -212,7 +212,7 @@ void __init kasan_init(void)
 {
 	u64 kimg_shadow_start, kimg_shadow_end;
 	u64 mod_shadow_start, mod_shadow_end;
-	struct memblock_region *reg;
+	phys_addr_t _start, _end;
 	int i;
 
 	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
@@ -246,9 +246,9 @@ void __init kasan_init(void)
 		kasan_populate_early_shadow((void *)mod_shadow_end,
 					    (void *)kimg_shadow_start);
 
-	for_each_memblock(memory, reg) {
-		void *start = (void *)__phys_to_virt(reg->base);
-		void *end = (void *)__phys_to_virt(reg->base + reg->size);
+	for_each_mem_range(i, &start, &end) {
+		void *_start = (void *)__phys_to_virt(_start);
+		void *end = (void *)__phys_to_virt(_end);
 
 		if (start >= end)
 			break;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 1df25f26571d..327264fb83fb 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -461,8 +461,9 @@ static void __init map_mem(pgd_t *pgdp)
 {
 	phys_addr_t kernel_start = __pa_symbol(_text);
 	phys_addr_t kernel_end = __pa_symbol(__init_begin);
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	int flags = 0;
+	u64 i;
 
 	if (rodata_full || debug_pagealloc_enabled())
 		flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
@@ -481,15 +482,9 @@ static void __init map_mem(pgd_t *pgdp)
 #endif
 
 	/* map all the memory banks */
-	for_each_memblock(memory, reg) {
-		phys_addr_t start = reg->base;
-		phys_addr_t end = start + reg->size;
-
+	for_each_mem_range(i, &start, &end) {
 		if (start >= end)
 			break;
-		if (memblock_is_nomap(reg))
-			continue;
-
 		__map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
 	}
 
diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c
index 8ef35131f999..9254c3b794a5 100644
--- a/arch/c6x/kernel/setup.c
+++ b/arch/c6x/kernel/setup.c
@@ -287,7 +287,8 @@ notrace void __init machine_init(unsigned long dt_ptr)
 
 void __init setup_arch(char **cmdline_p)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	u64 i;
 
 	printk(KERN_INFO "Initializing kernel\n");
 
@@ -351,9 +352,9 @@ void __init setup_arch(char **cmdline_p)
 	disable_caching(ram_start, ram_end - 1);
 
 	/* Set caching of external RAM used by Linux */
-	for_each_memblock(memory, reg)
-		enable_caching(CACHE_REGION_START(reg->base),
-			       CACHE_REGION_START(reg->base + reg->size - 1));
+	for_each_mem_range(i, &start, &end)
+		enable_caching(CACHE_REGION_START(start),
+			       CACHE_REGION_START(end - 1));
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/*
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 49e0c241f9b1..15403b5adfcf 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -106,13 +106,14 @@ static void __init paging_init(void)
 void __init setup_memory(void)
 {
 #ifndef CONFIG_MMU
-	struct memblock_region *reg;
 	u32 kernel_align_start, kernel_align_size;
+	phys_addr_t start, end;
+	u64 i;
 
 	/* Find main memory where is the kernel */
-	for_each_memblock(memory, reg) {
-		memory_start = (u32)reg->base;
-		lowmem_size = reg->size;
+	for_each_mem_range(i, &start, &end) {
+		memory_start = start;
+		lowmem_size = end - start;
 		if ((memory_start <= (u32)_text) &&
 			((u32)_text <= (memory_start + lowmem_size - 1))) {
 			memory_size = lowmem_size;
diff --git a/arch/mips/cavium-octeon/dma-octeon.c b/arch/mips/cavium-octeon/dma-octeon.c
index 14ea680d180e..d938c1f7c1e1 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -190,25 +190,25 @@ char *octeon_swiotlb;
 
 void __init plat_swiotlb_setup(void)
 {
-	struct memblock_region *mem;
+	phys_addr_t start, end;
 	phys_addr_t max_addr;
 	phys_addr_t addr_size;
 	size_t swiotlbsize;
 	unsigned long swiotlb_nslabs;
+	u64 i;
 
 	max_addr = 0;
 	addr_size = 0;
 
-	for_each_memblock(memory, mem) {
+	for_each_mem_range(i, &start, &end) {
 		/* These addresses map low for PCI. */
 		if (mem->base > 0x410000000ull && !OCTEON_IS_OCTEON2())
 			continue;
 
-		addr_size += mem->size;
-
-		if (max_addr < mem->base + mem->size)
-			max_addr = mem->base + mem->size;
+		addr_size += (end - start);
 
+		if (max_addr < end)
+			max_addr = end;
 	}
 
 	swiotlbsize = PAGE_SIZE;
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 7b537fa2035d..eaac1b66026d 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -300,8 +300,9 @@ static void __init bootmem_init(void)
 
 static void __init bootmem_init(void)
 {
-	struct memblock_region *mem;
 	phys_addr_t ramstart, ramend;
+	phys_addr_t start, end;
+	u64 i;
 
 	ramstart = memblock_start_of_DRAM();
 	ramend = memblock_end_of_DRAM();
@@ -338,18 +339,13 @@ static void __init bootmem_init(void)
 
 	min_low_pfn = ARCH_PFN_OFFSET;
 	max_pfn = PFN_DOWN(ramend);
-	for_each_memblock(memory, mem) {
-		unsigned long start = memblock_region_memory_base_pfn(mem);
-		unsigned long end = memblock_region_memory_end_pfn(mem);
-
+	for_each_mem_range(i, &start, &end) {
 		/*
 		 * Skip highmem here so we get an accurate max_low_pfn if low
 		 * memory stops short of high memory.
 		 * If the region overlaps HIGHMEM_START, end is clipped so
 		 * max_pfn excludes the highmem portion.
 		 */
-		if (memblock_is_nomap(mem))
-			continue;
 		if (start >= PFN_DOWN(HIGHMEM_START))
 			continue;
 		if (end > PFN_DOWN(HIGHMEM_START))
@@ -458,13 +454,12 @@ early_param("memmap", early_parse_memmap);
 unsigned long setup_elfcorehdr, setup_elfcorehdr_size;
 static int __init early_parse_elfcorehdr(char *p)
 {
-	struct memblock_region *mem;
+	phys_addr_t start, end;
+	u64 i;
 
 	setup_elfcorehdr = memparse(p, &p);
 
-	 for_each_memblock(memory, mem) {
-		unsigned long start = mem->base;
-		unsigned long end = start + mem->size;
+	for_each_mem_range(i, &start, &end) {
 		if (setup_elfcorehdr >= start && setup_elfcorehdr < end) {
 			/*
 			 * Reserve from the elf core header to the end of
@@ -728,7 +723,8 @@ static void __init arch_mem_init(char **cmdline_p)
 
 static void __init resource_init(void)
 {
-	struct memblock_region *region;
+	phys_addr_t start, end;
+	u64 i;
 
 	if (UNCAC_BASE != IO_BASE)
 		return;
@@ -740,9 +736,7 @@ static void __init resource_init(void)
 	bss_resource.start = __pa_symbol(&__bss_start);
 	bss_resource.end = __pa_symbol(&__bss_stop) - 1;
 
-	for_each_memblock(memory, region) {
-		phys_addr_t start = PFN_PHYS(memblock_region_memory_base_pfn(region));
-		phys_addr_t end = PFN_PHYS(memblock_region_memory_end_pfn(region)) - 1;
+	for_each_mem_range(i, &start, &end) {
 		struct resource *res;
 
 		res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES);
@@ -751,7 +745,12 @@ static void __init resource_init(void)
 			      sizeof(struct resource));
 
 		res->start = start;
-		res->end = end;
+		/*
+		 * In memblock, end points to the first byte after the
+		 * range while in resources, end points to the last byte in
+		 * the range.
+		 */
+		res->end = end - 1;
 		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 		res->name = "System RAM";
 
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index 3d7c79c7745d..8348feaaf46e 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -64,6 +64,7 @@ extern const char _s_kernel_ro[], _e_kernel_ro[];
  */
 static void __init map_ram(void)
 {
+	phys_addr_t start, end;
 	unsigned long v, p, e;
 	pgprot_t prot;
 	pgd_t *pge;
@@ -71,6 +72,7 @@ static void __init map_ram(void)
 	pud_t *pue;
 	pmd_t *pme;
 	pte_t *pte;
+	u64 i;
 	/* These mark extents of read-only kernel pages...
 	 * ...from vmlinux.lds.S
 	 */
@@ -78,9 +80,9 @@ static void __init map_ram(void)
 
 	v = PAGE_OFFSET;
 
-	for_each_memblock(memory, region) {
-		p = (u32) region->base & PAGE_MASK;
-		e = p + (u32) region->size;
+	for_each_mem_range(i, &start, &end) {
+		p = (u32) start & PAGE_MASK;
+		e = (u32) end;
 
 		v = (u32) __va(p);
 		pge = pgd_offset_k(v);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index fdbafe417139..435b98d069eb 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -180,13 +180,13 @@ int is_fadump_active(void)
  */
 static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
 {
-	struct memblock_region *reg;
+	phys_addr_t reg_start, reg_end;
 	bool ret = false;
-	u64 start, end;
+	u64 i, start, end;
 
-	for_each_memblock(memory, reg) {
-		start = max_t(u64, d_start, reg->base);
-		end = min_t(u64, d_end, (reg->base + reg->size));
+	for_each_mem_range(i, &reg_start, &reg_end) {
+		start = max_t(u64, d_start, reg_start);
+		end = min_t(u64, d_end, reg_end));
 		if (d_start < end) {
 			/* Memory hole from d_start to start */
 			if (start > d_start)
@@ -413,7 +413,7 @@ static int __init fadump_get_boot_mem_regions(void)
 {
 	unsigned long base, size, cur_size, hole_size, last_end;
 	unsigned long mem_size = fw_dump.boot_memory_size;
-	struct memblock_region *reg;
+	phys_addr_t reg_start, reg_end;
 	int ret = 1;
 
 	fw_dump.boot_mem_regs_cnt = 0;
@@ -421,9 +421,8 @@ static int __init fadump_get_boot_mem_regions(void)
 	last_end = 0;
 	hole_size = 0;
 	cur_size = 0;
-	for_each_memblock(memory, reg) {
-		base = reg->base;
-		size = reg->size;
+	for_each_mem_range(i, &reg_start, &reg_end) {
+		size = reg_end - reg_start;
 		hole_size += (base - last_end);
 
 		if ((cur_size + size) >= mem_size) {
@@ -959,9 +958,8 @@ static int fadump_init_elfcore_header(char *bufp)
  */
 static int fadump_setup_crash_memory_ranges(void)
 {
-	struct memblock_region *reg;
-	u64 start, end;
-	int i, ret;
+	u64 i, start, end;
+	int ret;
 
 	pr_debug("Setup crash memory ranges.\n");
 	crash_mrange_info.mem_range_cnt = 0;
@@ -979,10 +977,7 @@ static int fadump_setup_crash_memory_ranges(void)
 			return ret;
 	}
 
-	for_each_memblock(memory, reg) {
-		start = (u64)reg->base;
-		end = start + (u64)reg->size;
-
+	for_each_mem_range(i, &start, end) {
 		/*
 		 * skip the memory chunk that is already added
 		 * (0 through boot_memory_top).
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 468169e33c86..9ba76b075b11 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -7,7 +7,7 @@
  *
  * SMP scalability work:
  *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- * 
+ *
  *    Module name: htab.c
  *
  *    Description:
@@ -862,8 +862,8 @@ static void __init htab_initialize(void)
 	unsigned long table;
 	unsigned long pteg_count;
 	unsigned long prot;
-	unsigned long base = 0, size = 0;
-	struct memblock_region *reg;
+	phys_addr_t base = 0, size = 0, end;
+	u64 i;
 
 	DBG(" -> htab_initialize()\n");
 
@@ -879,7 +879,7 @@ static void __init htab_initialize(void)
 	/*
 	 * Calculate the required size of the htab.  We want the number of
 	 * PTEGs to equal one half the number of real pages.
-	 */ 
+	 */
 	htab_size_bytes = htab_get_table_size();
 	pteg_count = htab_size_bytes >> 7;
 
@@ -889,7 +889,7 @@ static void __init htab_initialize(void)
 	    firmware_has_feature(FW_FEATURE_PS3_LV1)) {
 		/* Using a hypervisor which owns the htab */
 		htab_address = NULL;
-		_SDR1 = 0; 
+		_SDR1 = 0;
 #ifdef CONFIG_FA_DUMP
 		/*
 		 * If firmware assisted dump is active firmware preserves
@@ -955,9 +955,9 @@ static void __init htab_initialize(void)
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
 	/* create bolted the linear mapping in the hash table */
-	for_each_memblock(memory, reg) {
-		base = (unsigned long)__va(reg->base);
-		size = reg->size;
+	for_each_mem_range(i, &base, &end) {
+		size = end - base;
+		base = (unsigned long)__va(base);
 
 		DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
 		    base, size, prot);
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index bb00e0cba119..65657b920847 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -318,28 +318,27 @@ static int __meminit create_physical_mapping(unsigned long start,
 static void __init radix_init_pgtable(void)
 {
 	unsigned long rts_field;
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	u64 i;
 
 	/* We don't support slb for radix */
 	mmu_slb_size = 0;
 	/*
 	 * Create the linear mapping, using standard page size for now
 	 */
-	for_each_memblock(memory, reg) {
+	for_each_mem_range(i, &start, &end) {
 		/*
 		 * The memblock allocator  is up at this point, so the
 		 * page tables will be allocated within the range. No
 		 * need or a node (which we don't have yet).
 		 */
 
-		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
+		if (end >= RADIX_VMALLOC_START) {
 			pr_warn("Outside the supported range\n");
 			continue;
 		}
 
-		WARN_ON(create_physical_mapping(reg->base,
-						reg->base + reg->size,
-						-1, PAGE_KERNEL));
+		WARN_ON(create_physical_mapping(start, end, -1, PAGE_KERNEL));
 	}
 
 	/* Find out how many PID bits are supported */
diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c
index 0760e1e754e4..6e73434e4e41 100644
--- a/arch/powerpc/mm/kasan/kasan_init_32.c
+++ b/arch/powerpc/mm/kasan/kasan_init_32.c
@@ -120,11 +120,11 @@ static void __init kasan_unmap_early_shadow_vmalloc(void)
 static void __init kasan_mmu_init(void)
 {
 	int ret;
-	struct memblock_region *reg;
+	phys_addr_t base, end;
+	u64 i;
 
-	for_each_memblock(memory, reg) {
-		phys_addr_t base = reg->base;
-		phys_addr_t top = min(base + reg->size, total_lowmem);
+	for_each_mem_range(i, &base, &end) {
+		phys_addr_t top = min(end, total_lowmem);
 
 		if (base >= top)
 			continue;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 38d1acd7c8ef..0248b6d58fcd 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -593,20 +593,24 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page,
  */
 static int __init add_system_ram_resources(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	u64 i;
 
-	for_each_memblock(memory, reg) {
+	for_each_mem_range(i, &start, &end) {
 		struct resource *res;
-		unsigned long base = reg->base;
-		unsigned long size = reg->size;
 
 		res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 		WARN_ON(!res);
 
 		if (res) {
 			res->name = "System RAM";
-			res->start = base;
-			res->end = base + size - 1;
+			res->start = start;
+			/*
+			 * In memblock, end points to the first byte after
+			 * the range while in resources, end points to the
+			 * last byte in the range.
+			 */
+			res->end = end - 1;
 			res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 			WARN_ON(request_resource(&iomem_resource, res) < 0);
 		}
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6eb4eab79385..079159e97bca 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -123,11 +123,11 @@ static void __init __mapin_ram_chunk(unsigned long offset, unsigned long top)
 
 void __init mapin_ram(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t base, end;
+	u64 i;
 
-	for_each_memblock(memory, reg) {
-		phys_addr_t base = reg->base;
-		phys_addr_t top = min(base + reg->size, total_lowmem);
+	for_each_mem_range(i, &base, &end) {
+		phys_addr_t top = min(end, total_lowmem);
 
 		if (base >= top)
 			continue;
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 7440ba2cdaaa..2abe1165fe56 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -145,21 +145,22 @@ static phys_addr_t dtb_early_pa __initdata;
 
 void __init setup_bootmem(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	phys_addr_t mem_size = 0;
 	phys_addr_t total_mem = 0;
 	phys_addr_t mem_start, end = 0;
 	phys_addr_t vmlinux_end = __pa_symbol(&_end);
 	phys_addr_t vmlinux_start = __pa_symbol(&_start);
+	u64 i;
 
 	/* Find the memory region containing the kernel */
-	for_each_memblock(memory, reg) {
-		end = reg->base + reg->size;
+	for_each_mem_range(i, &start, &end) {
+		phys_addr_t size = end - start;
 		if (!total_mem)
-			mem_start = reg->base;
-		if (reg->base <= vmlinux_start && vmlinux_end <= end)
-			BUG_ON(reg->size == 0);
-		total_mem = total_mem + reg->size;
+			mem_start = start;
+		if (start <= vmlinux_start && vmlinux_end <= end)
+			BUG_ON(size == 0);
+		total_mem = total_mem + size;
 	}
 
 	/*
@@ -456,7 +457,7 @@ static void __init setup_vm_final(void)
 {
 	uintptr_t va, map_size;
 	phys_addr_t pa, start, end;
-	struct memblock_region *reg;
+	u64 i;
 
 	/* Set mmu_enabled flag */
 	mmu_enabled = true;
@@ -467,14 +468,9 @@ static void __init setup_vm_final(void)
 			   PGDIR_SIZE, PAGE_TABLE);
 
 	/* Map all memory banks */
-	for_each_memblock(memory, reg) {
-		start = reg->base;
-		end = start + reg->size;
-
+	for_each_mem_range(i, &start, &end) {
 		if (start >= end)
 			break;
-		if (memblock_is_nomap(reg))
-			continue;
 		if (start <= __pa(PAGE_OFFSET) &&
 		    __pa(PAGE_OFFSET) < end)
 			start = __pa(PAGE_OFFSET);
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 87b4ab3d3c77..12ddd1f6bf70 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -85,16 +85,16 @@ static void __init populate(void *start, void *end)
 
 void __init kasan_init(void)
 {
-	struct memblock_region *reg;
-	unsigned long i;
+	phys_addr_t _start, _end;
+	u64 i;
 
 	kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
 				    (void *)kasan_mem_to_shadow((void *)
 								VMALLOC_END));
 
-	for_each_memblock(memory, reg) {
-		void *start = (void *)__va(reg->base);
-		void *end = (void *)__va(reg->base + reg->size);
+	for_each_mem_range(i, &_start, &_end) {
+		void *start = (void *)_start;
+		void *end = (void *)_end;
 
 		if (start >= end)
 			break;
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 8b284cf6e199..b6c4a0c5ff86 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -198,7 +198,7 @@ static void __init conmode_default(void)
 		cpcmd("QUERY TERM", query_buffer, 1024, NULL);
 		ptr = strstr(query_buffer, "CONMODE");
 		/*
-		 * Set the conmode to 3215 so that the device recognition 
+		 * Set the conmode to 3215 so that the device recognition
 		 * will set the cu_type of the console to 3215. If the
 		 * conmode is 3270 and we don't set it back then both
 		 * 3215 and the 3270 driver will try to access the console
@@ -258,7 +258,7 @@ static inline void setup_zfcpdump(void) {}
 
  /*
  * Reboot, halt and power_off stubs. They just call _machine_restart,
- * _machine_halt or _machine_power_off. 
+ * _machine_halt or _machine_power_off.
  */
 
 void machine_restart(char *command)
@@ -484,8 +484,9 @@ static struct resource __initdata *standard_resources[] = {
 static void __init setup_resources(void)
 {
 	struct resource *res, *std_res, *sub_res;
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	int j;
+	u64 i;
 
 	code_resource.start = (unsigned long) _text;
 	code_resource.end = (unsigned long) _etext - 1;
@@ -494,7 +495,7 @@ static void __init setup_resources(void)
 	bss_resource.start = (unsigned long) __bss_start;
 	bss_resource.end = (unsigned long) __bss_stop - 1;
 
-	for_each_memblock(memory, reg) {
+	for_each_mem_range(i, &start, &end) {
 		res = memblock_alloc(sizeof(*res), 8);
 		if (!res)
 			panic("%s: Failed to allocate %zu bytes align=0x%x\n",
@@ -502,8 +503,13 @@ static void __init setup_resources(void)
 		res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
 
 		res->name = "System RAM";
-		res->start = reg->base;
-		res->end = reg->base + reg->size - 1;
+		res->start = start;
+		/*
+		 * In memblock, end points to the first byte after the
+		 * range while in resources, end points to the last byte in
+		 * the range.
+		 */
+		res->end = end - 1;
 		request_resource(&iomem_resource, res);
 
 		for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
@@ -819,14 +825,15 @@ static void __init reserve_kernel(void)
 
 static void __init setup_memory(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	u64 i;
 
 	/*
 	 * Init storage key for present memory
 	 */
-	for_each_memblock(memory, reg) {
-		storage_key_init_range(reg->base, reg->base + reg->size);
-	}
+	for_each_mem_range(i, &start, &end)
+		storage_key_init_range(start, end);
+
 	psw_set_key(PAGE_DEFAULT_KEY);
 
 	/* Only cosmetics */
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 8b6282cf7d13..30076ecc3eb7 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -399,10 +399,11 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
  */
 void __init vmem_map_init(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
+	u64 i;
 
-	for_each_memblock(memory, reg)
-		vmem_add_mem(reg->base, reg->size);
+	for_each_mem_range(i, &start, &end)
+		vmem_add_mem(start, end - start);
 	__set_memory((unsigned long)_stext,
 		     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
 		     SET_MEMORY_RO | SET_MEMORY_X);
@@ -428,16 +429,17 @@ void __init vmem_map_init(void)
  */
 static int __init vmem_convert_memory_chunk(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	struct memory_segment *seg;
+	u64 i;
 
 	mutex_lock(&vmem_mutex);
-	for_each_memblock(memory, reg) {
+	for_each_mem_range(i, &start, &end) {
 		seg = kzalloc(sizeof(*seg), GFP_KERNEL);
 		if (!seg)
 			panic("Out of memory...\n");
-		seg->start = reg->base;
-		seg->size = reg->size;
+		seg->start = start;
+		seg->size = end - start;
 		insert_memory_segment(seg);
 	}
 	mutex_unlock(&vmem_mutex);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 02e6e5e0f106..de63c002638e 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1192,18 +1192,14 @@ int of_node_to_nid(struct device_node *dp)
 
 static void __init add_node_ranges(void)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	unsigned long prev_max;
+	u64 i;
 
 memblock_resized:
 	prev_max = memblock.memory.max;
 
-	for_each_memblock(memory, reg) {
-		unsigned long size = reg->size;
-		unsigned long start, end;
-
-		start = reg->base;
-		end = start + size;
+	for_each_mem_range(i, &start, &end) {
 		while (start < end) {
 			unsigned long this_end;
 			int nid;
@@ -1211,7 +1207,7 @@ static void __init add_node_ranges(void)
 			this_end = memblock_nid_range(start, end, &nid);
 
 			numadbg("Setting memblock NUMA node nid[%d] "
-				"start[%lx] end[%lx]\n",
+				"start[%llx] end[%lx]\n",
 				nid, start, this_end);
 
 			memblock_set_node(start, this_end - start,
diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
index 5b2a11a88951..2519ceede64b 100644
--- a/drivers/bus/mvebu-mbus.c
+++ b/drivers/bus/mvebu-mbus.c
@@ -610,23 +610,23 @@ static unsigned int armada_xp_mbus_win_remap_offset(int win)
 static void __init
 mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
 {
-	struct memblock_region *r;
-	uint64_t s = 0;
+	phys_addr_t reg_start, reg_end;
+	uint64_t i, s = 0;
 
-	for_each_memblock(memory, r) {
+	for_each_mem_range(i, &reg_start, &reg_end) {
 		/*
 		 * This part of the memory is above 4 GB, so we don't
 		 * care for the MBus bridge hole.
 		 */
-		if (r->base >= 0x100000000ULL)
+		if (reg_start >= 0x100000000ULL)
 			continue;
 
 		/*
 		 * The MBus bridge hole is at the end of the RAM under
 		 * the 4 GB limit.
 		 */
-		if (r->base + r->size > s)
-			s = r->base + r->size;
+		if (reg_end > s)
+			s = reg_end;
 	}
 
 	*start = s;
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 08f812475f5e..484b1ec9a1bc 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -148,18 +148,19 @@ static ssize_t zcore_memmap_read(struct file *filp, char __user *buf,
 
 static int zcore_memmap_open(struct inode *inode, struct file *filp)
 {
-	struct memblock_region *reg;
+	phys_addr_t start, end;
 	char *buf;
 	int i = 0;
+	u64 r;
 
 	buf = kcalloc(memblock.memory.cnt, CHUNK_INFO_SIZE, GFP_KERNEL);
 	if (!buf) {
 		return -ENOMEM;
 	}
-	for_each_memblock(memory, reg) {
+	for_each_mem_range(r, &start, &end) {
 		sprintf(buf + (i++ * CHUNK_INFO_SIZE), "%016llx %016llx ",
-			(unsigned long long) reg->base,
-			(unsigned long long) reg->size);
+			(unsigned long long) start,
+			(unsigned long long) (end - start));
 	}
 	filp->private_data = buf;
 	return nonseekable_open(inode, filp);
-- 
2.26.2


^ permalink raw reply related

* [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Mike Rapoport @ 2020-07-28  5:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Christoph Hellwig, Mike Rapoport
In-Reply-To: <20200728051153.1590-1-rppt@kernel.org>

From: Mike Rapoport <rppt@linux.ibm.com>

numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
regions to set node ID in memblock.reserved and then traverses
memblock.reserved to update reserved_nodemask to include node IDs that were
set in the first loop.

Remove redundant traversal over memblock.reserved and update
reserved_nodemask while iterating over numa_meminfo.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/x86/mm/numa.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8ee952038c80..4078abd33938 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -498,31 +498,25 @@ static void __init numa_clear_kernel_node_hotplug(void)
 	 * and use those ranges to set the nid in memblock.reserved.
 	 * This will split up the memblock regions along node
 	 * boundaries and will set the node IDs as well.
+	 *
+	 * The nid will also be set in reserved_nodemask which is later
+	 * used to clear MEMBLOCK_HOTPLUG flag.
+	 *
+	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+	 *   numa_meminfo might not include all memblock.reserved
+	 *   memory ranges, because quirks such as trim_snb_memory()
+	 *   reserve specific pages for Sandy Bridge graphics.
+	 *   These ranges will remain with nid == MAX_NUMNODES. ]
 	 */
 	for (i = 0; i < numa_meminfo.nr_blks; i++) {
 		struct numa_memblk *mb = numa_meminfo.blk + i;
 		int ret;
 
 		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
+		node_set(mb->nid, reserved_nodemask);
 		WARN_ON_ONCE(ret);
 	}
 
-	/*
-	 * Now go over all reserved memblock regions, to construct a
-	 * node mask of all kernel reserved memory areas.
-	 *
-	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
-	 *   numa_meminfo might not include all memblock.reserved
-	 *   memory ranges, because quirks such as trim_snb_memory()
-	 *   reserve specific pages for Sandy Bridge graphics. ]
-	 */
-	for_each_memblock(reserved, mb_region) {
-		int nid = memblock_get_region_node(mb_region);
-
-		if (nid != MAX_NUMNODES)
-			node_set(nid, reserved_nodemask);
-	}
-
 	/*
 	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
 	 * belonging to the reserved node mask.
-- 
2.26.2


^ permalink raw reply related

* [PATCH 15/15] memblock: remove 'type' parameter from for_each_memblock()
From: Mike Rapoport @ 2020-07-28  5:11 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Christoph Hellwig, Mike Rapoport
In-Reply-To: <20200728051153.1590-1-rppt@kernel.org>

From: Mike Rapoport <rppt@linux.ibm.com>

for_each_memblock() is used exclusively to iterate over memblock.memory in
a few places that use data from memblock_region rather than the memory
ranges.

Remove type parameter from the for_each_memblock() iterator to improve
encapsulation of memblock internals from its users.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/arm64/kernel/setup.c      |  2 +-
 arch/arm64/mm/numa.c           |  2 +-
 arch/mips/netlogic/xlp/setup.c |  2 +-
 include/linux/memblock.h       | 10 +++++++---
 mm/memblock.c                  |  4 ++--
 mm/page_alloc.c                |  8 ++++----
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 93b3844cf442..23da7908cbed 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -217,7 +217,7 @@ static void __init request_standard_resources(void)
 	if (!standard_resources)
 		panic("%s: Failed to allocate %zu bytes\n", __func__, res_size);
 
-	for_each_memblock(memory, region) {
+	for_each_memblock(region) {
 		res = &standard_resources[i++];
 		if (memblock_is_nomap(region)) {
 			res->name  = "reserved";
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 0cbdbcc885fb..08721d2c0b79 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -350,7 +350,7 @@ static int __init numa_register_nodes(void)
 	struct memblock_region *mblk;
 
 	/* Check that valid nid is set to memblks */
-	for_each_memblock(memory, mblk) {
+	for_each_memblock(mblk) {
 		int mblk_nid = memblock_get_region_node(mblk);
 
 		if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
diff --git a/arch/mips/netlogic/xlp/setup.c b/arch/mips/netlogic/xlp/setup.c
index 1a0fc5b62ba4..e69d9fc468cf 100644
--- a/arch/mips/netlogic/xlp/setup.c
+++ b/arch/mips/netlogic/xlp/setup.c
@@ -70,7 +70,7 @@ static void nlm_fixup_mem(void)
 	const int pref_backup = 512;
 	struct memblock_region *mem;
 
-	for_each_memblock(memory, mem) {
+	for_each_memblock(mem) {
 		memblock_remove(mem->base + mem->size - pref_backup,
 			pref_backup);
 	}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d70c2835e913..c901cb8ecf92 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -527,9 +527,13 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 	return PFN_UP(reg->base + reg->size);
 }
 
-#define for_each_memblock(memblock_type, region)					\
-	for (region = memblock.memblock_type.regions;					\
-	     region < (memblock.memblock_type.regions + memblock.memblock_type.cnt);	\
+/**
+ * for_each_memblock - iterate over registered memory regions
+ * @region: loop variable
+ */
+#define for_each_memblock(region)					\
+	for (region = memblock.memory.regions;				\
+	     region < (memblock.memory.regions + memblock.memory.cnt);	\
 	     region++)
 
 extern void *alloc_large_system_hash(const char *tablename,
diff --git a/mm/memblock.c b/mm/memblock.c
index 2ad5e6e47215..550bb72cf6cb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1694,7 +1694,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
 	 * the memory memblock regions, if the @limit exceeds the total size
 	 * of those regions, max_addr will keep original value PHYS_ADDR_MAX
 	 */
-	for_each_memblock(memory, r) {
+	for_each_memblock(r) {
 		if (limit <= r->size) {
 			max_addr = r->base + limit;
 			break;
@@ -1864,7 +1864,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
 	phys_addr_t start, end, orig_start, orig_end;
 	struct memblock_region *r;
 
-	for_each_memblock(memory, r) {
+	for_each_memblock(r) {
 		orig_start = r->base;
 		orig_end = r->base + r->size;
 		start = round_up(orig_start, align);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95af111d69d3..8a19f46dc86e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5927,7 +5927,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
 
 	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
 		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
-			for_each_memblock(memory, r) {
+			for_each_memblock(r) {
 				if (*pfn < memblock_region_memory_end_pfn(r))
 					break;
 			}
@@ -6528,7 +6528,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
 		unsigned long start_pfn, end_pfn;
 		struct memblock_region *r;
 
-		for_each_memblock(memory, r) {
+		for_each_memblock(r) {
 			start_pfn = clamp(memblock_region_memory_base_pfn(r),
 					  zone_start_pfn, zone_end_pfn);
 			end_pfn = clamp(memblock_region_memory_end_pfn(r),
@@ -7122,7 +7122,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	 * options.
 	 */
 	if (movable_node_is_enabled()) {
-		for_each_memblock(memory, r) {
+		for_each_memblock(r) {
 			if (!memblock_is_hotpluggable(r))
 				continue;
 
@@ -7143,7 +7143,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	if (mirrored_kernelcore) {
 		bool mem_below_4gb_not_mirrored = false;
 
-		for_each_memblock(memory, r) {
+		for_each_memblock(r) {
 			if (memblock_is_mirror(r))
 				continue;
 
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH v2 0/2] Rework secure memslot dropping
From: Paul Mackerras @ 2020-07-28  5:52 UTC (permalink / raw)
  To: Ram Pai
  Cc: ldufour, cclaudio, kvm-ppc, bharata, sathnaga, aneesh.kumar,
	sukadev, linuxppc-dev, bauerman, david
In-Reply-To: <1595877869-2746-1-git-send-email-linuxram@us.ibm.com>

On Mon, Jul 27, 2020 at 12:24:27PM -0700, Ram Pai wrote:
> From: Laurent Dufour <ldufour@linux.ibm.com>
> 
> When doing memory hotplug on a secure VM, the secure pages are not well
> cleaned from the secure device when dropping the memslot.  This silent
> error, is then preventing the SVM to reboot properly after the following
> sequence of commands are run in the Qemu monitor:
> 
> device_add pc-dimm,id=dimm1,memdev=mem1
> device_del dimm1
> device_add pc-dimm,id=dimm1,memdev=mem1
> 
> At reboot time, when the kernel is booting again and switching to the
> secure mode, the page_in is failing for the pages in the memslot because
> the cleanup was not done properly, because the memslot is flagged as
> invalid during the hot unplug and thus the page fault mechanism is not
> triggered.
> 
> To prevent that during the memslot dropping, instead of belonging on the
> page fault mechanism to trigger the page out of the secured pages, it seems
> simpler to directly call the function doing the page out. This way the
> state of the memslot is not interfering on the page out process.
> 
> This series applies on top of the Ram's one titled:
> "[v6 0/5] Migrate non-migrated pages of a SVM."

Thanks, series applied to my kvm-ppc-next branch and pull request sent.

Paul.

^ permalink raw reply

* Re: [PATCH v6 0/5] Migrate non-migrated pages of a SVM.
From: Paul Mackerras @ 2020-07-28  5:51 UTC (permalink / raw)
  To: Ram Pai
  Cc: ldufour, cclaudio, kvm-ppc, bharata, sathnaga, aneesh.kumar,
	sukadev, linuxppc-dev, bauerman, david
In-Reply-To: <1595873238-26184-1-git-send-email-linuxram@us.ibm.com>

On Mon, Jul 27, 2020 at 11:07:13AM -0700, Ram Pai wrote:
> The time to switch a VM to Secure-VM, increases by the size of the VM.
> A 100GB VM takes about 7minutes. This is unacceptable.  This linear
> increase is caused by a suboptimal behavior by the Ultravisor and the
> Hypervisor.  The Ultravisor unnecessarily migrates all the GFN of the
> VM from normal-memory to secure-memory. It has to just migrate the
> necessary and sufficient GFNs.
> 
> However when the optimization is incorporated in the Ultravisor, the
> Hypervisor starts misbehaving. The Hypervisor has a inbuilt assumption
> that the Ultravisor will explicitly request to migrate, each and every
> GFN of the VM. If only necessary and sufficient GFNs are requested for
> migration, the Hypervisor continues to manage the remaining GFNs as
> normal GFNs. This leads to memory corruption; manifested
> consistently when the SVM reboots.
> 
> The same is true, when a memory slot is hotplugged into a SVM. The
> Hypervisor expects the ultravisor to request migration of all GFNs to
> secure-GFN.  But the hypervisor cannot handle any H_SVM_PAGE_IN
> requests from the Ultravisor, done in the context of
> UV_REGISTER_MEM_SLOT ucall.  This problem manifests as random errors
> in the SVM, when a memory-slot is hotplugged.
> 
> This patch series automatically migrates the non-migrated pages of a
> SVM, and thus solves the problem.
> 
> Testing: Passed rigorous testing using various sized SVMs.

Thanks, series applied to my kvm-ppc-next branch and pull request sent.

Paul.

^ permalink raw reply

* Re: [PATCH 02/15] dma-contiguous: simplify cma_early_percent_memory()
From: Christoph Hellwig @ 2020-07-28  6:37 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728051153.1590-3-rppt@kernel.org>

On Tue, Jul 28, 2020 at 08:11:40AM +0300, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> The memory size calculation in cma_early_percent_memory() traverses
> memblock.memory rather than simply call memblock_phys_mem_size(). The
> comment in that function suggests that at some point there should have been
> call to memblock_analyze() before memblock_phys_mem_size() could be used.
> As of now, there is no memblock_analyze() at all and
> memblock_phys_mem_size() can be used as soon as cold-plug memory is
> registerd with memblock.
> 
> Replace loop over memblock.memory with a call to memblock_phys_mem_size().
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCHv3 1/2] powerpc/pseries: group lmb operation and memblock's
From: Pingfan Liu @ 2020-07-28  6:39 UTC (permalink / raw)
  To: Nathan Lynch; +Cc: Kexec Mailing List, linuxppc-dev, Hari Bathini
In-Reply-To: <87ft9i1egt.fsf@linux.ibm.com>

On Thu, Jul 23, 2020 at 10:41 PM Nathan Lynch <nathanl@linux.ibm.com> wrote:
>
> Pingfan Liu <kernelfans@gmail.com> writes:
> > This patch prepares for the incoming patch which swaps the order of KOBJ_
> > uevent and dt's updating.
> >
> > It has no functional effect, just groups lmb operation and memblock's in
> > order to insert dt updating operation easily, and makes it easier to
> > review.
>
> ...
>
> > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > index 5d545b7..1a3ac3b 100644
> > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > @@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *);
> >  static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> >  {
> >       unsigned long block_sz;
> > -     int rc;
> > +     phys_addr_t base_addr;
> > +     int rc, nid;
> >
> >       if (!lmb_is_removable(lmb))
> >               return -EINVAL;
> > @@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> >       if (rc)
> >               return rc;
> >
> > +     base_addr = lmb->base_addr;
> > +     nid = lmb->nid;
> >       block_sz = pseries_memory_block_size();
> >
> > -     __remove_memory(lmb->nid, lmb->base_addr, block_sz);
> > -
> > -     /* Update memory regions for memory remove */
> > -     memblock_remove(lmb->base_addr, block_sz);
> > -
> >       invalidate_lmb_associativity_index(lmb);
> >       lmb_clear_nid(lmb);
> >       lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> >
> > +     __remove_memory(nid, base_addr, block_sz);
> > +
> > +     /* Update memory regions for memory remove */
> > +     memblock_remove(base_addr, block_sz);
> > +
> >       return 0;
> >  }
>
> I don't understand; the commit message should not claim this has no
> functional effect when it changes the order of operations like
> this. Maybe this is an improvement over the current behavior, but it's
> not explained why it would be.
One group of functions, whose names contain lmb, is powerpc specific
and is used to form the dt.

The other group __remove_memory() and memblock_remove() are integrated
with linux mm.

And [2/2] arranges dt-updating just before __remove_memory()

Thanks,
Pingfan

^ permalink raw reply

* [Bug 206203] kmemleak reports various leaks in drivers/of/unittest.c
From: bugzilla-daemon @ 2020-07-28  7:18 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-206203-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=206203

--- Comment #16 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 290639
  --> https://bugzilla.kernel.org/attachment.cgi?id=290639&action=edit
dmesg (kernel 5.8-rc7, PowerMac G4 3,6)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [Bug 206203] kmemleak reports various leaks in drivers/of/unittest.c
From: bugzilla-daemon @ 2020-07-28  7:19 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-206203-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=206203

--- Comment #17 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 290641
  --> https://bugzilla.kernel.org/attachment.cgi?id=290641&action=edit
kmemleak output (kernel 5.8-rc7, PowerMac G4 3,6)

Also happens on my G4 DP.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH 03/15] arm, xtensa: simplify initialization of high memory pages
From: Max Filippov @ 2020-07-28  8:09 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: open list:SUPERH, Peter Zijlstra, Dave Hansen, linux-mips,
	Linux Memory Management List, Paul Mackerras,
	open list:SPARC + UltraSPAR..., linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, maintainer:X86 ARCHITECTURE..., Russell King,
	Mike Rapoport, clang-built-linux, Ingo Molnar, Catalin Marinas,
	moderated list:H8/300 ARCHITECTURE,
	open list:TENSILICA XTENSA PORT (xtensa), openrisc,
	Borislav Petkov, Andy Lutomirski, Paul Walmsley, Thomas Gleixner,
	linux-arm-kernel, Michal Simek, linuxppc-dev, LKML, iommu,
	Palmer Dabbelt, Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728051153.1590-4-rppt@kernel.org>

On Mon, Jul 27, 2020 at 10:12 PM Mike Rapoport <rppt@kernel.org> wrote:
>
> From: Mike Rapoport <rppt@linux.ibm.com>
>
> The function free_highpages() in both arm and xtensa essentially open-code
> for_each_free_mem_range() loop to detect high memory pages that were not
> reserved and that should be initialized and passed to the buddy allocator.
>
> Replace open-coded implementation of for_each_free_mem_range() with usage
> of memblock API to simplify the code.
>
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/arm/mm/init.c    | 48 +++++++------------------------------
>  arch/xtensa/mm/init.c | 55 ++++++++-----------------------------------
>  2 files changed, 18 insertions(+), 85 deletions(-)

For the xtensa part:
Reviewed-by: Max Filippov <jcmvbkbc@gmail.com>
Tested-by: Max Filippov <jcmvbkbc@gmail.com>

-- 
Thanks.
-- Max

^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Ingo Molnar @ 2020-07-28 10:44 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728051153.1590-15-rppt@kernel.org>


* Mike Rapoport <rppt@kernel.org> wrote:

> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> regions to set node ID in memblock.reserved and than traverses
> memblock.reserved to update reserved_nodemask to include node IDs that were
> set in the first loop.
> 
> Remove redundant traversal over memblock.reserved and update
> reserved_nodemask while iterating over numa_meminfo.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/x86/mm/numa.c | 26 ++++++++++----------------
>  1 file changed, 10 insertions(+), 16 deletions(-)

I suspect you'd like to carry this in the -mm tree?

Acked-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo

^ permalink raw reply

* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Nicholas Piggin @ 2020-07-28 10:53 UTC (permalink / raw)
  To: linux-arch, Linus Torvalds, Yang Shi
  Cc: Hillf Danton, mm-commits, Catalin Marinas, Hugh Dickins,
	Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
	Johannes Weiner, Yu Xu, Andrew Morton, linuxppc-dev,
	Kirill A . Shutemov
In-Reply-To: <CAHk-=wha6f0gF1SJg96R77h0oTuc_oO7-37wD=mYGy6TyJOwbQ@mail.gmail.com>

Excerpts from Linus Torvalds's message of July 28, 2020 4:37 am:
> [ Adding linux-arch, just to make other architectures aware of this issue too.
> 
>   We have a "flush_tlb_fix_spurious_fault()" thing to take care of the
> "TLB may contain stale entries, we can't take the same fault over and
> over again" situation.
> 
>   On x86, it's a no-op, because x86 doesn't do that. x86 will re-walk
> the page tables - or possibly just always invalidate the faulting TLB
> entry - before taking a fault, so there can be no long-term stale
> TLB's.

[snip]

>   It looks like powerpc people at least thought about this, and only
> do it if there is a coprocessor. Which sounds a bit confused, but I
> don't know the rules.

I'm not sure about ppc32 and 64e, I'm almost certain they should do a 
local flush if anything, and someone with a good understanding of the 
ISAs and CPUs might be able to nop it entirely. I agree global can't 
ever really make sense (except as a default because we have no generic 
local flush).

powerpc/64s reloads translations after taking a fault, so it's fine with 
a nop here.

The quirk is a problem with coprocessor where it's supposed to 
invalidate the translation after a fault but it doesn't, so we can get a 
read-only TLB stuck after something else does a RO->RW upgrade on the 
TLB. Something like that IIRC.  Coprocessors have their own MMU which 
lives in the nest not the core, so you need a global TLB flush to
invalidate that thing.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Mike Rapoport @ 2020-07-28 10:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728104440.GA222284@gmail.com>

On Tue, Jul 28, 2020 at 12:44:40PM +0200, Ingo Molnar wrote:
> 
> * Mike Rapoport <rppt@kernel.org> wrote:
> 
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> > regions to set node ID in memblock.reserved and than traverses
> > memblock.reserved to update reserved_nodemask to include node IDs that were
> > set in the first loop.
> > 
> > Remove redundant traversal over memblock.reserved and update
> > reserved_nodemask while iterating over numa_meminfo.
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  arch/x86/mm/numa.c | 26 ++++++++++----------------
> >  1 file changed, 10 insertions(+), 16 deletions(-)
> 
> I suspect you'd like to carry this in the -mm tree?

Yes.
 
> Acked-by: Ingo Molnar <mingo@kernel.org>

Thanks!

> Thanks,
> 
> 	Ingo

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Baoquan He @ 2020-07-28 11:02 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728051153.1590-15-rppt@kernel.org>

On 07/28/20 at 08:11am, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> regions to set node ID in memblock.reserved and than traverses
> memblock.reserved to update reserved_nodemask to include node IDs that were
> set in the first loop.
> 
> Remove redundant traversal over memblock.reserved and update
> reserved_nodemask while iterating over numa_meminfo.
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/x86/mm/numa.c | 26 ++++++++++----------------
>  1 file changed, 10 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8ee952038c80..4078abd33938 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -498,31 +498,25 @@ static void __init numa_clear_kernel_node_hotplug(void)
>  	 * and use those ranges to set the nid in memblock.reserved.
>  	 * This will split up the memblock regions along node
>  	 * boundaries and will set the node IDs as well.
> +	 *
> +	 * The nid will also be set in reserved_nodemask which is later
> +	 * used to clear MEMBLOCK_HOTPLUG flag.
> +	 *
> +	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
> +	 *   numa_meminfo might not include all memblock.reserved
> +	 *   memory ranges, because quirks such as trim_snb_memory()
> +	 *   reserve specific pages for Sandy Bridge graphics.
> +	 *   These ranges will remain with nid == MAX_NUMNODES. ]
>  	 */
>  	for (i = 0; i < numa_meminfo.nr_blks; i++) {
>  		struct numa_memblk *mb = numa_meminfo.blk + i;
>  		int ret;
>  
>  		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
> +		node_set(mb->nid, reserved_nodemask);

Really? This will set all node id into reserved_nodemask. But in the
current code, it's setting nid into memblock reserved region which
interleaves with numa_meminfo, then get those nid and set it in
reserved_nodemask. This is so different, with my understanding. Please
correct me if I am wrong.

Thanks
Baoquan

>  		WARN_ON_ONCE(ret);
>  	}
>  
> -	/*
> -	 * Now go over all reserved memblock regions, to construct a
> -	 * node mask of all kernel reserved memory areas.
> -	 *
> -	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
> -	 *   numa_meminfo might not include all memblock.reserved
> -	 *   memory ranges, because quirks such as trim_snb_memory()
> -	 *   reserve specific pages for Sandy Bridge graphics. ]
> -	 */
> -	for_each_memblock(reserved, mb_region) {
> -		int nid = memblock_get_region_node(mb_region);
> -
> -		if (nid != MAX_NUMNODES)
> -			node_set(nid, reserved_nodemask);
> -	}
> -
>  	/*
>  	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
>  	 * belonging to the reserved node mask.
> -- 
> 2.26.2
> 
> 


^ permalink raw reply

* Re: [PATCH 1/2] lockdep: improve current->(hard|soft)irqs_enabled synchronisation with actual irq state
From: Nicholas Piggin @ 2020-07-28 11:22 UTC (permalink / raw)
  To: peterz
  Cc: linux-arch, Alexey Kardashevskiy, Will Deacon, linux-kernel,
	Ingo Molnar, linuxppc-dev
In-Reply-To: <20200726121138.GC119549@hirez.programming.kicks-ass.net>

Excerpts from peterz@infradead.org's message of July 26, 2020 10:11 pm:
> On Sun, Jul 26, 2020 at 02:14:34PM +1000, Nicholas Piggin wrote:
>> Excerpts from Peter Zijlstra's message of July 26, 2020 6:26 am:
> 
>> > Which is 'funny' when it interleaves like:
>> > 
>> > 	local_irq_disable();
>> > 	...
>> > 	local_irq_enable()
>> > 	  trace_hardirqs_on();
>> > 	  <NMI/>
>> > 	  raw_local_irq_enable();
>> > 
>> > Because then it will undo the trace_hardirqs_on() we just did. With the
>> > result that both tracing and lockdep will see a hardirqs-disable without
>> > a matching enable, while the hardware state is enabled.
>> 
>> Seems like an arch problem -- why not disable if it was enabled only?
>> I guess the local_irq tracing calls are a mess so maybe they copied 
>> those.
> 
> Because, as I wrote earlier, then we can miss updating software state.
> So your proposal has:
> 
> 	raw_local_irq_disable()
> 	<NMI>
> 	  if (!arch_irqs_disabled(regs->flags) // false
> 	    trace_hardirqs_off();
> 
> 	  // tracing/lockdep still think IRQs are enabled
> 	  // hardware IRQ state is disabled.

... and then lockdep_nmi_enter can disable IRQs if they were enabled?

The only reason it's done this way as opposed to a much simpler counter 
increment/decrement AFAIKS is to avoid some overhead of calling 
trace_hardirqs_on/off (which seems a bit dubious but let's go with it).

In that case the lockdep_nmi_enter code is the right spot to clean up 
that gap vs NMIs. I guess there's an argument that arch_nmi_enter could
do it. I don't see the problem with fixing it up here though, this is a 
slow path so it doesn't matter if we have some more logic for it.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Ingo Molnar @ 2020-07-28 11:31 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728105602.GB3655207@kernel.org>


* Mike Rapoport <rppt@kernel.org> wrote:

> On Tue, Jul 28, 2020 at 12:44:40PM +0200, Ingo Molnar wrote:
> > 
> > * Mike Rapoport <rppt@kernel.org> wrote:
> > 
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > 
> > > numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> > > regions to set node ID in memblock.reserved and than traverses
> > > memblock.reserved to update reserved_nodemask to include node IDs that were
> > > set in the first loop.
> > > 
> > > Remove redundant traversal over memblock.reserved and update
> > > reserved_nodemask while iterating over numa_meminfo.
> > > 
> > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > > ---
> > >  arch/x86/mm/numa.c | 26 ++++++++++----------------
> > >  1 file changed, 10 insertions(+), 16 deletions(-)
> > 
> > I suspect you'd like to carry this in the -mm tree?
> 
> Yes.
>  
> > Acked-by: Ingo Molnar <mingo@kernel.org>
> 
> Thanks!

Assuming it is correct and works. :-)

Thanks,

	Ingo

^ permalink raw reply

* Re: [RESEND PATCH v5 03/11] powerpc/kexec_file: add helper functions for getting memory ranges
From: Michael Ellerman @ 2020-07-28 12:58 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Thiago Jung Bauermann, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159579222211.5790.10294144969496171475.stgit@hbathini>

Hi Hari,

Some comments inline ...

Hari Bathini <hbathini@linux.ibm.com> writes:
> diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c
> new file mode 100644
> index 000000000000..21bea1b78443
> --- /dev/null
> +++ b/arch/powerpc/kexec/ranges.c
> @@ -0,0 +1,417 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * powerpc code to implement the kexec_file_load syscall
> + *
> + * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
> + * Copyright (C) 2004  IBM Corp.
> + * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
> + * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
> + * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
> + * Copyright (C) 2020  IBM Corporation
> + *
> + * Based on kexec-tools' kexec-ppc64.c, fs2dt.c.
> + * Heavily modified for the kernel by
> + * Hari Bathini <hbathini@linux.ibm.com>.

Please just use your name, email addresses bit rot. It's in the commit
log anyway.

> + */
> +
> +#undef DEBUG
    ^
Don't do that in new code please.

> +#define pr_fmt(fmt) "kexec ranges: " fmt
> +
> +#include <linux/sort.h>
> +#include <linux/kexec.h>
> +#include <linux/of_device.h>
> +#include <linux/slab.h>
> +#include <asm/sections.h>
> +#include <asm/kexec_ranges.h>
> +
> +/**
> + * get_max_nr_ranges - Get the max no. of ranges crash_mem structure
> + *                     could hold, given the size allocated for it.
> + * @size:              Allocation size of crash_mem structure.
> + *
> + * Returns the maximum no. of ranges.
> + */
> +static inline unsigned int get_max_nr_ranges(size_t size)
> +{
> +	return ((size - sizeof(struct crash_mem)) /
> +		sizeof(struct crash_mem_range));
> +}
> +
> +/**
> + * get_mem_rngs_size - Get the allocated size of mrngs based on
> + *                     max_nr_ranges and chunk size.
> + * @mrngs:             Memory ranges.

mrngs is not a great name, what about memory_ranges or ranges?

Ditto everywhere else you use mrngs.

> + *
> + * Returns the maximum size of @mrngs.
> + */
> +static inline size_t get_mem_rngs_size(struct crash_mem *mrngs)
> +{
> +	size_t size;
> +
> +	if (!mrngs)
> +		return 0;
> +
> +	size = (sizeof(struct crash_mem) +
> +		(mrngs->max_nr_ranges * sizeof(struct crash_mem_range)));
> +
> +	/*
> +	 * Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
> +	 * So, align to get the actual length.
> +	 */
> +	return ALIGN(size, MEM_RANGE_CHUNK_SZ);
> +}
> +
> +/**
> + * __add_mem_range - add a memory range to memory ranges list.
> + * @mem_ranges:      Range list to add the memory range to.
> + * @base:            Base address of the range to add.
> + * @size:            Size of the memory range to add.
> + *
> + * (Re)allocates memory, if needed.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int __add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
> +{
> +	struct crash_mem *mrngs = *mem_ranges;
> +
> +	if ((mrngs == NULL) || (mrngs->nr_ranges == mrngs->max_nr_ranges)) {

(mrngs == NULL) should just be !mrngs.

> +		mrngs = realloc_mem_ranges(mem_ranges);
> +		if (!mrngs)
> +			return -ENOMEM;
> +	}
> +
> +	mrngs->ranges[mrngs->nr_ranges].start = base;
> +	mrngs->ranges[mrngs->nr_ranges].end = base + size - 1;
> +	pr_debug("Added memory range [%#016llx - %#016llx] at index %d\n",
> +		 base, base + size - 1, mrngs->nr_ranges);
> +	mrngs->nr_ranges++;
> +	return 0;
> +}
> +
> +/**
> + * __merge_memory_ranges - Merges the given memory ranges list.
> + * @mem_ranges:            Range list to merge.
> + *
> + * Assumes a sorted range list.
> + *
> + * Returns nothing.
> + */

A lot of this code is annoyingly similar to the memblock code, though
the internals of that are all static these days.

I guess for now we'll just have to add all this. Maybe in future it can
be consolidated.

> +static void __merge_memory_ranges(struct crash_mem *mrngs)
> +{
> +	struct crash_mem_range *rngs;
> +	int i, idx;
> +
> +	if (!mrngs)
> +		return;
> +
> +	idx = 0;
> +	rngs = &mrngs->ranges[0];
> +	for (i = 1; i < mrngs->nr_ranges; i++) {
> +		if (rngs[i].start <= (rngs[i-1].end + 1))
> +			rngs[idx].end = rngs[i].end;
> +		else {
> +			idx++;
> +			if (i == idx)
> +				continue;
> +
> +			rngs[idx] = rngs[i];
> +		}
> +	}
> +	mrngs->nr_ranges = idx + 1;
> +}
> +
> +/**
> + * realloc_mem_ranges - reallocate mem_ranges with size incremented
> + *                      by MEM_RANGE_CHUNK_SZ. Frees up the old memory,
> + *                      if memory allocation fails.
> + * @mem_ranges:         Memory ranges to reallocate.
> + *
> + * Returns pointer to reallocated memory on success, NULL otherwise.
> + */
> +struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges)
> +{
> +	struct crash_mem *mrngs = *mem_ranges;
> +	unsigned int nr_ranges;
> +	size_t size;
> +
> +	size = get_mem_rngs_size(mrngs);
> +	nr_ranges = mrngs ? mrngs->nr_ranges : 0;
> +
> +	size += MEM_RANGE_CHUNK_SZ;
> +	mrngs = krealloc(*mem_ranges, size, GFP_KERNEL);
> +	if (!mrngs) {
> +		kfree(*mem_ranges);
> +		*mem_ranges = NULL;
> +		return NULL;
> +	}
> +
> +	mrngs->nr_ranges = nr_ranges;
> +	mrngs->max_nr_ranges = get_max_nr_ranges(size);
> +	*mem_ranges = mrngs;
> +
> +	return mrngs;
> +}
> +
> +/**
> + * add_mem_range - Updates existing memory range, if there is an overlap.
> + *                 Else, adds a new memory range.
> + * @mem_ranges:    Range list to add the memory range to.
> + * @base:          Base address of the range to add.
> + * @size:          Size of the memory range to add.
> + *
> + * (Re)allocates memory, if needed.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size)
> +{
> +	struct crash_mem *mrngs = *mem_ranges;
> +	u64 mstart, mend, end;
> +	unsigned int i;
> +
> +	if (!size)
> +		return 0;
> +
> +	end = base + size - 1;
> +
> +	if ((mrngs == NULL) || (mrngs->nr_ranges == 0))
> +		return __add_mem_range(mem_ranges, base, size);
> +
> +	for (i = 0; i < mrngs->nr_ranges; i++) {
> +		mstart = mrngs->ranges[i].start;
> +		mend = mrngs->ranges[i].end;
> +		if (base < mend && end > mstart) {
> +			if (base < mstart)
> +				mrngs->ranges[i].start = base;
> +			if (end > mend)
> +				mrngs->ranges[i].end = end;
> +			return 0;
> +		}
> +	}
> +
> +	return __add_mem_range(mem_ranges, base, size);
> +}
> +
> +/**
> + * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range(s) to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_tce_mem_ranges(struct crash_mem **mem_ranges)

Not sure this and the other add_foo_mem_ranges() really belong in this patch.

> +{
> +	struct device_node *dn;
> +	int ret = 0;
> +
> +	for_each_node_by_type(dn, "pci") {
> +		u64 base;
> +		u32 size;
> +		int rc;

Do you really need ret and rc?

> +		/*
> +		 * It is ok to have pci nodes without tce. So, ignore
> +		 * any read errors here.
> +		 */
> +		rc = of_property_read_u64(dn, "linux,tce-base", &base);
> +		rc |= of_property_read_u32(dn, "linux,tce-size", &size);
> +		if (rc)
> +			continue;
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +		if (ret)
> +			break;
                        ^
                        dn leaked.
> +	}
> +
> +	return ret;
> +}
> +
> +/**
> + * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
> + *                        if the initrd was retained.
> + * @mem_ranges:           Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_initrd_mem_range(struct crash_mem **mem_ranges)
> +{
> +	u64 base, end;
> +	char *str;
> +	int ret;
> +
> +	/* This range means something only if initrd was retained */
> +	str = strstr(saved_command_line, "retain_initrd");
> +	if (!str)
> +		return 0;

Unfortunate that we have to go and scan the command line again. But I
don't see a better way ATM.

Could be more concise:

	if (!strstr(saved_command_line, "retain_initrd"))
		return 0;

> +
> +	ret = of_property_read_u64(of_chosen, "linux,initrd-start", &base);
> +	ret |= of_property_read_u64(of_chosen, "linux,initrd-end", &end);
> +	if (!ret)
> +		ret = add_mem_range(mem_ranges, base, end - base + 1);
> +	return ret;
> +}
> +
> +#ifdef CONFIG_PPC_BOOK3S_64
> +/**
> + * add_htab_mem_range - Adds htab range to the given memory ranges list,
> + *                      if it exists
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_htab_mem_range(struct crash_mem **mem_ranges)
> +{
> +	if (!htab_address)
> +		return 0;
> +
> +	return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
> +}
> +#endif
> +
> +/**
> + * add_kernel_mem_range - Adds kernel text region to the given
> + *                        memory ranges list.
> + * @mem_ranges:           Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_kernel_mem_range(struct crash_mem **mem_ranges)
> +{
> +	return add_mem_range(mem_ranges, 0, __pa(_end));
> +}
> +
> +/**
> + * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_rtas_mem_range(struct crash_mem **mem_ranges)
> +{
> +	struct device_node *dn;
> +	int ret = 0;
> +
> +	dn = of_find_node_by_path("/rtas");
> +	if (dn) {
> +		u32 base, size;
> +
> +		ret = of_property_read_u32(dn, "linux,rtas-base", &base);
> +		ret |= of_property_read_u32(dn, "rtas-size", &size);
> +		if (ret)
> +			goto out;
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +	}
> +
> +out:
> +	of_node_put(dn);
> +	return ret;
> +}

Or:
	struct device_node *dn;
        u32 base, size;
	int rc;

	dn = of_find_node_by_path("/rtas");
	if (!dn)
        	return 0;

	rc  = of_property_read_u32(dn, "linux,rtas-base", &base);
	rc |= of_property_read_u32(dn, "rtas-size", &size);
	if (rc == 0)
		rc = add_mem_range(mem_ranges, base, size);

	of_node_put(dn);
	return rc;
}


> +
> +/**
> + * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_opal_mem_range(struct crash_mem **mem_ranges)
> +{
> +	struct device_node *dn;
> +	int ret = 0;
> +
> +	dn = of_find_node_by_path("/ibm,opal");
> +	if (dn) {
> +		u64 base, size;
> +
> +		ret = of_property_read_u64(dn, "opal-base-address", &base);
> +		ret |= of_property_read_u64(dn, "opal-runtime-size", &size);
> +		if (ret)
> +			goto out;
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +	}
> +
> +out:
> +	of_node_put(dn);
> +	return ret;
> +}
> +
> +/**
> + * add_reserved_ranges - Adds "/reserved-ranges" regions exported by f/w
> + *                       to the given memory ranges list.
> + * @mem_ranges:          Range list to add the memory ranges to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_reserved_ranges(struct crash_mem **mem_ranges)
> +{
> +	int n_mem_addr_cells, n_mem_size_cells, i, len, cells, ret = 0;
> +	const __be32 *prop;
> +
> +	prop = of_get_property(of_root, "reserved-ranges", &len);
> +	if (!prop)
> +		return 0;
> +
> +	of_node_get(of_root);

You shouldn't need to get the root node, you already used it above anyway.

> +	n_mem_addr_cells = of_n_addr_cells(of_root);
> +	n_mem_size_cells = of_n_size_cells(of_root);
> +	cells = n_mem_addr_cells + n_mem_size_cells;
> +
> +	/* Each reserved range is an (address,size) pair */
> +	for (i = 0; i < (len / (sizeof(*prop) * cells)); i++) {
                                       ^
                                       just u32 would be clearer I think.



cheers

^ permalink raw reply

* [PATCH v4 0/3] Add support for divde[.] and divdeu[.] instruction emulation
From: Balamuruhan S @ 2020-07-28 13:03 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev

Hi All,

This patchset adds support to emulate divde, divde., divdeu and divdeu.
instructions and testcases for it.

Resend v4: rebased on latest powerpc next branch

Changes in v4:
-------------
Fix review comments from Naveen,
* use TEST_DIVDEU() instead of the wrongly used TEST_DIVDEU_DOT() in
  the divdeu testcase.
* Include `acked-by` tag from Naveen for the series.
* Rebase it on latest mpe's merge tree.

Changes in v3:
-------------
* Fix suggestion from Sandipan to remove `PPC_INST_DIVDE_DOT` and
  `PPC_INST_DIVDEU_DOT` opcode macros defined in ppc-opcode.h, reuse
  `PPC_INST_DIVDE` and `PPC_INST_DIVDEU` in test_emulate_step.c to
  derive them respectively.

Changes in v2:
-------------
* Fix review comments from Paul to make divde_dot and divdeu_dot simple
  by using divde and divdeu, then goto `arith_done` instead of
  `compute_done`.
* Include `Reviewed-by` tag from Sandipan Das.
* Rebase with recent mpe's merge tree.

I would appreciate your review and suggestions for making it better.

Boot Log:
--------
:: ::
:: ::
291494043: (291493996): [    0.352649][    T1] emulate_step_test: divde          : RA = LONG_MIN, RB = LONG_MIN                       PASS
291517665: (291517580): [    0.352695][    T1] emulate_step_test: divde          : RA = 1L, RB = 0                                    PASS
291541357: (291541234): [    0.352742][    T1] emulate_step_test: divde          : RA = LONG_MIN, RB = LONG_MAX                       PASS
291565107: (291564946): [    0.352788][    T1] emulate_step_test: divde.         : RA = LONG_MIN, RB = LONG_MIN                       PASS
291588757: (291588558): [    0.352834][    T1] emulate_step_test: divde.         : RA = 1L, RB = 0                                    PASS
291612477: (291612240): [    0.352881][    T1] emulate_step_test: divde.         : RA = LONG_MIN, RB = LONG_MAX                       PASS
291636201: (291635926): [    0.352927][    T1] emulate_step_test: divdeu         : RA = LONG_MIN, RB = LONG_MIN                       PASS
291659830: (291659517): [    0.352973][    T1] emulate_step_test: divdeu         : RA = 1L, RB = 0                                    PASS
291683529: (291683178): [    0.353019][    T1] emulate_step_test: divdeu         : RA = LONG_MIN, RB = LONG_MAX                       PASS
291707248: (291706859): [    0.353066][    T1] emulate_step_test: divdeu         : RA = LONG_MAX - 1, RB = LONG_MAX                   PASS
291730962: (291730535): [    0.353112][    T1] emulate_step_test: divdeu         : RA = LONG_MIN + 1, RB = LONG_MIN                   PASS
291754714: (291754249): [    0.353158][    T1] emulate_step_test: divdeu.        : RA = LONG_MIN, RB = LONG_MIN                       PASS
291778371: (291777868): [    0.353205][    T1] emulate_step_test: divdeu.        : RA = 1L, RB = 0                                    PASS
291802098: (291801557): [    0.353251][    T1] emulate_step_test: divdeu.        : RA = LONG_MIN, RB = LONG_MAX                       PASS
291825844: (291825265): [    0.353297][    T1] emulate_step_test: divdeu.        : RA = LONG_MAX - 1, RB = LONG_MAX                   PASS
291849586: (291848969): [    0.353344][    T1] emulate_step_test: divdeu.        : RA = LONG_MIN + 1, RB = LONG_MIN                   PASS
:: ::
:: ::
292520225: (292519608): [    0.354654][    T1] registered taskstats version 1
292584751: (292584134): [    0.354780][    T1] pstore: Using crash dump compression: deflate
296454422: (296453805): [    0.362338][    T1] Freeing unused kernel memory: 1408K
296467838: (296467221): [    0.362364][    T1] This architecture does not have kernel memory protection.
296485387: (296484770): [    0.362398][    T1] Run /init as init process
297987339: (297986761): [    0.365332][   T46] mount (46) used greatest stack depth: 12512 bytes left
298889548: (298888992): [    0.367094][   T47] mount (47) used greatest stack depth: 11824 bytes left

355356256: (355355821): Welcome to Buildroot
355376898: (355376463): buildroot login:

Balamuruhan S (3):
  powerpc ppc-opcode: add divde and divdeu opcodes
  powerpc sstep: add support for divde[.] and divdeu[.] instructions
  powerpc test_emulate_step: add testcases for divde[.] and divdeu[.]
    instructions

 arch/powerpc/include/asm/ppc-opcode.h |   6 +
 arch/powerpc/lib/sstep.c              |  13 ++-
 arch/powerpc/lib/test_emulate_step.c  | 156 ++++++++++++++++++++++++++
 3 files changed, 174 insertions(+), 1 deletion(-)


base-commit: 7a9912e4cf048b607c8fafcfbdca7566660f1d78
-- 
2.24.1


^ permalink raw reply

* [PATCH v4 1/3] powerpc ppc-opcode: add divde and divdeu opcodes
From: Balamuruhan S @ 2020-07-28 13:03 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200728130308.1790982-1-bala24@linux.ibm.com>

include instruction opcodes for divde and divdeu as macros.

Reviewed-by: Sandipan Das <sandipan@linux.ibm.com>
Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-opcode.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 4c0bdafb6a7b..a6e3700c4566 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -466,6 +466,10 @@
 #define PPC_RAW_MULI(d, a, i)		(0x1c000000 | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i))
 #define PPC_RAW_DIVWU(d, a, b)		(0x7c000396 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b))
 #define PPC_RAW_DIVDU(d, a, b)		(0x7c000392 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_DIVDE(t, a, b)		(0x7c000352 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_DIVDE_DOT(t, a, b)	(0x7c000352 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
+#define PPC_RAW_DIVDEU(t, a, b)		(0x7c000312 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_DIVDEU_DOT(t, a, b)	(0x7c000312 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
 #define PPC_RAW_AND(d, a, b)		(0x7c000038 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b))
 #define PPC_RAW_ANDI(d, a, i)		(0x70000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i))
 #define PPC_RAW_AND_DOT(d, a, b)	(0x7c000039 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b))
@@ -510,6 +514,8 @@
 #define PPC_DARN(t, l)		stringify_in_c(.long PPC_RAW_DARN(t, l))
 #define	PPC_DCBAL(a, b)		stringify_in_c(.long PPC_RAW_DCBAL(a, b))
 #define	PPC_DCBZL(a, b)		stringify_in_c(.long PPC_RAW_DCBZL(a, b))
+#define	PPC_DIVDE(t, a, b)	stringify_in_c(.long PPC_RAW_DIVDE(t, a, b))
+#define	PPC_DIVDEU(t, a, b)	stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b))
 #define PPC_LQARX(t, a, b, eh)	stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh))
 #define PPC_LDARX(t, a, b, eh)	stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh))
 #define PPC_LWARX(t, a, b, eh)	stringify_in_c(.long PPC_RAW_LWARX(t, a, b, eh))
-- 
2.24.1


^ permalink raw reply related

* [PATCH v4 2/3] powerpc sstep: add support for divde[.] and divdeu[.] instructions
From: Balamuruhan S @ 2020-07-28 13:03 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200728130308.1790982-1-bala24@linux.ibm.com>

This patch adds emulation support for divde, divdeu instructions,
	* Divide Doubleword Extended (divde[.])
	* Divide Doubleword Extended Unsigned (divdeu[.])

Reviewed-by: Sandipan Das <sandipan@linux.ibm.com>
Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/powerpc/lib/sstep.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index c58ea9e787cb..caee8cc77e19 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1806,7 +1806,18 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 			op->val = (int) regs->gpr[ra] /
 				(int) regs->gpr[rb];
 			goto arith_done;
-
+#ifdef __powerpc64__
+		case 425:	/* divde[.] */
+			asm volatile(PPC_DIVDE(%0, %1, %2) :
+				"=r" (op->val) : "r" (regs->gpr[ra]),
+				"r" (regs->gpr[rb]));
+			goto arith_done;
+		case 393:	/* divdeu[.] */
+			asm volatile(PPC_DIVDEU(%0, %1, %2) :
+				"=r" (op->val) : "r" (regs->gpr[ra]),
+				"r" (regs->gpr[rb]));
+			goto arith_done;
+#endif
 		case 755:	/* darn */
 			if (!cpu_has_feature(CPU_FTR_ARCH_300))
 				return -1;
-- 
2.24.1


^ permalink raw reply related

* [PATCH v4 3/3] powerpc test_emulate_step: add testcases for divde[.] and divdeu[.] instructions
From: Balamuruhan S @ 2020-07-28 13:03 UTC (permalink / raw)
  To: mpe
  Cc: ravi.bangoria, jniethe5, Balamuruhan S, paulus, sandipan,
	naveen.n.rao, linuxppc-dev
In-Reply-To: <20200728130308.1790982-1-bala24@linux.ibm.com>

Add testcases for the emulated divde, divde., divdeu and divdeu.
instructions to cover a few scenarios:
        * with same dividend and divisor to have undefined RT
          for divdeu[.]
        * with divide by zero to have undefined RT for both
          divde[.] and divdeu[.]
        * with negative dividend to cover -|divisor| < r <= 0 if
          the dividend is negative for divde[.]
        * normal case with proper dividend and divisor for both
          divde[.] and divdeu[.]

Reviewed-by: Sandipan Das <sandipan@linux.ibm.com>
Signed-off-by: Balamuruhan S <bala24@linux.ibm.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/powerpc/lib/test_emulate_step.c | 156 +++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)

diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c
index d242e9f72e0c..0a201b771477 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -1019,6 +1019,162 @@ static struct compute_test compute_tests[] = {
 			}
 		}
 	},
+	{
+		.mnemonic = "divde",
+		.subtests = {
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MIN,
+				}
+			},
+			{
+				.descr = "RA = 1L, RB = 0",
+				.instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = 1L,
+					.gpr[22] = 0,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDE(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MAX,
+				}
+			}
+		}
+	},
+	{
+		.mnemonic = "divde.",
+		.subtests = {
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MIN,
+				}
+			},
+			{
+				.descr = "RA = 1L, RB = 0",
+				.instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = 1L,
+					.gpr[22] = 0,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDE_DOT(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MAX,
+				}
+			}
+		}
+	},
+	{
+		.mnemonic = "divdeu",
+		.subtests = {
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MIN,
+				}
+			},
+			{
+				.descr = "RA = 1L, RB = 0",
+				.instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = 1L,
+					.gpr[22] = 0,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MAX,
+				}
+			},
+			{
+				.descr = "RA = LONG_MAX - 1, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MAX - 1,
+					.gpr[22] = LONG_MAX,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN + 1, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDEU(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = LONG_MIN + 1,
+					.gpr[22] = LONG_MIN,
+				}
+			}
+		}
+	},
+	{
+		.mnemonic = "divdeu.",
+		.subtests = {
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MIN,
+				}
+			},
+			{
+				.descr = "RA = 1L, RB = 0",
+				.instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = 1L,
+					.gpr[22] = 0,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MIN,
+					.gpr[22] = LONG_MAX,
+				}
+			},
+			{
+				.descr = "RA = LONG_MAX - 1, RB = LONG_MAX",
+				.instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)),
+				.regs = {
+					.gpr[21] = LONG_MAX - 1,
+					.gpr[22] = LONG_MAX,
+				}
+			},
+			{
+				.descr = "RA = LONG_MIN + 1, RB = LONG_MIN",
+				.instr = ppc_inst(PPC_RAW_DIVDEU_DOT(20, 21, 22)),
+				.flags = IGNORE_GPR(20),
+				.regs = {
+					.gpr[21] = LONG_MIN + 1,
+					.gpr[22] = LONG_MIN,
+				}
+			}
+		}
+	},
 	{
 		.mnemonic = "paddi",
 		.cpu_feature = CPU_FTR_ARCH_31,
-- 
2.24.1


^ permalink raw reply related

* Re: [PATCH v3 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states
From: Pratik Sampat @ 2020-07-28 13:30 UTC (permalink / raw)
  To: Rafael J. Wysocki
  Cc: Gautham R. Shenoy, pratik.r.sampat, Linux PM, Daniel Lezcano,
	Rafael J. Wysocki, linuxppc-dev, Nicholas Piggin, Paul Mackerras,
	linux-kselftest, Shuah Khan, srivatsa, Linux Kernel Mailing List
In-Reply-To: <CAJZ5v0j3ip77opkaW3Rtn0cqT7VTL_8goctFBDVehWoZowDY0Q@mail.gmail.com>

Hello Rafael,


On 27/07/20 7:12 pm, Rafael J. Wysocki wrote:
> On Tue, Jul 21, 2020 at 2:43 PM Pratik Rajesh Sampat
> <psampat@linux.ibm.com> wrote:
>> Fire directed smp_call_function_single IPIs from a specified source
>> CPU to the specified target CPU to reduce the noise we have to wade
>> through in the trace log.
> And what's the purpose of it?

The idea for this comes from the fact that estimating wake-up
latencies and residencies for stop states is not an easy task.

The purpose is essentially to determine the wakeup latencies caused
by either an IPI or a timer, and to compare them with the advertised
wakeup latencies for each stop state.

This might help in determining the accuracy of our advertised values
and/or if they need any re-calibration.

>> The module is based on the idea written by Srivatsa Bhat and maintained
>> by Vaidyanathan Srinivasan internally.
>>
>> Queue HR timer and measure jitter. Wakeup latency measurement for idle
>> states using hrtimer.  Echo a value in ns to timer_test_function and
>> watch trace. A HRtimer will be queued and when it fires the expected
>> wakeup vs actual wakeup is computes and delay printed in ns.
>>
>> Implemented as a module which utilizes debugfs so that it can be
>> integrated with selftests.
>>
>> To include the module, check option and include as module
>> kernel hacking -> Cpuidle latency selftests
>>
>> [srivatsa.bhat@linux.vnet.ibm.com: Initial implementation in
>>   cpidle/sysfs]
>>
>> [svaidy@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
>>   and fix some of the time calculation]
>>
>> [ego@linux.vnet.ibm.com: Fix some whitespace and tab errors and
>>   increase the resolution of IPI wakeup]
>>
>> Signed-off-by: Pratik Rajesh Sampat <psampat@linux.ibm.com>
>> Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
>> ---
>>   drivers/cpuidle/Makefile               |   1 +
>>   drivers/cpuidle/test-cpuidle_latency.c | 150 +++++++++++++++++++++++++
>>   lib/Kconfig.debug                      |  10 ++
>>   3 files changed, 161 insertions(+)
>>   create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
>>
>> diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
>> index f07800cbb43f..2ae05968078c 100644
>> --- a/drivers/cpuidle/Makefile
>> +++ b/drivers/cpuidle/Makefile
>> @@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
>>   obj-$(CONFIG_DT_IDLE_STATES)             += dt_idle_states.o
>>   obj-$(CONFIG_ARCH_HAS_CPU_RELAX)         += poll_state.o
>>   obj-$(CONFIG_HALTPOLL_CPUIDLE)           += cpuidle-haltpoll.o
>> +obj-$(CONFIG_IDLE_LATENCY_SELFTEST)      += test-cpuidle_latency.o
>>
>>   ##################################################################################
>>   # ARM SoC drivers
>> diff --git a/drivers/cpuidle/test-cpuidle_latency.c b/drivers/cpuidle/test-cpuidle_latency.c
>> new file mode 100644
>> index 000000000000..61574665e972
>> --- /dev/null
>> +++ b/drivers/cpuidle/test-cpuidle_latency.c
>> @@ -0,0 +1,150 @@
>> +// SPDX-License-Identifier: GPL-2.0-or-later
>> +/*
>> + * Module-based API test facility for cpuidle latency using IPIs and timers
> I'd like to see a more detailed description of what it does and how it
> works here.

Right, I'll add that.
Based on comments from Daniel I have also been working on a
user-space only variant of this test as that does seem like
a better way to go.

The only downside is that the measured latency will be higher, but as we
are taking baseline measurements, the difference between the baseline and
our observed readings should remain the same; the test will just take
longer to run. I have yet to confirm this accurately.

I would appreciate your thoughts on that.

>> + */
>> +
>> +#include <linux/debugfs.h>
>> +#include <linux/kernel.h>
>> +#include <linux/module.h>
>> +
>> +/* IPI based wakeup latencies */
>> +struct latency {
>> +       unsigned int src_cpu;
>> +       unsigned int dest_cpu;
>> +       ktime_t time_start;
>> +       ktime_t time_end;
>> +       u64 latency_ns;
>> +} ipi_wakeup;
>> +
>> +static void measure_latency(void *info)
>> +{
>> +       struct latency *v;
>> +       ktime_t time_diff;
>> +
>> +       v = (struct latency *)info;
>> +       v->time_end = ktime_get();
>> +       time_diff = ktime_sub(v->time_end, v->time_start);
>> +       v->latency_ns = ktime_to_ns(time_diff);
>> +}
>> +
>> +void run_smp_call_function_test(unsigned int cpu)
>> +{
>> +       ipi_wakeup.src_cpu = smp_processor_id();
>> +       ipi_wakeup.dest_cpu = cpu;
>> +       ipi_wakeup.time_start = ktime_get();
>> +       smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
>> +}
>> +
>> +/* Timer based wakeup latencies */
>> +struct timer_data {
>> +       unsigned int src_cpu;
>> +       u64 timeout;
>> +       ktime_t time_start;
>> +       ktime_t time_end;
>> +       struct hrtimer timer;
>> +       u64 timeout_diff_ns;
>> +} timer_wakeup;
>> +
>> +static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
>> +{
>> +       struct timer_data *w;
>> +       ktime_t time_diff;
>> +
>> +       w = container_of(hrtimer, struct timer_data, timer);
>> +       w->time_end = ktime_get();
>> +
>> +       time_diff = ktime_sub(w->time_end, w->time_start);
>> +       time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
>> +       w->timeout_diff_ns = ktime_to_ns(time_diff);
>> +       return HRTIMER_NORESTART;
>> +}
>> +
>> +static void run_timer_test(unsigned int ns)
>> +{
>> +       hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
>> +                    HRTIMER_MODE_REL);
>> +       timer_wakeup.timer.function = timer_called;
>> +       timer_wakeup.time_start = ktime_get();
>> +       timer_wakeup.src_cpu = smp_processor_id();
>> +       timer_wakeup.timeout = ns;
>> +
>> +       hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
>> +                     HRTIMER_MODE_REL_PINNED);
>> +}
>> +
>> +static struct dentry *dir;
>> +
>> +static int cpu_read_op(void *data, u64 *value)
>> +{
>> +       *value = ipi_wakeup.dest_cpu;
>> +       return 0;
>> +}
>> +
>> +static int cpu_write_op(void *data, u64 value)
>> +{
>> +       run_smp_call_function_test(value);
>> +       return 0;
>> +}
>> +DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
>> +
>> +static int timeout_read_op(void *data, u64 *value)
>> +{
>> +       *value = timer_wakeup.timeout;
>> +       return 0;
>> +}
>> +
>> +static int timeout_write_op(void *data, u64 value)
>> +{
>> +       run_timer_test(value);
>> +       return 0;
>> +}
>> +DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, "%llu\n");
>> +
>> +static int __init latency_init(void)
>> +{
>> +       struct dentry *temp;
>> +
>> +       dir = debugfs_create_dir("latency_test", 0);
>> +       if (!dir) {
>> +               pr_alert("latency_test: failed to create /sys/kernel/debug/latency_test\n");
>> +               return -1;
>> +       }
>> +       temp = debugfs_create_file("ipi_cpu_dest",
>> +                                  0666,
>> +                                  dir,
>> +                                  NULL,
>> +                                  &ipi_ops);
>> +       if (!temp) {
>> +               pr_alert("latency_test: failed to create /sys/kernel/debug/ipi_cpu_dest\n");
>> +               return -1;
>> +       }
>> +       debugfs_create_u64("ipi_latency_ns", 0444, dir, &ipi_wakeup.latency_ns);
>> +       debugfs_create_u32("ipi_cpu_src", 0444, dir, &ipi_wakeup.src_cpu);
>> +
>> +       temp = debugfs_create_file("timeout_expected_ns",
>> +                                  0666,
>> +                                  dir,
>> +                                  NULL,
>> +                                  &timeout_ops);
>> +       if (!temp) {
>> +               pr_alert("latency_test: failed to create /sys/kernel/debug/timeout_expected_ns\n");
>> +               return -1;
>> +       }
>> +       debugfs_create_u64("timeout_diff_ns", 0444, dir, &timer_wakeup.timeout_diff_ns);
>> +       debugfs_create_u32("timeout_cpu_src", 0444, dir, &timer_wakeup.src_cpu);
>> +       pr_info("Latency Test module loaded\n");
>> +       return 0;
>> +}
>> +
>> +static void __exit latency_cleanup(void)
>> +{
>> +       pr_info("Cleaning up Latency Test module.\n");
>> +       debugfs_remove_recursive(dir);
>> +}
>> +
>> +module_init(latency_init);
>> +module_exit(latency_cleanup);
>> +
>> +MODULE_LICENSE("GPL");
>> +MODULE_AUTHOR("IBM Corporation");
>> +MODULE_DESCRIPTION("Measuring idle latency for IPIs and Timers");
>> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
>> index d74ac0fd6b2d..e2283790245a 100644
>> --- a/lib/Kconfig.debug
>> +++ b/lib/Kconfig.debug
>> @@ -1375,6 +1375,16 @@ config DEBUG_KOBJECT
>>            If you say Y here, some extra kobject debugging messages will be sent
>>            to the syslog.
>>
>> +config IDLE_LATENCY_SELFTEST
>> +       tristate "Cpuidle latency selftests"
>> +       depends on CPU_IDLE
>> +       help
>> +         This option provides a kernel module that runs tests using the IPI and
>> +         timers to measure latency.
> What latency does it measure?

It measures latencies incurred on wakeup after an IPI and a timer interrupt.

>> +
>> +         Say M if you want these self tests to build as a module.
>> +         Say N if you are unsure.
>> +
>>   config DEBUG_KOBJECT_RELEASE
>>          bool "kobject release debugging"
>>          depends on DEBUG_OBJECTS_TIMERS
>> --
>> 2.25.4
>>
Thanks,
Pratik


^ permalink raw reply

* Re: [RESEND PATCH v5 06/11] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Michael Ellerman @ 2020-07-28 13:44 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Eric Biederman, Thiago Jung Bauermann, Dave Young, Vivek Goyal
In-Reply-To: <159579231812.5790.16096865978767385505.stgit@hbathini>

Hari Bathini <hbathini@linux.ibm.com> writes:
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index 2df6f4273ddd..8df085a22fd7 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -17,9 +17,21 @@
>  #include <linux/kexec.h>
>  #include <linux/of_fdt.h>
>  #include <linux/libfdt.h>
> +#include <linux/of_device.h>
>  #include <linux/memblock.h>
> +#include <linux/slab.h>
> +#include <asm/drmem.h>
>  #include <asm/kexec_ranges.h>
>  
> +struct umem_info {
> +	uint64_t *buf; /* data buffer for usable-memory property */
> +	uint32_t idx;  /* current index */
> +	uint32_t size; /* size allocated for the data buffer */

Use kernel types please, u64, u32.

> +	/* usable memory ranges to look up */
> +	const struct crash_mem *umrngs;

"umrngs".

Given it's part of the umem_info struct could it just be "ranges"?

> +};
> +
>  const struct kexec_file_ops * const kexec_file_loaders[] = {
>  	&kexec_elf64_ops,
>  	NULL
> @@ -74,6 +86,42 @@ static int get_exclude_memory_ranges(struct crash_mem **mem_ranges)
>  	return ret;
>  }
>  
> +/**
> + * get_usable_memory_ranges - Get usable memory ranges. This list includes
> + *                            regions like crashkernel, opal/rtas & tce-table,
> + *                            that kdump kernel could use.
> + * @mem_ranges:               Range list to add the memory ranges to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int get_usable_memory_ranges(struct crash_mem **mem_ranges)
> +{
> +	int ret;
> +
> +	/*
> +	 * prom code doesn't take kindly to missing low memory. So, add

I don't know what that's referring to, "prom code" is too vague.

> +	 * [0, crashk_res.end] instead of [crashk_res.start, crashk_res.end]
> +	 * to keep it happy.
> +	 */
> +	ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_rtas_mem_range(mem_ranges);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_opal_mem_range(mem_ranges);
> +	if (ret)
> +		goto out;
> +
> +	ret = add_tce_mem_ranges(mem_ranges);
> +out:
> +	if (ret)
> +		pr_err("Failed to setup usable memory ranges\n");
> +	return ret;
> +}
> +
>  /**
>   * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
>   *                              in the memory regions between buf_min & buf_max
> @@ -273,6 +321,382 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf,
>  	return ret;
>  }
>  
> +/**
> + * check_realloc_usable_mem - Reallocate buffer if it can't accommodate entries
> + * @um_info:                  Usable memory buffer and ranges info.
> + * @cnt:                      No. of entries to accommodate.
> + *
> + * Frees up the old buffer if memory reallocation fails.
> + *
> + * Returns buffer on success, NULL on error.
> + */
> +static uint64_t *check_realloc_usable_mem(struct umem_info *um_info, int cnt)
> +{
> +	void *tbuf;
> +
> +	if (um_info->size >=
> +	    ((um_info->idx + cnt) * sizeof(*(um_info->buf))))
> +		return um_info->buf;

This is awkward.

AFAICS you only use um_info->size here, so instead why not store the
number of u64s you have space for, as num for example.

Then the above comparison becomes:

	if (um_info->num >= (um_info->idx + count))

Then you only have to calculate the size internally here for the
realloc.

> +
> +	um_info->size += MEM_RANGE_CHUNK_SZ;

	new_size = um_info->size + MEM_RANGE_CHUNK_SZ;
	tbuf = krealloc(um_info->buf, new_size, GFP_KERNEL);

> +	tbuf = krealloc(um_info->buf, um_info->size, GFP_KERNEL);
> +	if (!tbuf) {
> +		um_info->size -= MEM_RANGE_CHUNK_SZ;

Then you can drop this.

> +		return NULL;
> +	}

	um_info->size = new_size;

> +
> +	memset(tbuf + um_info->idx, 0, MEM_RANGE_CHUNK_SZ);

Just pass __GFP_ZERO to krealloc?

> +	return tbuf;
> +}
> +
> +/**
> + * add_usable_mem - Add the usable memory ranges within the given memory range
> + *                  to the buffer
> + * @um_info:        Usable memory buffer and ranges info.
> + * @base:           Base address of memory range to look for.
> + * @end:            End address of memory range to look for.
> + * @cnt:            No. of usable memory ranges added to buffer.

One caller never uses this AFAICS.

Couldn't the other caller just compare um_info->idx before and after
the call, and avoid another pass-by-reference parameter?

> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int add_usable_mem(struct umem_info *um_info, uint64_t base,
> +			  uint64_t end, int *cnt)
> +{
> +	uint64_t loc_base, loc_end, *buf;
> +	const struct crash_mem *umrngs;
> +	int i, add;

add should be bool.

> +	*cnt = 0;
> +	umrngs = um_info->umrngs;
> +	for (i = 0; i < umrngs->nr_ranges; i++) {
> +		add = 0;
> +		loc_base = umrngs->ranges[i].start;
> +		loc_end = umrngs->ranges[i].end;
> +		if (loc_base >= base && loc_end <= end)
> +			add = 1;
> +		else if (base < loc_end && end > loc_base) {
> +			if (loc_base < base)
> +				loc_base = base;
> +			if (loc_end > end)
> +				loc_end = end;
> +			add = 1;
> +		}
> +
> +		if (add) {
> +			buf = check_realloc_usable_mem(um_info, 2);
> +			if (!buf)
> +				return -ENOMEM;
> +
> +			um_info->buf = buf;
> +			buf[um_info->idx++] = cpu_to_be64(loc_base);
> +			buf[um_info->idx++] =
> +					cpu_to_be64(loc_end - loc_base + 1);
> +			(*cnt)++;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * kdump_setup_usable_lmb - This is a callback function that gets called by
> + *                          walk_drmem_lmbs for every LMB to set its
> + *                          usable memory ranges.
> + * @lmb:                    LMB info.
> + * @usm:                    linux,drconf-usable-memory property value.
> + * @data:                   Pointer to usable memory buffer and ranges info.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int kdump_setup_usable_lmb(struct drmem_lmb *lmb, const __be32 **usm,
> +				  void *data)
> +{
> +	struct umem_info *um_info;
> +	uint64_t base, end, *buf;
> +	int cnt, tmp_idx, ret;
> +
> +	/*
> +	 * kdump load isn't supported on kernels already booted with
> +	 * linux,drconf-usable-memory property.
> +	 */
> +	if (*usm) {
> +		pr_err("linux,drconf-usable-memory property already exists!");
> +		return -EINVAL;
> +	}
> +
> +	um_info = data;
> +	tmp_idx = um_info->idx;
> +	buf = check_realloc_usable_mem(um_info, 1);
> +	if (!buf)
> +		return -ENOMEM;
> +
> +	um_info->idx++;
> +	um_info->buf = buf;
> +	base = lmb->base_addr;
> +	end = base + drmem_lmb_size() - 1;
> +	ret = add_usable_mem(um_info, base, end, &cnt);
> +	if (!ret)
> +		um_info->buf[tmp_idx] = cpu_to_be64(cnt);
> +
> +	return ret;
> +}
> +
> +/**
> + * get_node_path_size - Get the full path length of the given node.
> + * @dn:                 Device Node.
> + *
> + * Also, counts '\0' at the end of the path.
> + * For example, /memory@0 will be "/memory@0\0" => 10 bytes.
> + *
> + * Returns the string size of the node's full path.
> + */
> +static int get_node_path_size(struct device_node *dn)
> +{
> +	int len = 0;
> +
> +	if (!dn)
> +		return 0;
> +
> +	/* Root node */
> +	if (!(dn->parent))
> +		return 2;
> +
> +	while (dn) {
> +		len += strlen(dn->full_name) + 1;
> +		dn = dn->parent;
> +	}
> +
> +	return len;
> +}
> +
> +/**
> + * get_node_path - Get the full path of the given node.
> + * @node:          Device node.
> + *
> + * Allocates buffer for node path. The caller must free the buffer
> + * after use.
> + *
> + * Returns buffer with path on success, NULL otherwise.
> + */
> +static char *get_node_path(struct device_node *node)
> +{


As discussed this can probably be replaced with snprintf(buf, "%pOF") ?


cheers

^ permalink raw reply

* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Michael Ellerman @ 2020-07-28 13:46 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Eric Biederman, Thiago Jung Bauermann, Dave Young, Vivek Goyal
In-Reply-To: <159579233676.5790.10701756666641782647.stgit@hbathini>

Hari Bathini <hbathini@linux.ibm.com> writes:
> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
> to be filled with OPAL base & entry addresses respectively. Setting
> these registers allows the kernel to perform OPAL calls before the
> device tree is parsed.

I'm not convinced we want to do this.

If we do, it becomes part of the kexec ABI and we have to honour it into
the future.

And in practice there are no non-development kernels built with OPAL early
debugging enabled, so it's not clear it actually helps anyone other than
developers.

cheers

> v4 -> v5:
> * New patch. Updated opal_base & opal_entry values in r8 & r9 respectively.
>   This change was part of the below dropped patch in v4:
>     - https://lore.kernel.org/patchwork/patch/1275667/
>
>
>  arch/powerpc/kexec/file_load_64.c      |   16 ++++++++++++++++
>  arch/powerpc/purgatory/trampoline_64.S |   15 +++++++++++++++
>  2 files changed, 31 insertions(+)
>
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index 8df085a22fd7..a5c1442590b2 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -713,6 +713,8 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
>  			  const void *fdt, unsigned long kernel_load_addr,
>  			  unsigned long fdt_load_addr)
>  {
> +	struct device_node *dn = NULL;
> +	uint64_t val;
>  	int ret;
>  
>  	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
> @@ -735,9 +737,23 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
>  			goto out;
>  	}
>  
> +	/* Setup OPAL base & entry values */
> +	dn = of_find_node_by_path("/ibm,opal");
> +	if (dn) {
> +		of_property_read_u64(dn, "opal-base-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_base", &val,
> +						     sizeof(val), false);
> +		if (ret)
> +			goto out;
> +
> +		of_property_read_u64(dn, "opal-entry-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_entry", &val,
> +						     sizeof(val), false);
> +	}
>  out:
>  	if (ret)
>  		pr_err("Failed to setup purgatory symbols");
> +	of_node_put(dn);
>  	return ret;
>  }
>  
> diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
> index a5a83c3f53e6..464af8e8a4cb 100644
> --- a/arch/powerpc/purgatory/trampoline_64.S
> +++ b/arch/powerpc/purgatory/trampoline_64.S
> @@ -61,6 +61,10 @@ master:
>  	li	%r4,28
>  	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
>  1:
> +	/* Load opal base and entry values in r8 & r9 respectively */
> +	ld	%r8,(opal_base - 0b)(%r18)
> +	ld	%r9,(opal_entry - 0b)(%r18)
> +
>  	/* load the kernel address */
>  	ld	%r4,(kernel - 0b)(%r18)
>  
> @@ -102,6 +106,17 @@ dt_offset:
>  	.8byte  0x0
>  	.size dt_offset, . - dt_offset
>  
> +	.balign 8
> +	.globl opal_base
> +opal_base:
> +	.8byte  0x0
> +	.size opal_base, . - opal_base
> +
> +	.balign 8
> +	.globl opal_entry
> +opal_entry:
> +	.8byte  0x0
> +	.size opal_entry, . - opal_entry
>  
>  	.data
>  	.balign 8

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox