LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/3] cpuidle-pseries: Set the latency-hint before entering CEDE
From: Gautham R. Shenoy @ 2020-07-29  6:47 UTC (permalink / raw)
  To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
	Michael Neuling, Vaidyanathan Srinivasan
  Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
In-Reply-To: <1596005254-25753-1-git-send-email-ego@linux.vnet.ibm.com>

From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

As per the PAPR, each H_CEDE call is associated with a latency-hint to
be passed in the VPA field "cede_latency_hint". The CEDE states that
we were implicitly entering so far is CEDE with latency-hint = 0.

This patch explicitly sets the latency hint corresponding to the CEDE
state that we are currently entering. While at it, we save the
previous hint, to be restored once we wakeup from CEDE. This will be
required in the future when we expose extended-cede states through the
cpuidle framework, where each of them will have a different
cede-latency hint.

Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 drivers/cpuidle/cpuidle-pseries.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 3e058ad2..88e71c3 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -86,19 +86,27 @@ static void check_and_cede_processor(void)
 	}
 }
 
+#define NR_CEDE_STATES		1  /* CEDE with latency-hint 0 */
+#define NR_DEDICATED_STATES	(NR_CEDE_STATES + 1) /* Includes snooze */
+
+u8 cede_latency_hint[NR_DEDICATED_STATES];
 static int dedicated_cede_loop(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv,
 				int index)
 {
+	u8 old_latency_hint;
 
 	pseries_idle_prolog();
 	get_lppaca()->donate_dedicated_cpu = 1;
+	old_latency_hint = get_lppaca()->cede_latency_hint;
+	get_lppaca()->cede_latency_hint = cede_latency_hint[index];
 
 	HMT_medium();
 	check_and_cede_processor();
 
 	local_irq_disable();
 	get_lppaca()->donate_dedicated_cpu = 0;
+	get_lppaca()->cede_latency_hint = old_latency_hint;
 
 	pseries_idle_epilog();
 
@@ -130,7 +138,7 @@ static int shared_cede_loop(struct cpuidle_device *dev,
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state dedicated_states[] = {
+static struct cpuidle_state dedicated_states[NR_DEDICATED_STATES] = {
 	{ /* Snooze */
 		.name = "snooze",
 		.desc = "snooze",
-- 
1.9.4


^ permalink raw reply related

* [PATCH v2 2/3] cpuidle-pseries: Add function to parse extended CEDE records
From: Gautham R. Shenoy @ 2020-07-29  6:47 UTC (permalink / raw)
  To: Nicholas Piggin, Anton Blanchard, Nathan Lynch, Michael Ellerman,
	Michael Neuling, Vaidyanathan Srinivasan
  Cc: linuxppc-dev, Gautham R. Shenoy, linux-kernel, linux-pm
In-Reply-To: <1596005254-25753-1-git-send-email-ego@linux.vnet.ibm.com>

From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>

Currently we use CEDE with latency-hint 0 as the only other idle state
on a dedicated LPAR apart from the polling "snooze" state.

The platform might support additional extended CEDE idle states, which
can be discovered through the "ibm,get-system-parameter" rtas-call
made with CEDE_LATENCY_TOKEN.

This patch adds a function to obtain information about the extended
CEDE idle states from the platform and parse the contents to populate
an array of extended CEDE states. These idle states thus discovered
will be added to the cpuidle framework in the next patch.

dmesg on a POWER9 LPAR, demonstrating the output of parsing the
extended CEDE latency parameters.

[    5.913180] xcede : xcede_record_size = 10
[    5.913183] xcede : Record 0 : hint = 1, latency =0x400 tb-ticks, Wake-on-irq = 1
[    5.913188] xcede : Record 1 : hint = 2, latency =0x3e8000 tb-ticks, Wake-on-irq = 0
[    5.913193] cpuidle : Skipping the 2 Extended CEDE idle states

Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
---
 drivers/cpuidle/cpuidle-pseries.c | 129 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 127 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 88e71c3..b1dc24d 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -21,6 +21,7 @@
 #include <asm/runlatch.h>
 #include <asm/idle.h>
 #include <asm/plpar_wrappers.h>
+#include <asm/rtas.h>
 
 static struct cpuidle_driver pseries_idle_driver = {
 	.name             = "pseries_idle",
@@ -86,9 +87,120 @@ static void check_and_cede_processor(void)
 	}
 }
 
-#define NR_CEDE_STATES		1  /* CEDE with latency-hint 0 */
+struct xcede_latency_records {
+	u8  latency_hint;
+	u64 wakeup_latency_tb_ticks;
+	u8  responsive_to_irqs;
+};
+
+/*
+ * XCEDE : Extended CEDE states discovered through the
+ *         "ibm,get-systems-parameter" rtas-call with the token
+ *         CEDE_LATENCY_TOKEN
+ */
+#define MAX_XCEDE_STATES		4
+#define	XCEDE_LATENCY_RECORD_SIZE	10
+#define XCEDE_LATENCY_PARAM_MAX_LENGTH	(2 + 2 + \
+					(MAX_XCEDE_STATES * XCEDE_LATENCY_RECORD_SIZE))
+
+#define CEDE_LATENCY_TOKEN		45
+
+#define NR_CEDE_STATES		(MAX_XCEDE_STATES + 1) /* CEDE with latency-hint 0 */
 #define NR_DEDICATED_STATES	(NR_CEDE_STATES + 1) /* Includes snooze */
 
+struct xcede_latency_records xcede_records[MAX_XCEDE_STATES];
+unsigned int nr_xcede_records;
+char xcede_parameters[XCEDE_LATENCY_PARAM_MAX_LENGTH];
+
+static int parse_cede_parameters(void)
+{
+	int ret = -1, i;
+	u16 payload_length;
+	u8 xcede_record_size;
+	u32 total_xcede_records_size;
+	char *payload;
+
+	memset(xcede_parameters, 0, XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+	ret = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+			NULL, CEDE_LATENCY_TOKEN, __pa(xcede_parameters),
+			XCEDE_LATENCY_PARAM_MAX_LENGTH);
+
+	if (ret) {
+		pr_err("xcede: Error parsing CEDE_LATENCY_TOKEN\n");
+		return ret;
+	}
+
+	payload_length = be16_to_cpu(*(__be16 *)(&xcede_parameters[0]));
+	payload = &xcede_parameters[2];
+
+	/*
+	 * If the platform supports the cede latency settings
+	 * information system parameter it must provide the following
+	 * information in the NULL terminated parameter string:
+	 *
+	 * a. The first byte is the length “N” of each cede
+	 *    latency setting record minus one (zero indicates a length
+	 *    of 1 byte).
+	 *
+	 * b. For each supported cede latency setting a cede latency
+	 *    setting record consisting of the first “N” bytes as per
+	 *    the following table.
+	 *
+	 *	-----------------------------
+	 *	| Field           | Field  |
+	 *	| Name            | Length |
+	 *	-----------------------------
+	 *	| Cede Latency    | 1 Byte |
+	 *	| Specifier Value |        |
+	 *	-----------------------------
+	 *	| Maximum wakeup  |        |
+	 *	| latency in      | 8 Bytes|
+	 *	| tb-ticks        |        |
+	 *	-----------------------------
+	 *	| Responsive to   |        |
+	 *	| external        | 1 Byte |
+	 *	| interrupts      |        |
+	 *	-----------------------------
+	 *
+	 * This version has cede latency record size = 10.
+	 */
+	xcede_record_size = (u8)payload[0] + 1;
+
+	if (xcede_record_size != XCEDE_LATENCY_RECORD_SIZE) {
+		pr_err("xcede : Expected record-size %d. Observed size %d.\n",
+		       XCEDE_LATENCY_RECORD_SIZE, xcede_record_size);
+		return -EINVAL;
+	}
+
+	pr_info("xcede : xcede_record_size = %d\n", xcede_record_size);
+
+	/*
+	 * Since the payload_length includes the last NULL byte and
+	 * the xcede_record_size, the remaining bytes correspond to
+	 * array of all cede_latency settings.
+	 */
+	total_xcede_records_size = payload_length - 2;
+	nr_xcede_records = total_xcede_records_size / xcede_record_size;
+
+	payload++;
+	for (i = 0; i < nr_xcede_records; i++) {
+		struct xcede_latency_records *record = &xcede_records[i];
+
+		record->latency_hint = (u8)payload[0];
+		record->wakeup_latency_tb_ticks  =
+			be64_to_cpu(*(__be64 *)(&payload[1]));
+		record->responsive_to_irqs = (u8)payload[9];
+		payload += xcede_record_size;
+		pr_info("xcede : Record %d : hint = %u, latency =0x%llx tb-ticks, Wake-on-irq = %u\n",
+			i, record->latency_hint,
+			record->wakeup_latency_tb_ticks,
+			record->responsive_to_irqs);
+	}
+
+	return 0;
+}
+
 u8 cede_latency_hint[NR_DEDICATED_STATES];
 static int dedicated_cede_loop(struct cpuidle_device *dev,
 				struct cpuidle_driver *drv,
@@ -219,6 +331,19 @@ static int pseries_cpuidle_driver_init(void)
 	return 0;
 }
 
+static int add_pseries_idle_states(void)
+{
+	int nr_states = 2; /* By default we have snooze, CEDE */
+
+	if (parse_cede_parameters())
+		return nr_states;
+
+	pr_info("cpuidle : Skipping the %d Extended CEDE idle states\n",
+		nr_xcede_records);
+
+	return nr_states;
+}
+
 /*
  * pseries_idle_probe()
  * Choose state table for shared versus dedicated partition
@@ -241,7 +366,7 @@ static int pseries_idle_probe(void)
 			max_idle_state = ARRAY_SIZE(shared_states);
 		} else {
 			cpuidle_state_table = dedicated_states;
-			max_idle_state = ARRAY_SIZE(dedicated_states);
+			max_idle_state = add_pseries_idle_states();
 		}
 	} else
 		return -ENODEV;
-- 
1.9.4


^ permalink raw reply related

* Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain
From: Srikar Dronamraju @ 2020-07-29  6:13 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Oliver O'Halloran, Jordan Niethe,
	linuxppc-dev, Ingo Molnar
In-Reply-To: <jhjr1sviswg.mognet@arm.com>

* Valentin Schneider <valentin.schneider@arm.com> [2020-07-28 16:03:11]:

Hi Valentin,

Thanks for looking into the patches.

> On 27/07/20 06:32, Srikar Dronamraju wrote:
> > Add percpu coregroup maps and masks to create coregroup domain.
> > If a coregroup doesn't exist, the coregroup domain will be degenerated
> > in favour of SMT/CACHE domain.
> >
> 
> So there's at least one arm64 platform out there with the same "pairs of
> cores share L2" thing (Ampere eMAG), and that lives quite happily with the
> default scheduler topology (SMT/MC/DIE). Each pair of core gets its MC
> domain, and the whole system is covered by DIE.
> 
> Now arguably it's not a perfect representation; DIE doesn't have
> SD_SHARE_PKG_RESOURCES so the highest level sd_llc can point to is MC. That
> will impact all callsites using cpus_share_cache(): in the eMAG case, only
> pairs of cores will be seen as sharing cache, even though *all* cores share
> the same L3.
> 

Okay, Its good to know that we have a chip which is similar to P9 in
topology.

> I'm trying to paint a picture of what the P9 topology looks like (the one
> you showcase in your cover letter) to see if there are any similarities;
> from what I gather in [1], wikichips and your cover letter, with P9 you can
> have something like this in a single DIE (somewhat unsure about L3 setup;
> it looks to be distributed?)
> 
>      +---------------------------------------------------------------------+
>      |                                  L3                                 |
>      +---------------+-+---------------+-+---------------+-+---------------+
>      |       L2      | |       L2      | |       L2      | |       L2      |
>      +------+-+------+ +------+-+------+ +------+-+------+ +------+-+------+
>      |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  |
>      +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+
>      |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs|
>      +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+
> 
> Which would lead to (ignoring the whole SMT CPU numbering shenanigans)
> 
> NUMA     [                                                   ...
> DIE      [                                             ]
> MC       [         ] [         ] [         ] [         ]
> BIGCORE  [         ] [         ] [         ] [         ]
> SMT      [   ] [   ] [   ] [   ] [   ] [   ] [   ] [   ]
>          00-03 04-07 08-11 12-15 16-19 20-23 24-27 28-31  <other node here>
> 

What you have summed up is perfectly what a P9 topology looks like. I dont
think I could have explained it better than this.

> This however has MC == BIGCORE; what makes it you can have different spans
> for these two domains? If it's not too much to ask, I'd love to have a P9
> topology diagram.
> 
> [1]: 20200722081822.GG9290@linux.vnet.ibm.com

At this time the current topology would be good enough i.e BIGCORE would
always be equal to a MC. However in future we could have chips that can have
lesser/larger number of CPUs in llc than in a BIGCORE or we could have
granular or split L3 caches within a DIE. In such a case BIGCORE != MC.

Also in the current P9 itself, two neighbouring core-pairs form a quad.
Cache latency within a quad is better than a latency to a distant core-pair.
Cache latency within a core pair is way better than latency within a quad.
So if we have only 4 threads running on a DIE all of them accessing the same
cache-lines, then we could probably benefit if all the tasks were to run
within the quad aka MC/Coregroup.

I have found some benchmarks which are latency sensitive to benefit by
having a grouping a quad level (using kernel hacks and not backed by
firmware changes). Gautham also found similar results in his experiments
but he only used binding within the stock kernel.

I am not setting SD_SHARE_PKG_RESOURCES in MC/Coregroup sd_flags as in MC
domain need not be LLC domain for Power.

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

* Re: [PATCH] powerpc/64s/hash: Fix hash_preload running with interrupts enabled
From: Athira Rajeev @ 2020-07-29  4:18 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: Aneesh Kumar K . V, linuxppc-dev, Nicholas Piggin
In-Reply-To: <87h7ts79iv.fsf@mpe.ellerman.id.au>



> On 28-Jul-2020, at 6:14 AM, Michael Ellerman <mpe@ellerman.id.au> wrote:
> 
> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>> On 27-Jul-2020, at 6:05 PM, Michael Ellerman <mpe@ellerman.id.au> wrote:
>>> 
>>> Athira Rajeev <atrajeev@linux.vnet.ibm.com> writes:
>>>>> On 27-Jul-2020, at 11:39 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>>>>> 
>>>>> Commit 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>>>> caller") removed the local_irq_disable from hash_preload, but it was
>>>>> required for more than just the page table walk: the hash pte busy bit is
>>>>> effectively a lock which may be taken in interrupt context, and the local
>>>>> update flag test must not be preempted before it's used.
>>>>> 
>>>>> This solves apparent lockups with perf interrupting __hash_page_64K. If
>>>>> get_perf_callchain then also takes a hash fault on the same page while it
>>>>> is already locked, it will loop forever taking hash faults, which looks like
>>>>> this:
>>>>> 
>>>>> cpu 0x49e: Vector: 100 (System Reset) at [c00000001a4f7d70]
>>>>>  pc: c000000000072dc8: hash_page_mm+0x8/0x800
>>>>>  lr: c00000000000c5a4: do_hash_page+0x24/0x38
>>>>>  sp: c0002ac1cc69ac70
>>>>> msr: 8000000000081033
>>>>> current = 0xc0002ac1cc602e00
>>>>> paca    = 0xc00000001de1f280   irqmask: 0x03   irq_happened: 0x01
>>>>>  pid   = 20118, comm = pread2_processe
>>>>> Linux version 5.8.0-rc6-00345-g1fad14f18bc6
>>>>> 49e:mon> t
>>>>> [c0002ac1cc69ac70] c00000000000c5a4 do_hash_page+0x24/0x38 (unreliable)
>>>>> --- Exception: 300 (Data Access) at c00000000008fa60 __copy_tofrom_user_power7+0x20c/0x7ac
>>>>> [link register   ] c000000000335d10 copy_from_user_nofault+0xf0/0x150
>>>>> [c0002ac1cc69af70] c00032bf9fa3c880 (unreliable)
>>>>> [c0002ac1cc69afa0] c000000000109df0 read_user_stack_64+0x70/0xf0
>>>>> [c0002ac1cc69afd0] c000000000109fcc perf_callchain_user_64+0x15c/0x410
>>>>> [c0002ac1cc69b060] c000000000109c00 perf_callchain_user+0x20/0x40
>>>>> [c0002ac1cc69b080] c00000000031c6cc get_perf_callchain+0x25c/0x360
>>>>> [c0002ac1cc69b120] c000000000316b50 perf_callchain+0x70/0xa0
>>>>> [c0002ac1cc69b140] c000000000316ddc perf_prepare_sample+0x25c/0x790
>>>>> [c0002ac1cc69b1a0] c000000000317350 perf_event_output_forward+0x40/0xb0
>>>>> [c0002ac1cc69b220] c000000000306138 __perf_event_overflow+0x88/0x1a0
>>>>> [c0002ac1cc69b270] c00000000010cf70 record_and_restart+0x230/0x750
>>>>> [c0002ac1cc69b620] c00000000010d69c perf_event_interrupt+0x20c/0x510
>>>>> [c0002ac1cc69b730] c000000000027d9c performance_monitor_exception+0x4c/0x60
>>>>> [c0002ac1cc69b750] c00000000000b2f8 performance_monitor_common_virt+0x1b8/0x1c0
>>>>> --- Exception: f00 (Performance Monitor) at c0000000000cb5b0 pSeries_lpar_hpte_insert+0x0/0x160
>>>>> [link register   ] c0000000000846f0 __hash_page_64K+0x210/0x540
>>>>> [c0002ac1cc69ba50] 0000000000000000 (unreliable)
>>>>> [c0002ac1cc69bb00] c000000000073ae0 update_mmu_cache+0x390/0x3a0
>>>>> [c0002ac1cc69bb70] c00000000037f024 wp_page_copy+0x364/0xce0
>>>>> [c0002ac1cc69bc20] c00000000038272c do_wp_page+0xdc/0xa60
>>>>> [c0002ac1cc69bc70] c0000000003857bc handle_mm_fault+0xb9c/0x1b60
>>>>> [c0002ac1cc69bd50] c00000000006c434 __do_page_fault+0x314/0xc90
>>>>> [c0002ac1cc69be20] c00000000000c5c8 handle_page_fault+0x10/0x2c
>>>>> --- Exception: 300 (Data Access) at 00007fff8c861fe8
>>>>> SP (7ffff6b19660) is in userspace
>>>>> 
>>>>> Reported-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>>>> Reported-by: Anton Blanchard <anton@ozlabs.org>
>>>>> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>> Fixes: 2f92447f9f96 ("powerpc/book3s64/hash: Use the pte_t address from the
>>>>> caller")
>>>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>>> 
>>>> 
>>>> Hi,
>>>> 
>>>> Tested with the patch and it fixes the lockups I was seeing with my test run.
>>>> Thanks for the fix.
>>>> 
>>>> Tested-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
>>> 
>>> Thanks for testing.
>>> 
>>> What test are you running?
>> 
>> Hi Michael
>> 
>> I was running  “perf record”  and Unixbench tests ( https://github.com/kdlucas/byte-unixbench ) in parallel where we were getting soft lockups
>> 
>> 1. Perf command run:
>> # perf record -a -g -c 10000000 -o <data_file> sleep 60
>> 
>> 2. Unixbench tests
>> # Run -q -c <nr_threads> spawn
> 
> Thanks, I can reproduce it with that.

Sure Michael


> 
> cheers


^ permalink raw reply

* [PATCH] powerpc: Fix MMCRA_BHRB_DISABLE define to work with binutils version < 2.28
From: Athira Rajeev @ 2020-07-29  4:16 UTC (permalink / raw)
  To: mpe; +Cc: maddy, linuxppc-dev

commit 9908c826d5ed ("powerpc/perf: Add Power10 PMU feature to
DT CPU features") defines MMCRA_BHRB_DISABLE as `0x2000000000UL`.
Binutils version less than 2.28 doesn't support UL suffix.

linux-ppc/arch/powerpc/kernel/cpu_setup_power.S: Assembler messages:
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: junk at end of line, first unrecognized character is `L'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: found 'L', expected: ')'
linux-ppc/arch/powerpc/kernel/cpu_setup_power.S:250: Error: operand out of range (0x0000002000000000 is not between 0xffffffffffff8000 and 0x000000000000ffff)

Fix this by wrapping it around `_UL` macro.

Fixes: 9908c826d5ed ("Add Power10 PMU feature to DT CPU features")
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/reg.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ae71027..41419f1 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -12,6 +12,7 @@
 #ifdef __KERNEL__
 
 #include <linux/stringify.h>
+#include <linux/const.h>
 #include <asm/cputable.h>
 #include <asm/asm-const.h>
 #include <asm/feature-fixups.h>
@@ -888,7 +889,7 @@
 #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */
 #define   MMCRA_SLOT_SHIFT	24
 #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
-#define   MMCRA_BHRB_DISABLE  0x2000000000UL // BHRB disable bit for ISA v3.1
+#define   MMCRA_BHRB_DISABLE  _UL(0x2000000000) // BHRB disable bit for ISA v3.1
 #define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */
 #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
 #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH] powerpc/configs: Add BLK_DEV_NVME to pseries_defconfig
From: Anton Blanchard @ 2020-07-29  4:08 UTC (permalink / raw)
  To: benh, paulus, mpe; +Cc: linuxppc-dev

I've forgotten to manual enable NVME when building pseries kernels
for machines with NVME adapters. Since it's a reasonably common
configuration, enable it by default.

Signed-off-by: Anton Blanchard <anton@ozlabs.org>
---
 arch/powerpc/configs/pseries_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index dfa4a726333b..358642d6f46d 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -94,6 +94,7 @@ CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=65536
 CONFIG_VIRTIO_BLK=m
+CONFIG_BLK_DEV_NVME=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
 CONFIG_BLK_DEV_SR=y
-- 
2.26.2


^ permalink raw reply related

* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Michael Ellerman @ 2020-07-29  1:15 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <23baef6a-6ddc-572a-82c5-21a7fa441485@linux.ibm.com>

Hari Bathini <hbathini@linux.ibm.com> writes:
> On 28/07/20 7:16 pm, Michael Ellerman wrote:
>> Hari Bathini <hbathini@linux.ibm.com> writes:
>>> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
>>> to be filled with OPAL base & entry addresses respectively. Setting
>>> these registers allows the kernel to perform OPAL calls before the
>>> device tree is parsed.
>> 
>> I'm not convinced we want to do this.
>> 
>> If we do it becomes part of the kexec ABI and we have to honour it into
>> the future.
>> 
>> And in practice there are no non-development kernels built with OPAL early
>> debugging enabled, so it's not clear it actually helps anyone other than
>> developers.
>> 
>
> Hmmm.. kexec-tools does it since commit d58ad564852c ("kexec/ppc64
> Enable early kernel's OPAL calls") for kexec_load syscall. So, we would
> be breaking kexec ABI either way, I guess.

Ugh, OK.

> Let me put this patch at the end of the series in the respin to let you
> decide whether to have it or not..

Thanks.

cheers

^ permalink raw reply

* Re: [PATCH 13/15] arch, drivers: replace for_each_membock() with for_each_mem_range()
From: Emil Renner Berthing @ 2020-07-28 15:02 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-kernel, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Thomas Gleixner, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Christoph Hellwig,
	Catalin Marinas, uclinux-h8-devel, linux-xtensa, openrisc,
	Borislav Petkov, Andy Lutomirski, Paul Walmsley, Stafford Horne,
	linux-arm-kernel, Michal Simek, linux-mm, linux-mips, iommu,
	Palmer Dabbelt, Andrew Morton, linuxppc-dev
In-Reply-To: <20200728051153.1590-14-rppt@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 44043 bytes --]

On Tue, 28 Jul 2020, 07:16 Mike Rapoport, <rppt@kernel.org> wrote:

> From: Mike Rapoport <rppt@linux.ibm.com>
>
> There are several occurrences of the following pattern:
>
>         for_each_memblock(memory, reg) {
>                 start = __pfn_to_phys(memblock_region_memory_base_pfn(reg);
>                 end = __pfn_to_phys(memblock_region_memory_end_pfn(reg));
>
>                 /* do something with start and end */
>         }
>
> Using for_each_mem_range() iterator is more appropriate in such cases and
> allows simpler and cleaner code.
>
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/arm/kernel/setup.c                  | 18 +++++++----
>  arch/arm/mm/mmu.c                        | 39 ++++++++----------------
>  arch/arm/mm/pmsa-v7.c                    | 20 ++++++------
>  arch/arm/mm/pmsa-v8.c                    | 17 +++++------
>  arch/arm/xen/mm.c                        |  7 +++--
>  arch/arm64/mm/kasan_init.c               |  8 ++---
>  arch/arm64/mm/mmu.c                      | 11 ++-----
>  arch/c6x/kernel/setup.c                  |  9 +++---
>  arch/microblaze/mm/init.c                |  9 +++---
>  arch/mips/cavium-octeon/dma-octeon.c     | 12 ++++----
>  arch/mips/kernel/setup.c                 | 31 +++++++++----------
>  arch/openrisc/mm/init.c                  |  8 +++--
>  arch/powerpc/kernel/fadump.c             | 27 +++++++---------
>  arch/powerpc/mm/book3s64/hash_utils.c    | 16 +++++-----
>  arch/powerpc/mm/book3s64/radix_pgtable.c | 11 +++----
>  arch/powerpc/mm/kasan/kasan_init_32.c    |  8 ++---
>  arch/powerpc/mm/mem.c                    | 16 ++++++----
>  arch/powerpc/mm/pgtable_32.c             |  8 ++---
>  arch/riscv/mm/init.c                     | 24 ++++++---------
>  arch/riscv/mm/kasan_init.c               | 10 +++---
>  arch/s390/kernel/setup.c                 | 27 ++++++++++------
>  arch/s390/mm/vmem.c                      | 16 +++++-----
>  arch/sparc/mm/init_64.c                  | 12 +++-----
>  drivers/bus/mvebu-mbus.c                 | 12 ++++----
>  drivers/s390/char/zcore.c                |  9 +++---
>  25 files changed, 187 insertions(+), 198 deletions(-)
>
> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
> index d8e18cdd96d3..3f65d0ac9f63 100644
> --- a/arch/arm/kernel/setup.c
> +++ b/arch/arm/kernel/setup.c
> @@ -843,19 +843,25 @@ early_param("mem", early_mem);
>
>  static void __init request_standard_resources(const struct machine_desc
> *mdesc)
>  {
> -       struct memblock_region *region;
> +       phys_addr_t start, end, res_end;
>         struct resource *res;
> +       u64 i;
>
>         kernel_code.start   = virt_to_phys(_text);
>         kernel_code.end     = virt_to_phys(__init_begin - 1);
>         kernel_data.start   = virt_to_phys(_sdata);
>         kernel_data.end     = virt_to_phys(_end - 1);
>
> -       for_each_memblock(memory, region) {
> -               phys_addr_t start =
> __pfn_to_phys(memblock_region_memory_base_pfn(region));
> -               phys_addr_t end =
> __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
> +       for_each_mem_range(i, &start, &end) {
>                 unsigned long boot_alias_start;
>
> +               /*
> +                * In memblock, end points to the first byte after the
> +                * range while in resourses, end points to the last byte in
> +                * the range.
> +                */
> +               res_end = end - 1;
> +
>                 /*
>                  * Some systems have a special memory alias which is only
>                  * used for booting.  We need to advertise this region to
> @@ -869,7 +875,7 @@ static void __init request_standard_resources(const
> struct machine_desc *mdesc)
>                                       __func__, sizeof(*res));
>                         res->name = "System RAM (boot alias)";
>                         res->start = boot_alias_start;
> -                       res->end = phys_to_idmap(end);
> +                       res->end = phys_to_idmap(res_end);
>                         res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
>                         request_resource(&iomem_resource, res);
>                 }
> @@ -880,7 +886,7 @@ static void __init request_standard_resources(const
> struct machine_desc *mdesc)
>                               sizeof(*res));
>                 res->name  = "System RAM";
>                 res->start = start;
> -               res->end = end;
> +               res->end = res_end;
>                 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>
>                 request_resource(&iomem_resource, res);
> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index 628028bfbb92..a149d9cb4fdb 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1155,9 +1155,8 @@ phys_addr_t arm_lowmem_limit __initdata = 0;
>
>  void __init adjust_lowmem_bounds(void)
>  {
> -       phys_addr_t memblock_limit = 0;
> -       u64 vmalloc_limit;
> -       struct memblock_region *reg;
> +       phys_addr_t block_start, block_end, memblock_limit = 0;
> +       u64 vmalloc_limit, i;
>         phys_addr_t lowmem_limit = 0;
>
>         /*
> @@ -1173,26 +1172,18 @@ void __init adjust_lowmem_bounds(void)
>          * The first usable region must be PMD aligned. Mark its start
>          * as MEMBLOCK_NOMAP if it isn't
>          */
> -       for_each_memblock(memory, reg) {
> -               if (!memblock_is_nomap(reg)) {
> -                       if (!IS_ALIGNED(reg->base, PMD_SIZE)) {
> -                               phys_addr_t len;
> +       for_each_mem_range(i, &block_start, &block_end) {
> +               if (!IS_ALIGNED(block_start, PMD_SIZE)) {
> +                       phys_addr_t len;
>
> -                               len = round_up(reg->base, PMD_SIZE) -
> reg->base;
> -                               memblock_mark_nomap(reg->base, len);
> -                       }
> -                       break;
> +                       len = round_up(block_start, PMD_SIZE) -
> block_start;
> +                       memblock_mark_nomap(block_start, len);
>                 }
> +               break;
>         }
>
> -       for_each_memblock(memory, reg) {
> -               phys_addr_t block_start = reg->base;
> -               phys_addr_t block_end = reg->base + reg->size;
> -
> -               if (memblock_is_nomap(reg))
> -                       continue;
> -
> -               if (reg->base < vmalloc_limit) {
> +       for_each_mem_range(i, &block_start, &block_end) {
> +               if (block_start < vmalloc_limit) {
>                         if (block_end > lowmem_limit)
>                                 /*
>                                  * Compare as u64 to ensure vmalloc_limit
> does
> @@ -1441,19 +1432,15 @@ static void __init kmap_init(void)
>
>  static void __init map_lowmem(void)
>  {
> -       struct memblock_region *reg;
>         phys_addr_t kernel_x_start = round_down(__pa(KERNEL_START),
> SECTION_SIZE);
>         phys_addr_t kernel_x_end = round_up(__pa(__init_end),
> SECTION_SIZE);
> +       phys_addr_t start, end;
> +       u64 i;
>
>         /* Map all the lowmem memory banks. */
> -       for_each_memblock(memory, reg) {
> -               phys_addr_t start = reg->base;
> -               phys_addr_t end = start + reg->size;
> +       for_each_mem_range(i, &start, &end) {
>                 struct map_desc map;
>
> -               if (memblock_is_nomap(reg))
> -                       continue;
> -
>                 if (end > arm_lowmem_limit)
>                         end = arm_lowmem_limit;
>                 if (start >= end)
> diff --git a/arch/arm/mm/pmsa-v7.c b/arch/arm/mm/pmsa-v7.c
> index 699fa2e88725..44b7644a4237 100644
> --- a/arch/arm/mm/pmsa-v7.c
> +++ b/arch/arm/mm/pmsa-v7.c
> @@ -231,10 +231,9 @@ static int __init allocate_region(phys_addr_t base,
> phys_addr_t size,
>  void __init pmsav7_adjust_lowmem_bounds(void)
>  {
>         phys_addr_t  specified_mem_size = 0, total_mem_size = 0;
> -       struct memblock_region *reg;
> -       bool first = true;
>         phys_addr_t mem_start;
>         phys_addr_t mem_end;
> +       phys_addr_t reg_start, reg_end;
>         unsigned int mem_max_regions;
>         int num, i;
>
> @@ -262,20 +261,19 @@ void __init pmsav7_adjust_lowmem_bounds(void)
>         mem_max_regions -= num;
>  #endif
>
> -       for_each_memblock(memory, reg) {
> -               if (first) {
> +       for_each_mem_range(i, &reg_start, &reg_end) {
> +               if (i == 0) {
>                         phys_addr_t phys_offset = PHYS_OFFSET;
>
>                         /*
>                          * Initially only use memory continuous from
>                          * PHYS_OFFSET */
> -                       if (reg->base != phys_offset)
> +                       if (reg_start != phys_offset)
>                                 panic("First memory bank must be
> contiguous from PHYS_OFFSET");
>
> -                       mem_start = reg->base;
> -                       mem_end = reg->base + reg->size;
> -                       specified_mem_size = reg->size;
> -                       first = false;
> +                       mem_start = reg_start;
> +                       mem_end = reg_end
> +                       specified_mem_size = mem_end - mem_start;
>                 } else {
>                         /*
>                          * memblock auto merges contiguous blocks, remove
> @@ -283,8 +281,8 @@ void __init pmsav7_adjust_lowmem_bounds(void)
>                          * blocks separately while iterating)
>                          */
>                         pr_notice("Ignoring RAM after %pa, memory at %pa
> ignored\n",
> -                                 &mem_end, &reg->base);
> -                       memblock_remove(reg->base, 0 - reg->base);
> +                                 &mem_end, &reg_start);
> +                       memblock_remove(reg_start, 0 - reg_start);
>                         break;
>                 }
>         }
> diff --git a/arch/arm/mm/pmsa-v8.c b/arch/arm/mm/pmsa-v8.c
> index 0d7d5fb59247..b39e74b48437 100644
> --- a/arch/arm/mm/pmsa-v8.c
> +++ b/arch/arm/mm/pmsa-v8.c
> @@ -94,20 +94,19 @@ static __init bool is_region_fixed(int number)
>  void __init pmsav8_adjust_lowmem_bounds(void)
>  {
>         phys_addr_t mem_end;
> -       struct memblock_region *reg;
> -       bool first = true;
> +       phys_addr_t reg_start, reg_end;
> +       int i;
>
> -       for_each_memblock(memory, reg) {
> -               if (first) {
> +       for_each_mem_range(i, &reg_start, &reg_end) {
> +               if (i == 0) {
>                         phys_addr_t phys_offset = PHYS_OFFSET;
>
>                         /*
>                          * Initially only use memory continuous from
>                          * PHYS_OFFSET */
> -                       if (reg->base != phys_offset)
> +                       if (reg_start != phys_offset)
>                                 panic("First memory bank must be
> contiguous from PHYS_OFFSET");
> -                       mem_end = reg->base + reg->size;
> -                       first = false;
> +                       mem_end = reg_end;
>                 } else {
>                         /*
>                          * memblock auto merges contiguous blocks, remove
> @@ -115,8 +114,8 @@ void __init pmsav8_adjust_lowmem_bounds(void)
>                          * blocks separately while iterating)
>                          */
>                         pr_notice("Ignoring RAM after %pa, memory at %pa
> ignored\n",
> -                                 &mem_end, &reg->base);
> -                       memblock_remove(reg->base, 0 - reg->base);
> +                                 &mem_end, &reg_start);
> +                       memblock_remove(reg_start, 0 - reg_start);
>                         break;
>                 }
>         }
> diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
> index d40e9e5fc52b..05f24ff41e36 100644
> --- a/arch/arm/xen/mm.c
> +++ b/arch/arm/xen/mm.c
> @@ -24,11 +24,12 @@
>
>  unsigned long xen_get_swiotlb_free_pages(unsigned int order)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t base;
>         gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM;
> +       u64 i;
>
> -       for_each_memblock(memory, reg) {
> -               if (reg->base < (phys_addr_t)0xffffffff) {
> +       for_each_mem_range(i, &base, NULL) {
> +               if (base < (phys_addr_t)0xffffffff) {
>                         if (IS_ENABLED(CONFIG_ZONE_DMA32))
>                                 flags |= __GFP_DMA32;
>                         else
> diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
> index 7291b26ce788..1faa086f9193 100644
> --- a/arch/arm64/mm/kasan_init.c
> +++ b/arch/arm64/mm/kasan_init.c
> @@ -212,7 +212,7 @@ void __init kasan_init(void)
>  {
>         u64 kimg_shadow_start, kimg_shadow_end;
>         u64 mod_shadow_start, mod_shadow_end;
> -       struct memblock_region *reg;
> +       phys_addr_t _start, _end;
>         int i;
>
>         kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
> @@ -246,9 +246,9 @@ void __init kasan_init(void)
>                 kasan_populate_early_shadow((void *)mod_shadow_end,
>                                             (void *)kimg_shadow_start);
>
> -       for_each_memblock(memory, reg) {
> -               void *start = (void *)__phys_to_virt(reg->base);
> -               void *end = (void *)__phys_to_virt(reg->base + reg->size);
> +       for_each_mem_range(i, &start, &end) {
> +               void *_start = (void *)__phys_to_virt(_start);
> +               void *end = (void *)__phys_to_virt(_end);
>
>                 if (start >= end)
>                         break;
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 1df25f26571d..327264fb83fb 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -461,8 +461,9 @@ static void __init map_mem(pgd_t *pgdp)
>  {
>         phys_addr_t kernel_start = __pa_symbol(_text);
>         phys_addr_t kernel_end = __pa_symbol(__init_begin);
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         int flags = 0;
> +       u64 i;
>
>         if (rodata_full || debug_pagealloc_enabled())
>                 flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> @@ -481,15 +482,9 @@ static void __init map_mem(pgd_t *pgdp)
>  #endif
>
>         /* map all the memory banks */
> -       for_each_memblock(memory, reg) {
> -               phys_addr_t start = reg->base;
> -               phys_addr_t end = start + reg->size;
> -
> +       for_each_mem_range(i, &start, &end) {
>                 if (start >= end)
>                         break;
> -               if (memblock_is_nomap(reg))
> -                       continue;
> -
>                 __map_memblock(pgdp, start, end, PAGE_KERNEL, flags);
>         }
>
> diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c
> index 8ef35131f999..9254c3b794a5 100644
> --- a/arch/c6x/kernel/setup.c
> +++ b/arch/c6x/kernel/setup.c
> @@ -287,7 +287,8 @@ notrace void __init machine_init(unsigned long dt_ptr)
>
>  void __init setup_arch(char **cmdline_p)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         printk(KERN_INFO "Initializing kernel\n");
>
> @@ -351,9 +352,9 @@ void __init setup_arch(char **cmdline_p)
>         disable_caching(ram_start, ram_end - 1);
>
>         /* Set caching of external RAM used by Linux */
> -       for_each_memblock(memory, reg)
> -               enable_caching(CACHE_REGION_START(reg->base),
> -                              CACHE_REGION_START(reg->base + reg->size -
> 1));
> +       for_each_mem_range(i, &start, &end)
> +               enable_caching(CACHE_REGION_START(start),
> +                              CACHE_REGION_START(end - 1));
>
>  #ifdef CONFIG_BLK_DEV_INITRD
>         /*
> diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
> index 49e0c241f9b1..15403b5adfcf 100644
> --- a/arch/microblaze/mm/init.c
> +++ b/arch/microblaze/mm/init.c
> @@ -106,13 +106,14 @@ static void __init paging_init(void)
>  void __init setup_memory(void)
>  {
>  #ifndef CONFIG_MMU
> -       struct memblock_region *reg;
>         u32 kernel_align_start, kernel_align_size;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         /* Find main memory where is the kernel */
> -       for_each_memblock(memory, reg) {
> -               memory_start = (u32)reg->base;
> -               lowmem_size = reg->size;
> +       for_each_mem_range(i, &start, &end) {
> +               memory_start = start;
> +               lowmem_size = end - start;
>                 if ((memory_start <= (u32)_text) &&
>                         ((u32)_text <= (memory_start + lowmem_size - 1))) {
>                         memory_size = lowmem_size;
> diff --git a/arch/mips/cavium-octeon/dma-octeon.c
> b/arch/mips/cavium-octeon/dma-octeon.c
> index 14ea680d180e..d938c1f7c1e1 100644
> --- a/arch/mips/cavium-octeon/dma-octeon.c
> +++ b/arch/mips/cavium-octeon/dma-octeon.c
> @@ -190,25 +190,25 @@ char *octeon_swiotlb;
>
>  void __init plat_swiotlb_setup(void)
>  {
> -       struct memblock_region *mem;
> +       phys_addr_t start, end;
>         phys_addr_t max_addr;
>         phys_addr_t addr_size;
>         size_t swiotlbsize;
>         unsigned long swiotlb_nslabs;
> +       u64 i;
>
>         max_addr = 0;
>         addr_size = 0;
>
> -       for_each_memblock(memory, mem) {
> +       for_each_mem_range(i, &start, &end) {
>                 /* These addresses map low for PCI. */
>                 if (mem->base > 0x410000000ull && !OCTEON_IS_OCTEON2())
>                         continue;
>
> -               addr_size += mem->size;
> -
> -               if (max_addr < mem->base + mem->size)
> -                       max_addr = mem->base + mem->size;
> +               addr_size += (end - start);
>
> +               if (max_addr < end)
> +                       max_addr = end;
>         }
>
>         swiotlbsize = PAGE_SIZE;
> diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
> index 7b537fa2035d..eaac1b66026d 100644
> --- a/arch/mips/kernel/setup.c
> +++ b/arch/mips/kernel/setup.c
> @@ -300,8 +300,9 @@ static void __init bootmem_init(void)
>
>  static void __init bootmem_init(void)
>  {
> -       struct memblock_region *mem;
>         phys_addr_t ramstart, ramend;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         ramstart = memblock_start_of_DRAM();
>         ramend = memblock_end_of_DRAM();
> @@ -338,18 +339,13 @@ static void __init bootmem_init(void)
>
>         min_low_pfn = ARCH_PFN_OFFSET;
>         max_pfn = PFN_DOWN(ramend);
> -       for_each_memblock(memory, mem) {
> -               unsigned long start = memblock_region_memory_base_pfn(mem);
> -               unsigned long end = memblock_region_memory_end_pfn(mem);
> -
> +       for_each_mem_range(i, &start, &end) {
>                 /*
>                  * Skip highmem here so we get an accurate max_low_pfn if
> low
>                  * memory stops short of high memory.
>                  * If the region overlaps HIGHMEM_START, end is clipped so
>                  * max_pfn excludes the highmem portion.
>                  */
> -               if (memblock_is_nomap(mem))
> -                       continue;
>                 if (start >= PFN_DOWN(HIGHMEM_START))
>                         continue;
>                 if (end > PFN_DOWN(HIGHMEM_START))
> @@ -458,13 +454,12 @@ early_param("memmap", early_parse_memmap);
>  unsigned long setup_elfcorehdr, setup_elfcorehdr_size;
>  static int __init early_parse_elfcorehdr(char *p)
>  {
> -       struct memblock_region *mem;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         setup_elfcorehdr = memparse(p, &p);
>
> -        for_each_memblock(memory, mem) {
> -               unsigned long start = mem->base;
> -               unsigned long end = start + mem->size;
> +       for_each_mem_range(i, &start, &end) {
>                 if (setup_elfcorehdr >= start && setup_elfcorehdr < end) {
>                         /*
>                          * Reserve from the elf core header to the end of
> @@ -728,7 +723,8 @@ static void __init arch_mem_init(char **cmdline_p)
>
>  static void __init resource_init(void)
>  {
> -       struct memblock_region *region;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         if (UNCAC_BASE != IO_BASE)
>                 return;
> @@ -740,9 +736,7 @@ static void __init resource_init(void)
>         bss_resource.start = __pa_symbol(&__bss_start);
>         bss_resource.end = __pa_symbol(&__bss_stop) - 1;
>
> -       for_each_memblock(memory, region) {
> -               phys_addr_t start =
> PFN_PHYS(memblock_region_memory_base_pfn(region));
> -               phys_addr_t end =
> PFN_PHYS(memblock_region_memory_end_pfn(region)) - 1;
> +       for_each_mem_range(i, &start, &end) {
>                 struct resource *res;
>
>                 res = memblock_alloc(sizeof(struct resource),
> SMP_CACHE_BYTES);
> @@ -751,7 +745,12 @@ static void __init resource_init(void)
>                               sizeof(struct resource));
>
>                 res->start = start;
> -               res->end = end;
> +               /*
> +                * In memblock, end points to the first byte after the
> +                * range while in resourses, end points to the last byte in
> +                * the range.
> +                */
> +               res->end = end - 1;
>                 res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>                 res->name = "System RAM";
>
> diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
> index 3d7c79c7745d..8348feaaf46e 100644
> --- a/arch/openrisc/mm/init.c
> +++ b/arch/openrisc/mm/init.c
> @@ -64,6 +64,7 @@ extern const char _s_kernel_ro[], _e_kernel_ro[];
>   */
>  static void __init map_ram(void)
>  {
> +       phys_addr_t start, end;
>         unsigned long v, p, e;
>         pgprot_t prot;
>         pgd_t *pge;
> @@ -71,6 +72,7 @@ static void __init map_ram(void)
>         pud_t *pue;
>         pmd_t *pme;
>         pte_t *pte;
> +       u64 i;
>         /* These mark extents of read-only kernel pages...
>          * ...from vmlinux.lds.S
>          */
> @@ -78,9 +80,9 @@ static void __init map_ram(void)
>
>         v = PAGE_OFFSET;
>
> -       for_each_memblock(memory, region) {
> -               p = (u32) region->base & PAGE_MASK;
> -               e = p + (u32) region->size;
> +       for_each_mem_range(i, &start, &end) {
> +               p = (u32) start & PAGE_MASK;
> +               e = (u32) end;
>
>                 v = (u32) __va(p);
>                 pge = pgd_offset_k(v);
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index fdbafe417139..435b98d069eb 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -180,13 +180,13 @@ int is_fadump_active(void)
>   */
>  static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t reg_start, reg_end;
>         bool ret = false;
> -       u64 start, end;
> +       u64 i, start, end;
>
> -       for_each_memblock(memory, reg) {
> -               start = max_t(u64, d_start, reg->base);
> -               end = min_t(u64, d_end, (reg->base + reg->size));
> +       for_each_mem_range(i, &reg_start, &reg_end) {
> +               start = max_t(u64, d_start, reg_start);
> +               end = min_t(u64, d_end, reg_end));
>                 if (d_start < end) {
>                         /* Memory hole from d_start to start */
>                         if (start > d_start)
> @@ -413,7 +413,7 @@ static int __init fadump_get_boot_mem_regions(void)
>  {
>         unsigned long base, size, cur_size, hole_size, last_end;
>         unsigned long mem_size = fw_dump.boot_memory_size;
> -       struct memblock_region *reg;
> +       phys_addr_t reg_start, reg_end;
>         int ret = 1;
>
>         fw_dump.boot_mem_regs_cnt = 0;
> @@ -421,9 +421,8 @@ static int __init fadump_get_boot_mem_regions(void)
>         last_end = 0;
>         hole_size = 0;
>         cur_size = 0;
> -       for_each_memblock(memory, reg) {
> -               base = reg->base;
> -               size = reg->size;
> +       for_each_mem_range(i, &reg_start, &reg_end) {
> +               size = reg_end - reg_start;
>                 hole_size += (base - last_end);
>
>                 if ((cur_size + size) >= mem_size) {
> @@ -959,9 +958,8 @@ static int fadump_init_elfcore_header(char *bufp)
>   */
>  static int fadump_setup_crash_memory_ranges(void)
>  {
> -       struct memblock_region *reg;
> -       u64 start, end;
> -       int i, ret;
> +       u64 i, start, end;
> +       int ret;
>
>         pr_debug("Setup crash memory ranges.\n");
>         crash_mrange_info.mem_range_cnt = 0;
> @@ -979,10 +977,7 @@ static int fadump_setup_crash_memory_ranges(void)
>                         return ret;
>         }
>
> -       for_each_memblock(memory, reg) {
> -               start = (u64)reg->base;
> -               end = start + (u64)reg->size;
> -
> +       for_each_mem_range(i, &start, end) {
>

I don't know anything about this code, but from pure pattern matching it
looks like you missed a & here.

                /*
>                  * skip the memory chunk that is already added
>                  * (0 through boot_memory_top).
> diff --git a/arch/powerpc/mm/book3s64/hash_utils.c
> b/arch/powerpc/mm/book3s64/hash_utils.c
> index 468169e33c86..9ba76b075b11 100644
> --- a/arch/powerpc/mm/book3s64/hash_utils.c
> +++ b/arch/powerpc/mm/book3s64/hash_utils.c
> @@ -7,7 +7,7 @@
>   *
>   * SMP scalability work:
>   *    Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
> - *
> + *
>   *    Module name: htab.c
>   *
>   *    Description:
> @@ -862,8 +862,8 @@ static void __init htab_initialize(void)
>         unsigned long table;
>         unsigned long pteg_count;
>         unsigned long prot;
> -       unsigned long base = 0, size = 0;
> -       struct memblock_region *reg;
> +       phys_addr_t base = 0, size = 0, end;
> +       u64 i;
>
>         DBG(" -> htab_initialize()\n");
>
> @@ -879,7 +879,7 @@ static void __init htab_initialize(void)
>         /*
>          * Calculate the required size of the htab.  We want the number of
>          * PTEGs to equal one half the number of real pages.
> -        */
> +        */
>         htab_size_bytes = htab_get_table_size();
>         pteg_count = htab_size_bytes >> 7;
>
> @@ -889,7 +889,7 @@ static void __init htab_initialize(void)
>             firmware_has_feature(FW_FEATURE_PS3_LV1)) {
>                 /* Using a hypervisor which owns the htab */
>                 htab_address = NULL;
> -               _SDR1 = 0;
> +               _SDR1 = 0;
>  #ifdef CONFIG_FA_DUMP
>                 /*
>                  * If firmware assisted dump is active firmware preserves
> @@ -955,9 +955,9 @@ static void __init htab_initialize(void)
>  #endif /* CONFIG_DEBUG_PAGEALLOC */
>
>         /* create bolted the linear mapping in the hash table */
> -       for_each_memblock(memory, reg) {
> -               base = (unsigned long)__va(reg->base);
> -               size = reg->size;
> +       for_each_mem_range(i, &base, &end) {
> +               size = end - base;
> +               base = (unsigned long)__va(base);
>
>                 DBG("creating mapping for region: %lx..%lx (prot: %lx)\n",
>                     base, size, prot);
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index bb00e0cba119..65657b920847 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -318,28 +318,27 @@ static int __meminit
> create_physical_mapping(unsigned long start,
>  static void __init radix_init_pgtable(void)
>  {
>         unsigned long rts_field;
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         /* We don't support slb for radix */
>         mmu_slb_size = 0;
>         /*
>          * Create the linear mapping, using standard page size for now
>          */
> -       for_each_memblock(memory, reg) {
> +       for_each_mem_range(i, &start, &end) {
>                 /*
>                  * The memblock allocator  is up at this point, so the
>                  * page tables will be allocated within the range. No
>                  * need or a node (which we don't have yet).
>                  */
>
> -               if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
> +               if (end >= RADIX_VMALLOC_START) {
>                         pr_warn("Outside the supported range\n");
>                         continue;
>                 }
>
> -               WARN_ON(create_physical_mapping(reg->base,
> -                                               reg->base + reg->size,
> -                                               -1, PAGE_KERNEL));
> +               WARN_ON(create_physical_mapping(start, end, -1,
> PAGE_KERNEL));
>         }
>
>         /* Find out how many PID bits are supported */
> diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c
> b/arch/powerpc/mm/kasan/kasan_init_32.c
> index 0760e1e754e4..6e73434e4e41 100644
> --- a/arch/powerpc/mm/kasan/kasan_init_32.c
> +++ b/arch/powerpc/mm/kasan/kasan_init_32.c
> @@ -120,11 +120,11 @@ static void __init
> kasan_unmap_early_shadow_vmalloc(void)
>  static void __init kasan_mmu_init(void)
>  {
>         int ret;
> -       struct memblock_region *reg;
> +       phys_addr_t base, end;
> +       u64 i;
>
> -       for_each_memblock(memory, reg) {
> -               phys_addr_t base = reg->base;
> -               phys_addr_t top = min(base + reg->size, total_lowmem);
> +       for_each_mem_range(i, &base, &end) {
> +               phys_addr_t top = min(end, total_lowmem);
>
>                 if (base >= top)
>                         continue;
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 38d1acd7c8ef..0248b6d58fcd 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -593,20 +593,24 @@ void flush_icache_user_page(struct vm_area_struct
> *vma, struct page *page,
>   */
>  static int __init add_system_ram_resources(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
> +       u64 i;
>
> -       for_each_memblock(memory, reg) {
> +       for_each_mem_range(i, &start, &end) {
>                 struct resource *res;
> -               unsigned long base = reg->base;
> -               unsigned long size = reg->size;
>
>                 res = kzalloc(sizeof(struct resource), GFP_KERNEL);
>                 WARN_ON(!res);
>
>                 if (res) {
>                         res->name = "System RAM";
> -                       res->start = base;
> -                       res->end = base + size - 1;
> +                       res->start = start;
> +                       /*
> +                        * In memblock, end points to the first byte after
> +                        * the range while in resourses, end points to the
> +                        * last byte in the range.
> +                        */
> +                       res->end = end - 1;
>                         res->flags = IORESOURCE_SYSTEM_RAM |
> IORESOURCE_BUSY;
>                         WARN_ON(request_resource(&iomem_resource, res) <
> 0);
>                 }
> diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
> index 6eb4eab79385..079159e97bca 100644
> --- a/arch/powerpc/mm/pgtable_32.c
> +++ b/arch/powerpc/mm/pgtable_32.c
> @@ -123,11 +123,11 @@ static void __init __mapin_ram_chunk(unsigned long
> offset, unsigned long top)
>
>  void __init mapin_ram(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t base, end;
> +       u64 i;
>
> -       for_each_memblock(memory, reg) {
> -               phys_addr_t base = reg->base;
> -               phys_addr_t top = min(base + reg->size, total_lowmem);
> +       for_each_mem_range(i, &base, &end) {
> +               phys_addr_t top = min(end, total_lowmem);
>
>                 if (base >= top)
>                         continue;
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 7440ba2cdaaa..2abe1165fe56 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -145,21 +145,22 @@ static phys_addr_t dtb_early_pa __initdata;
>
>  void __init setup_bootmem(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         phys_addr_t mem_size = 0;
>         phys_addr_t total_mem = 0;
>         phys_addr_t mem_start, end = 0;
>         phys_addr_t vmlinux_end = __pa_symbol(&_end);
>         phys_addr_t vmlinux_start = __pa_symbol(&_start);
> +       u64 i;
>
>         /* Find the memory region containing the kernel */
> -       for_each_memblock(memory, reg) {
> -               end = reg->base + reg->size;
> +       for_each_mem_range(i, &start, &end) {
> +               phys_addr_t size = end - start;
>                 if (!total_mem)
> -                       mem_start = reg->base;
> -               if (reg->base <= vmlinux_start && vmlinux_end <= end)
> -                       BUG_ON(reg->size == 0);
> -               total_mem = total_mem + reg->size;
> +                       mem_start = start;
> +               if (start <= vmlinux_start && vmlinux_end <= end)
> +                       BUG_ON(size == 0);
> +               total_mem = total_mem + size;
>         }
>
>         /*
> @@ -456,7 +457,7 @@ static void __init setup_vm_final(void)
>  {
>         uintptr_t va, map_size;
>         phys_addr_t pa, start, end;
> -       struct memblock_region *reg;
> +       u64 i;
>
>         /* Set mmu_enabled flag */
>         mmu_enabled = true;
> @@ -467,14 +468,9 @@ static void __init setup_vm_final(void)
>                            PGDIR_SIZE, PAGE_TABLE);
>
>         /* Map all memory banks */
> -       for_each_memblock(memory, reg) {
> -               start = reg->base;
> -               end = start + reg->size;
> -
> +       for_each_mem_range(i, &start, &end) {
>                 if (start >= end)
>                         break;
> -               if (memblock_is_nomap(reg))
> -                       continue;
>                 if (start <= __pa(PAGE_OFFSET) &&
>                     __pa(PAGE_OFFSET) < end)
>                         start = __pa(PAGE_OFFSET);
> diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
> index 87b4ab3d3c77..12ddd1f6bf70 100644
> --- a/arch/riscv/mm/kasan_init.c
> +++ b/arch/riscv/mm/kasan_init.c
> @@ -85,16 +85,16 @@ static void __init populate(void *start, void *end)
>
>  void __init kasan_init(void)
>  {
> -       struct memblock_region *reg;
> -       unsigned long i;
> +       phys_addr_t _start, _end;
> +       u64 i;
>
>         kasan_populate_early_shadow((void *)KASAN_SHADOW_START,
>                                     (void *)kasan_mem_to_shadow((void *)
>
> VMALLOC_END));
>
> -       for_each_memblock(memory, reg) {
> -               void *start = (void *)__va(reg->base);
> -               void *end = (void *)__va(reg->base + reg->size);
> +       for_each_mem_range(i, &_start, &_end) {
> +               void *start = (void *)_start;
> +               void *end = (void *)_end;
>
>                 if (start >= end)
>                         break;
> diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
> index 8b284cf6e199..b6c4a0c5ff86 100644
> --- a/arch/s390/kernel/setup.c
> +++ b/arch/s390/kernel/setup.c
> @@ -198,7 +198,7 @@ static void __init conmode_default(void)
>                 cpcmd("QUERY TERM", query_buffer, 1024, NULL);
>                 ptr = strstr(query_buffer, "CONMODE");
>                 /*
> -                * Set the conmode to 3215 so that the device recognition
> +                * Set the conmode to 3215 so that the device recognition
>                  * will set the cu_type of the console to 3215. If the
>                  * conmode is 3270 and we don't set it back then both
>                  * 3215 and the 3270 driver will try to access the console
> @@ -258,7 +258,7 @@ static inline void setup_zfcpdump(void) {}
>
>   /*
>   * Reboot, halt and power_off stubs. They just call _machine_restart,
> - * _machine_halt or _machine_power_off.
> + * _machine_halt or _machine_power_off.
>   */
>
>  void machine_restart(char *command)
> @@ -484,8 +484,9 @@ static struct resource __initdata
> *standard_resources[] = {
>  static void __init setup_resources(void)
>  {
>         struct resource *res, *std_res, *sub_res;
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         int j;
> +       u64 i;
>
>         code_resource.start = (unsigned long) _text;
>         code_resource.end = (unsigned long) _etext - 1;
> @@ -494,7 +495,7 @@ static void __init setup_resources(void)
>         bss_resource.start = (unsigned long) __bss_start;
>         bss_resource.end = (unsigned long) __bss_stop - 1;
>
> -       for_each_memblock(memory, reg) {
> +       for_each_mem_range(i, &start, &end) {
>                 res = memblock_alloc(sizeof(*res), 8);
>                 if (!res)
>                         panic("%s: Failed to allocate %zu bytes
> align=0x%x\n",
> @@ -502,8 +503,13 @@ static void __init setup_resources(void)
>                 res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
>
>                 res->name = "System RAM";
> -               res->start = reg->base;
> -               res->end = reg->base + reg->size - 1;
> +               res->start = start;
> +               /*
> +                * In memblock, end points to the first byte after the
> +                * range while in resourses, end points to the last byte in
> +                * the range.
> +                */
> +               res->end = end - 1;
>                 request_resource(&iomem_resource, res);
>
>                 for (j = 0; j < ARRAY_SIZE(standard_resources); j++) {
> @@ -819,14 +825,15 @@ static void __init reserve_kernel(void)
>
>  static void __init setup_memory(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
> +       u64 i;
>
>         /*
>          * Init storage key for present memory
>          */
> -       for_each_memblock(memory, reg) {
> -               storage_key_init_range(reg->base, reg->base + reg->size);
> -       }
> +       for_each_mem_range(i, &start, &end)
> +               storage_key_init_range(start, end);
> +
>         psw_set_key(PAGE_DEFAULT_KEY);
>
>         /* Only cosmetics */
> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
> index 8b6282cf7d13..30076ecc3eb7 100644
> --- a/arch/s390/mm/vmem.c
> +++ b/arch/s390/mm/vmem.c
> @@ -399,10 +399,11 @@ int vmem_add_mapping(unsigned long start, unsigned
> long size)
>   */
>  void __init vmem_map_init(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
> +       u64 i;
>
> -       for_each_memblock(memory, reg)
> -               vmem_add_mem(reg->base, reg->size);
> +       for_each_mem_range(i, &start, &end)
> +               vmem_add_mem(start, end - start);
>         __set_memory((unsigned long)_stext,
>                      (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
>                      SET_MEMORY_RO | SET_MEMORY_X);
> @@ -428,16 +429,17 @@ void __init vmem_map_init(void)
>   */
>  static int __init vmem_convert_memory_chunk(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         struct memory_segment *seg;
> +       u64 i;
>
>         mutex_lock(&vmem_mutex);
> -       for_each_memblock(memory, reg) {
> +       for_each_mem_range(i, &start, &end) {
>                 seg = kzalloc(sizeof(*seg), GFP_KERNEL);
>                 if (!seg)
>                         panic("Out of memory...\n");
> -               seg->start = reg->base;
> -               seg->size = reg->size;
> +               seg->start = start;
> +               seg->size = end - start;
>                 insert_memory_segment(seg);
>         }
>         mutex_unlock(&vmem_mutex);
> diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
> index 02e6e5e0f106..de63c002638e 100644
> --- a/arch/sparc/mm/init_64.c
> +++ b/arch/sparc/mm/init_64.c
> @@ -1192,18 +1192,14 @@ int of_node_to_nid(struct device_node *dp)
>
>  static void __init add_node_ranges(void)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         unsigned long prev_max;
> +       u64 i;
>
>  memblock_resized:
>         prev_max = memblock.memory.max;
>
> -       for_each_memblock(memory, reg) {
> -               unsigned long size = reg->size;
> -               unsigned long start, end;
> -
> -               start = reg->base;
> -               end = start + size;
> +       for_each_mem_range(i, &start, &end) {
>                 while (start < end) {
>                         unsigned long this_end;
>                         int nid;
> @@ -1211,7 +1207,7 @@ static void __init add_node_ranges(void)
>                         this_end = memblock_nid_range(start, end, &nid);
>
>                         numadbg("Setting memblock NUMA node nid[%d] "
> -                               "start[%lx] end[%lx]\n",
> +                               "start[%llx] end[%lx]\n",
>                                 nid, start, this_end);
>
>                         memblock_set_node(start, this_end - start,
> diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
> index 5b2a11a88951..2519ceede64b 100644
> --- a/drivers/bus/mvebu-mbus.c
> +++ b/drivers/bus/mvebu-mbus.c
> @@ -610,23 +610,23 @@ static unsigned int
> armada_xp_mbus_win_remap_offset(int win)
>  static void __init
>  mvebu_mbus_find_bridge_hole(uint64_t *start, uint64_t *end)
>  {
> -       struct memblock_region *r;
> -       uint64_t s = 0;
> +       phys_addr_t reg_start, reg_end;
> +       uint64_t i, s = 0;
>
> -       for_each_memblock(memory, r) {
> +       for_each_mem_range(i, &reg_start, &reg_end) {
>                 /*
>                  * This part of the memory is above 4 GB, so we don't
>                  * care for the MBus bridge hole.
>                  */
> -               if (r->base >= 0x100000000ULL)
> +               if (reg_start >= 0x100000000ULL)
>                         continue;
>
>                 /*
>                  * The MBus bridge hole is at the end of the RAM under
>                  * the 4 GB limit.
>                  */
> -               if (r->base + r->size > s)
> -                       s = r->base + r->size;
> +               if (reg_end > s)
> +                       s = reg_end;
>         }
>
>         *start = s;
> diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
> index 08f812475f5e..484b1ec9a1bc 100644
> --- a/drivers/s390/char/zcore.c
> +++ b/drivers/s390/char/zcore.c
> @@ -148,18 +148,19 @@ static ssize_t zcore_memmap_read(struct file *filp,
> char __user *buf,
>
>  static int zcore_memmap_open(struct inode *inode, struct file *filp)
>  {
> -       struct memblock_region *reg;
> +       phys_addr_t start, end;
>         char *buf;
>         int i = 0;
> +       u64 r;
>
>         buf = kcalloc(memblock.memory.cnt, CHUNK_INFO_SIZE, GFP_KERNEL);
>         if (!buf) {
>                 return -ENOMEM;
>         }
> -       for_each_memblock(memory, reg) {
> +       for_each_mem_range(r, &start, &end) {
>                 sprintf(buf + (i++ * CHUNK_INFO_SIZE), "%016llx %016llx ",
> -                       (unsigned long long) reg->base,
> -                       (unsigned long long) reg->size);
> +                       (unsigned long long) start,
> +                       (unsigned long long) (end - start));
>         }
>         filp->private_data = buf;
>         return nonseekable_open(inode, filp);
> --
> 2.26.2
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv
>

[-- Attachment #2: Type: text/html, Size: 54937 bytes --]

^ permalink raw reply

* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Nicholas Piggin @ 2020-07-28 22:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-arch, Hillf Danton, mm-commits, Catalin Marinas,
	Hugh Dickins, Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
	Johannes Weiner, Yu Xu, Andrew Morton, linuxppc-dev, Yang Shi,
	Kirill A . Shutemov
In-Reply-To: <CAHk-=wgrgRqeEo-YUgec7yQNkN+_+sHBP-NtCnfktCFEuPHTDQ@mail.gmail.com>

Excerpts from Linus Torvalds's message of July 29, 2020 5:02 am:
> On Tue, Jul 28, 2020 at 3:53 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>>
>> The quirk is a problem with coprocessor where it's supposed to
>> invalidate the translation after a fault but it doesn't, so we can get a
>> read-only TLB stuck after something else does a RO->RW upgrade on the
>> TLB. Something like that IIRC.  Coprocessors have their own MMU which
>> lives in the nest not the core, so you need a global TLB flush to
>> invalidate that thing.
> 
> So I assumed, but it does seem confused.
> 
> Why? Because if there are stale translations on the co-processor,
> there's no guarantee that one of the CPU's will have them and take a
> fault.
> 
> So I'm not seeing why a core CPU doing spurious TLB invalidation would
> follow from "stale TLB in the Nest".

If the nest MMU access faults, it sends an interrupt to the CPU and
the driver tries to handle the page fault for it (I think that's how
it works).

> If anything, I think "we have a coprocessor that needs to never have
> stale TLB entries" would impact the _regular_ TLB invalidates (by
> update_mmu_cache()) and perhaps make those more aggressive, exactly
> because the coprocessor may not handle the fault as gracefully.

It could be done that way... Hmm although we do have something similar 
also in radix__ptep_set_access_flags for the relaxing permissions case
so maybe this is required for not-present faults as well? I'm not 
actually sure now.

But it's a bit weird and awkward because it's working around quirks in
the hardware which aren't regular, not because we're _completely_ 
confused (I hope).

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH] powerpc/powernv/pci: Fix build of pci-ioda.o
From: Oliver O'Halloran @ 2020-07-28 22:50 UTC (permalink / raw)
  To: Gustavo Romero; +Cc: linuxppc-dev
In-Reply-To: <20200728223337.40447-1-gromero@linux.ibm.com>

On Wed, Jul 29, 2020 at 8:35 AM Gustavo Romero <gromero@linux.ibm.com> wrote:
>
> Currently pnv_ioda_setup_bus_dma() is outside of a CONFIG_IOMMU_API guard
> and if CONFIG_IOMMU_API=n the build can fail if the compiler sets
> -Werror=unused-function, because pnv_ioda_setup_bus_dma() is only used in
> functions guarded by a CONFIG_IOMMU_API guard.
>
> That issue can be easily reproduced using the skiroot_defconfig. For other
> configs, like powernv_defconfig, that issue is hidden by the fact that
> if CONFIG_IOMMU_SUPPORT is enabled plus other common IOMMU options, like
> CONFIG_OF_IOMMU, by default CONFIG_IOMMU_API is enabled as well. Hence, for
> powernv_defconfig, it's necessary to set CONFIG_IOMMU_SUPPORT=n to make the
> build fail, because CONFIG_PCI=y and pci-ioda.c is included in the build,
> but since CONFIG_IOMMU_SUPPORT=n the CONFIG_IOMMU_API is disabled, breaking
> the build.
>
> This commit fixes that build issue by moving the pnv_ioda_setup_bus_dma()
> inside a CONFIG_IOMMU_API guard, so when CONFIG_IOMMU_API is disabled that
> function is not defined.

I think a fix for this is already in -next.

^ permalink raw reply

* [PATCH] powerpc/powernv/pci: Fix build of pci-ioda.o
From: Gustavo Romero @ 2020-07-28 22:33 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: oohall, gromero

Currently pnv_ioda_setup_bus_dma() is outside of a CONFIG_IOMMU_API guard
and if CONFIG_IOMMU_API=n the build can fail if the compiler sets
-Werror=unused-function, because pnv_ioda_setup_bus_dma() is only used in
functions guarded by a CONFIG_IOMMU_API guard.

That issue can be easily reproduced using the skiroot_defconfig. For other
configs, like powernv_defconfig, that issue is hidden by the fact that
if CONFIG_IOMMU_SUPPORT is enabled plus other common IOMMU options, like
CONFIG_OF_IOMMU, by default CONFIG_IOMMU_API is enabled as well. Hence, for
powernv_defconfig, it's necessary to set CONFIG_IOMMU_SUPPORT=n to make the
build fail, because CONFIG_PCI=y and pci-ioda.c is included in the build,
but since CONFIG_IOMMU_SUPPORT=n the CONFIG_IOMMU_API is disabled, breaking
the build.

This commit fixes that build issue by moving the pnv_ioda_setup_bus_dma()
inside a CONFIG_IOMMU_API guard, so when CONFIG_IOMMU_API is disabled that
function is not defined.

Signed-off-by: Gustavo Romero <gromero@linux.ibm.com>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 26 +++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 73a63efcf855..743d840712da 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1885,19 +1885,6 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
 	return false;
 }
 
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
-		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
-
-		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
-	}
-}
-
 static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
 						     bool real_mode)
 {
@@ -2501,6 +2488,19 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 #endif
 
 #ifdef CONFIG_IOMMU_API
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+	struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+		dev->dev.archdata.dma_offset = pe->tce_bypass_base;
+
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+	}
+}
+
 unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels)
 {
-- 
2.17.1


^ permalink raw reply related

* Re: [RESEND PATCH v5 06/11] ppc64/kexec_file: restrict memory usage of kdump kernel
From: Hari Bathini @ 2020-07-28 19:34 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <875za77o05.fsf@mpe.ellerman.id.au>



On 28/07/20 7:14 pm, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
>> index 2df6f4273ddd..8df085a22fd7 100644
>> --- a/arch/powerpc/kexec/file_load_64.c
>> +++ b/arch/powerpc/kexec/file_load_64.c
>> @@ -17,9 +17,21 @@
>>   #include <linux/kexec.h>
>>   #include <linux/of_fdt.h>
>>   #include <linux/libfdt.h>
>> +#include <linux/of_device.h>
>>   #include <linux/memblock.h>
>> +#include <linux/slab.h>
>> +#include <asm/drmem.h>
>>   #include <asm/kexec_ranges.h>
>>   
>> +struct umem_info {
>> +	uint64_t *buf; /* data buffer for usable-memory property */
>> +	uint32_t idx;  /* current index */
>> +	uint32_t size; /* size allocated for the data buffer */
> 
> Use kernel types please, u64, u32.
> 
>> +	/* usable memory ranges to look up */
>> +	const struct crash_mem *umrngs;
> 
> "umrngs".
> 
> Given it's part of the umem_info struct could it just be "ranges"?

True. Actually, having crash_mem_range *ranges + u32 nr_ranges and 
populating them seems better. Will do that..

>> +		return NULL;
>> +	}
> 
> 	um_info->size = new_size;
> 
>> +
>> +	memset(tbuf + um_info->idx, 0, MEM_RANGE_CHUNK_SZ);
> 
> Just pass __GFP_ZERO to krealloc?

There are patches submitted to stable fixing a few modules that use 
krealloc with __GFP_ZERO. Also, this zeroing is not really needed.
I will drop the memset instead..

Thanks
Hari

^ permalink raw reply

* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Hari Bathini @ 2020-07-28 19:24 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <87365b7nx4.fsf@mpe.ellerman.id.au>



On 28/07/20 7:16 pm, Michael Ellerman wrote:
> Hari Bathini <hbathini@linux.ibm.com> writes:
>> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
>> to be filled with OPAL base & entry addresses respectively. Setting
>> these registers allows the kernel to perform OPAL calls before the
>> device tree is parsed.
> 
> I'm not convinced we want to do this.
> 
> If we do it becomes part of the kexec ABI and we have to honour it into
> the future.
> 
> And in practice there are no non-development kernels built with OPAL early
> debugging enabled, so it's not clear it actually helps anyone other than
> developers.
> 

Hmmm.. kexec-tools does it since commit d58ad564852c ("kexec/ppc64
Enable early kernel's OPAL calls") for kexec_load syscall. So, we would
be breaking kexec ABI either way, I guess.

Let me put this patch at the end of the series in the respin to let you
decide whether to have it or not..

Thanks
Hari

^ permalink raw reply

* Re: [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
From: Nathan Lynch @ 2020-07-28 19:19 UTC (permalink / raw)
  To: Laurent Dufour; +Cc: tyreld, cheloha, linuxppc-dev
In-Reply-To: <bd9225f2-40c9-0460-ba45-c29c920b5f91@linux.ibm.com>

Hi Laurent,

Laurent Dufour <ldufour@linux.ibm.com> writes:
> Le 28/07/2020 à 19:37, Nathan Lynch a écrit :
>> The drmem lmb list can have hundreds of thousands of entries, and
>> unfortunately lookups take the form of linear searches. As long as
>> this is the case, traversals have the potential to monopolize the CPU
>> and provoke lockup reports, workqueue stalls, and the like unless
>> they explicitly yield.
>> 
>> Rather than placing cond_resched() calls within various
>> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
>> expression of the loop macro itself so users can't omit it.
>
> Is that not too much to call cond_resched() on every LMB?
>
> Could that be less frequent, every 10, or 100, I don't really know ?

Everything done within for_each_drmem_lmb is relatively heavyweight
already. E.g. calling dlpar_remove_lmb()/dlpar_add_lmb() can take dozens
of milliseconds. I don't think cond_resched() is an expensive check in
this context.

^ permalink raw reply

* Re: [patch 01/15] mm/memory.c: avoid access flag update TLB flush for retried page fault
From: Linus Torvalds @ 2020-07-28 19:02 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Hillf Danton, Yang Shi, Yu Xu, Catalin Marinas,
	Hugh Dickins, Josef Bacik, Will Deacon, Linux-MM, Matthew Wilcox,
	Johannes Weiner, mm-commits, Andrew Morton, linuxppc-dev,
	Kirill A . Shutemov
In-Reply-To: <1595932767.wga6c4yy6a.astroid@bobo.none>

On Tue, Jul 28, 2020 at 3:53 AM Nicholas Piggin <npiggin@gmail.com> wrote:
>
> The quirk is a problem with coprocessor where it's supposed to
> invalidate the translation after a fault but it doesn't, so we can get a
> read-only TLB stuck after something else does a RO->RW upgrade on the
> TLB. Something like that IIRC.  Coprocessors have their own MMU which
> lives in the nest not the core, so you need a global TLB flush to
> invalidate that thing.

So I assumed, but it does seem confused.

Why? Because if there are stale translations on the co-processor,
there's no guarantee that one of the CPU's will have them and take a
fault.

So I'm not seeing why a core CPU doing spurious TLB invalidation would
follow from "stale TLB in the Nest".

If anything, I think "we have a coprocessor that needs to never have
stale TLB entries" would impact the _regular_ TLB invalidates (by
update_mmu_cache()) and perhaps make those more aggressive, exactly
because the coprocessor may not handle the fault as gracefully.

I dunno. I don't know the coprocessor side well enough to judge, I'm
just looking at it from a conceptual standpoint.

          Linus

^ permalink raw reply

* Re: [PATCH v2] pseries/drmem: don't cache node id in drmem_lmb struct
From: David Hildenbrand @ 2020-07-28 18:22 UTC (permalink / raw)
  To: Scott Cheloha, linuxppc-dev
  Cc: Nathan Lynch, Michal Suchanek, Laurent Dufour, Rick Lindsley
In-Reply-To: <20200728165339.3031126-1-cheloha@linux.ibm.com>

On 28.07.20 18:53, Scott Cheloha wrote:
> At memory hot-remove time we can retrieve an LMB's nid from its
> corresponding memory_block.  There is no need to store the nid
> in multiple locations.
> 
> Note that lmb_to_memblock() uses find_memory_block() to get the
> corresponding memory_block.  As find_memory_block() runs in sub-linear
> time this approach is negligibly slower than what we do at present.
> 
> In exchange for this lookup at hot-remove time we no longer need to
> call memory_add_physaddr_to_nid() during drmem_init() for each LMB.
> On powerpc, memory_add_physaddr_to_nid() is a linear search, so this
> spares us an O(n^2) initialization during boot.
> 
> On systems with many LMBs that initialization overhead is palpable and
> disruptive.  For example, on a box with 249854 LMBs we're seeing
> drmem_init() take upwards of 30 seconds to complete:
> 
> [   53.721639] drmem: initializing drmem v2
> [   80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1]
> [   80.604377] Modules linked in:
> [   80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4
> [   80.604397] NIP:  c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000
> [   80.604407] REGS: c0002dbff8493830 TRAP: 0901   Not tainted  (5.6.0-rc2+)
> [   80.604412] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44000248  XER: 0000000d
> [   80.604431] CFAR: c0000000000a4a38 IRQMASK: 0
> [   80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30
> [   80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000
> [   80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001
> [   80.604431] GPR12: 0000000000000000 c00000001e8ec200
> [   80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0
> [   80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0
> [   80.604492] Call Trace:
> [   80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable)
> [   80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60
> [   80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0
> [   80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0
> [   80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0
> [   80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148
> [   80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74
> [   80.604567] Instruction dump:
> [   80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214
> [   80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040
> [   89.047390] drmem: 249854 LMB(s)
> 
> With a patched kernel on the same machine we're no longer seeing the
> soft lockup.  drmem_init() now completes in negligible time, even when
> the LMB count is large.
> 

Yeah, as long as you remove_memory() in memory block granularity, this
is guaranteed to work.

Reviewed-by: David Hildenbrand <david@redhat.com>


-- 
Thanks,

David / dhildenb


^ permalink raw reply

* Re: [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
From: Laurent Dufour @ 2020-07-28 17:46 UTC (permalink / raw)
  To: Nathan Lynch, linuxppc-dev; +Cc: tyreld, cheloha
In-Reply-To: <20200728173741.717372-1-nathanl@linux.ibm.com>

Le 28/07/2020 à 19:37, Nathan Lynch a écrit :
> The drmem lmb list can have hundreds of thousands of entries, and
> unfortunately lookups take the form of linear searches. As long as
> this is the case, traversals have the potential to monopolize the CPU
> and provoke lockup reports, workqueue stalls, and the like unless
> they explicitly yield.
> 
> Rather than placing cond_resched() calls within various
> for_each_drmem_lmb() loop blocks in the code, put it in the iteration
> expression of the loop macro itself so users can't omit it.

Hi Nathan,

Is that not too much to call cond_resched() on every LMB?

Could that be less frequent, every 10, or 100, I don't really know ?

Cheers,
Laurent.
> 
> Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format")
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>   arch/powerpc/include/asm/drmem.h | 10 +++++++++-
>   1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
> index 414d209f45bb..36d0ed04bda8 100644
> --- a/arch/powerpc/include/asm/drmem.h
> +++ b/arch/powerpc/include/asm/drmem.h
> @@ -8,6 +8,8 @@
>   #ifndef _ASM_POWERPC_LMB_H
>   #define _ASM_POWERPC_LMB_H
>   
> +#include <linux/sched.h>
> +
>   struct drmem_lmb {
>   	u64     base_addr;
>   	u32     drc_index;
> @@ -26,8 +28,14 @@ struct drmem_lmb_info {
>   
>   extern struct drmem_lmb_info *drmem_info;
>   
> +static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
> +{
> +	cond_resched();
> +	return ++lmb;
> +}
> +
>   #define for_each_drmem_lmb_in_range(lmb, start, end)		\
> -	for ((lmb) = (start); (lmb) < (end); (lmb)++)
> +	for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb))
>   
>   #define for_each_drmem_lmb(lmb)					\
>   	for_each_drmem_lmb_in_range((lmb),			\
> 


^ permalink raw reply

* [PATCH] powerpc/pseries: explicitly reschedule during drmem_lmb list traversal
From: Nathan Lynch @ 2020-07-28 17:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: tyreld, cheloha, ldufour

The drmem lmb list can have hundreds of thousands of entries, and
unfortunately lookups take the form of linear searches. As long as
this is the case, traversals have the potential to monopolize the CPU
and provoke lockup reports, workqueue stalls, and the like unless
they explicitly yield.

Rather than placing cond_resched() calls within various
for_each_drmem_lmb() loop blocks in the code, put it in the iteration
expression of the loop macro itself so users can't omit it.

Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm, dynamic-memory data from DT format")
Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
---
 arch/powerpc/include/asm/drmem.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 414d209f45bb..36d0ed04bda8 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -8,6 +8,8 @@
 #ifndef _ASM_POWERPC_LMB_H
 #define _ASM_POWERPC_LMB_H
 
+#include <linux/sched.h>
+
 struct drmem_lmb {
 	u64     base_addr;
 	u32     drc_index;
@@ -26,8 +28,14 @@ struct drmem_lmb_info {
 
 extern struct drmem_lmb_info *drmem_info;
 
+static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb)
+{
+	cond_resched();
+	return ++lmb;
+}
+
 #define for_each_drmem_lmb_in_range(lmb, start, end)		\
-	for ((lmb) = (start); (lmb) < (end); (lmb)++)
+	for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb))
 
 #define for_each_drmem_lmb(lmb)					\
 	for_each_drmem_lmb_in_range((lmb),			\
-- 
2.25.4


^ permalink raw reply related

* [PATCH v2] pseries/drmem: don't cache node id in drmem_lmb struct
From: Scott Cheloha @ 2020-07-28 16:53 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Nathan Lynch, Laurent Dufour, David Hildenbrand, Michal Suchanek,
	Rick Lindsley

At memory hot-remove time we can retrieve an LMB's nid from its
corresponding memory_block.  There is no need to store the nid
in multiple locations.

Note that lmb_to_memblock() uses find_memory_block() to get the
corresponding memory_block.  As find_memory_block() runs in sub-linear
time this approach is negligibly slower than what we do at present.

In exchange for this lookup at hot-remove time we no longer need to
call memory_add_physaddr_to_nid() during drmem_init() for each LMB.
On powerpc, memory_add_physaddr_to_nid() is a linear search, so this
spares us an O(n^2) initialization during boot.

On systems with many LMBs that initialization overhead is palpable and
disruptive.  For example, on a box with 249854 LMBs we're seeing
drmem_init() take upwards of 30 seconds to complete:

[   53.721639] drmem: initializing drmem v2
[   80.604346] watchdog: BUG: soft lockup - CPU#65 stuck for 23s! [swapper/0:1]
[   80.604377] Modules linked in:
[   80.604389] CPU: 65 PID: 1 Comm: swapper/0 Not tainted 5.6.0-rc2+ #4
[   80.604397] NIP:  c0000000000a4980 LR: c0000000000a4940 CTR: 0000000000000000
[   80.604407] REGS: c0002dbff8493830 TRAP: 0901   Not tainted  (5.6.0-rc2+)
[   80.604412] MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 44000248  XER: 0000000d
[   80.604431] CFAR: c0000000000a4a38 IRQMASK: 0
[   80.604431] GPR00: c0000000000a4940 c0002dbff8493ac0 c000000001904400 c0003cfffffede30
[   80.604431] GPR04: 0000000000000000 c000000000f4095a 000000000000002f 0000000010000000
[   80.604431] GPR08: c0000bf7ecdb7fb8 c0000bf7ecc2d3c8 0000000000000008 c00c0002fdfb2001
[   80.604431] GPR12: 0000000000000000 c00000001e8ec200
[   80.604477] NIP [c0000000000a4980] hot_add_scn_to_nid+0xa0/0x3e0
[   80.604486] LR [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0
[   80.604492] Call Trace:
[   80.604498] [c0002dbff8493ac0] [c0000000000a4940] hot_add_scn_to_nid+0x60/0x3e0 (unreliable)
[   80.604509] [c0002dbff8493b20] [c000000000087c10] memory_add_physaddr_to_nid+0x20/0x60
[   80.604521] [c0002dbff8493b40] [c0000000010d4880] drmem_init+0x25c/0x2f0
[   80.604530] [c0002dbff8493c10] [c000000000010154] do_one_initcall+0x64/0x2c0
[   80.604540] [c0002dbff8493ce0] [c0000000010c4aa0] kernel_init_freeable+0x2d8/0x3a0
[   80.604550] [c0002dbff8493db0] [c000000000010824] kernel_init+0x2c/0x148
[   80.604560] [c0002dbff8493e20] [c00000000000b648] ret_from_kernel_thread+0x5c/0x74
[   80.604567] Instruction dump:
[   80.604574] 392918e8 e9490000 e90a000a e92a0000 80ea000c 1d080018 3908ffe8 7d094214
[   80.604586] 7fa94040 419d00dc e9490010 714a0088 <2faa0008> 409e00ac e9490000 7fbe5040
[   89.047390] drmem: 249854 LMB(s)

With a patched kernel on the same machine we're no longer seeing the
soft lockup.  drmem_init() now completes in negligible time, even when
the LMB count is large.

Signed-off-by: Scott Cheloha <cheloha@linux.ibm.com>
---
 arch/powerpc/include/asm/drmem.h              | 21 -------------------
 arch/powerpc/mm/drmem.c                       |  6 +-----
 .../platforms/pseries/hotplug-memory.c        | 19 ++++++++++-------
 3 files changed, 13 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 414d209f45bb..34e4e9b257f5 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -13,9 +13,6 @@ struct drmem_lmb {
 	u32     drc_index;
 	u32     aa_index;
 	u32     flags;
-#ifdef CONFIG_MEMORY_HOTPLUG
-	int	nid;
-#endif
 };
 
 struct drmem_lmb_info {
@@ -104,22 +101,4 @@ static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb)
 	lmb->aa_index = 0xffffffff;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static inline void lmb_set_nid(struct drmem_lmb *lmb)
-{
-	lmb->nid = memory_add_physaddr_to_nid(lmb->base_addr);
-}
-static inline void lmb_clear_nid(struct drmem_lmb *lmb)
-{
-	lmb->nid = -1;
-}
-#else
-static inline void lmb_set_nid(struct drmem_lmb *lmb)
-{
-}
-static inline void lmb_clear_nid(struct drmem_lmb *lmb)
-{
-}
-#endif
-
 #endif /* _ASM_POWERPC_LMB_H */
diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c
index 59327cefbc6a..873fcfc7b875 100644
--- a/arch/powerpc/mm/drmem.c
+++ b/arch/powerpc/mm/drmem.c
@@ -362,10 +362,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop)
 	if (!drmem_info->lmbs)
 		return;
 
-	for_each_drmem_lmb(lmb) {
+	for_each_drmem_lmb(lmb)
 		read_drconf_v1_cell(lmb, &prop);
-		lmb_set_nid(lmb);
-	}
 }
 
 static void __init init_drmem_v2_lmbs(const __be32 *prop)
@@ -410,8 +408,6 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop)
 
 			lmb->aa_index = dr_cell.aa_index;
 			lmb->flags = dr_cell.flags;
-
-			lmb_set_nid(lmb);
 		}
 	}
 }
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5ace2f9a277e..7bf66fdcf916 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -356,25 +356,29 @@ static int dlpar_add_lmb(struct drmem_lmb *);
 
 static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 {
+	struct memory_block *mem_block;
 	unsigned long block_sz;
 	int rc;
 
 	if (!lmb_is_removable(lmb))
 		return -EINVAL;
 
+	mem_block = lmb_to_memblock(lmb);
+	if (mem_block == NULL)
+		return -EINVAL;
+
 	rc = dlpar_offline_lmb(lmb);
 	if (rc)
 		return rc;
 
 	block_sz = pseries_memory_block_size();
 
-	__remove_memory(lmb->nid, lmb->base_addr, block_sz);
+	__remove_memory(mem_block->nid, lmb->base_addr, block_sz);
 
 	/* Update memory regions for memory remove */
 	memblock_remove(lmb->base_addr, block_sz);
 
 	invalidate_lmb_associativity_index(lmb);
-	lmb_clear_nid(lmb);
 	lmb->flags &= ~DRCONF_MEM_ASSIGNED;
 
 	return 0;
@@ -631,7 +635,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index)
 static int dlpar_add_lmb(struct drmem_lmb *lmb)
 {
 	unsigned long block_sz;
-	int rc;
+	int nid, rc;
 
 	if (lmb->flags & DRCONF_MEM_ASSIGNED)
 		return -EINVAL;
@@ -642,11 +646,13 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 		return rc;
 	}
 
-	lmb_set_nid(lmb);
 	block_sz = memory_block_size_bytes();
 
+	/* Find the node id for this address. */
+	nid = memory_add_physaddr_to_nid(lmb->base_addr);
+
 	/* Add the memory */
-	rc = __add_memory(lmb->nid, lmb->base_addr, block_sz);
+	rc = __add_memory(nid, lmb->base_addr, block_sz);
 	if (rc) {
 		invalidate_lmb_associativity_index(lmb);
 		return rc;
@@ -654,9 +660,8 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
 	rc = dlpar_online_lmb(lmb);
 	if (rc) {
-		__remove_memory(lmb->nid, lmb->base_addr, block_sz);
+		__remove_memory(nid, lmb->base_addr, block_sz);
 		invalidate_lmb_associativity_index(lmb);
-		lmb_clear_nid(lmb);
 	} else {
 		lmb->flags |= DRCONF_MEM_ASSIGNED;
 	}
-- 
2.24.1


^ permalink raw reply related

* [PATCH] selftests/powerpc: return skip code for spectre_v2
From: Thadeu Lima de Souza Cascardo @ 2020-07-28 15:50 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: cascardo, Shuah Khan, linuxppc-dev, linux-kselftest, linux-kernel

When running under older versions of qemu of under newer versions with old
machine types, some security features will not be reported to the guest.
This will lead the guest OS to consider itself Vulnerable to spectre_v2.

So, spectre_v2 test fails in such cases when the host is mitigated and miss
predictions cannot be detected as expected by the test.

Make it return the skip code instead, for this particular case. We don't
want to miss the case when the test fails and the system reports as
mitigated or not affected. But it is not a problem to miss failures when
the system reports as Vulnerable.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
---
 tools/testing/selftests/powerpc/security/spectre_v2.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/testing/selftests/powerpc/security/spectre_v2.c b/tools/testing/selftests/powerpc/security/spectre_v2.c
index 8c6b982af2a8..d5445bfd63ed 100644
--- a/tools/testing/selftests/powerpc/security/spectre_v2.c
+++ b/tools/testing/selftests/powerpc/security/spectre_v2.c
@@ -183,6 +183,14 @@ int spectre_v2_test(void)
 		if (miss_percent > 15) {
 			printf("Branch misses > 15%% unexpected in this configuration!\n");
 			printf("Possible mis-match between reported & actual mitigation\n");
+			/* Such a mismatch may be caused by a guest system
+			 * reporting as vulnerable when the host is mitigated.
+			 * Return skip code to avoid detecting this as an
+			 * error. We are not vulnerable and reporting otherwise,
+			 * so missing such a mismatch is safe.
+			 */
+			if (state == VULNERABLE)
+				return 4;
 			return 1;
 		}
 		break;
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH v4 09/10] Powerpc/smp: Create coregroup domain
From: Valentin Schneider @ 2020-07-28 15:03 UTC (permalink / raw)
  To: Srikar Dronamraju
  Cc: Nathan Lynch, Gautham R Shenoy, Michael Neuling, Peter Zijlstra,
	LKML, Nicholas Piggin, Oliver O'Halloran, Jordan Niethe,
	linuxppc-dev, Ingo Molnar
In-Reply-To: <20200727053230.19753-10-srikar@linux.vnet.ibm.com>


Hi,

On 27/07/20 06:32, Srikar Dronamraju wrote:
> Add percpu coregroup maps and masks to create coregroup domain.
> If a coregroup doesn't exist, the coregroup domain will be degenerated
> in favour of SMT/CACHE domain.
>

So there's at least one arm64 platform out there with the same "pairs of
cores share L2" thing (Ampere eMAG), and that lives quite happily with the
default scheduler topology (SMT/MC/DIE). Each pair of core gets its MC
domain, and the whole system is covered by DIE.

Now arguably it's not a perfect representation; DIE doesn't have
SD_SHARE_PKG_RESOURCES so the highest level sd_llc can point to is MC. That
will impact all callsites using cpus_share_cache(): in the eMAG case, only
pairs of cores will be seen as sharing cache, even though *all* cores share
the same L3.

I'm trying to paint a picture of what the P9 topology looks like (the one
you showcase in your cover letter) to see if there are any similarities;
from what I gather in [1], wikichips and your cover letter, with P9 you can
have something like this in a single DIE (somewhat unsure about L3 setup;
it looks to be distributed?)

     +---------------------------------------------------------------------+
     |                                  L3                                 |
     +---------------+-+---------------+-+---------------+-+---------------+
     |       L2      | |       L2      | |       L2      | |       L2      |
     +------+-+------+ +------+-+------+ +------+-+------+ +------+-+------+
     |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  | |  L1  |
     +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+
     |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs| |4 CPUs|
     +------+ +------+ +------+ +------+ +------+ +------+ +------+ +------+

Which would lead to (ignoring the whole SMT CPU numbering shenanigans)

NUMA     [                                                   ...
DIE      [                                             ]
MC       [         ] [         ] [         ] [         ]
BIGCORE  [         ] [         ] [         ] [         ]
SMT      [   ] [   ] [   ] [   ] [   ] [   ] [   ] [   ]
         00-03 04-07 08-11 12-15 16-19 20-23 24-27 28-31  <other node here>

This however has MC == BIGCORE; what makes it you can have different spans
for these two domains? If it's not too much to ask, I'd love to have a P9
topology diagram.

[1]: 20200722081822.GG9290@linux.vnet.ibm.com

^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Baoquan He @ 2020-07-28 14:23 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728141504.GC3655207@kernel.org>

On 07/28/20 at 05:15pm, Mike Rapoport wrote:
> On Tue, Jul 28, 2020 at 07:02:54PM +0800, Baoquan He wrote:
> > On 07/28/20 at 08:11am, Mike Rapoport wrote:
> > > From: Mike Rapoport <rppt@linux.ibm.com>
> > > 
> > > numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> > > regions to set node ID in memblock.reserved and than traverses
> > > memblock.reserved to update reserved_nodemask to include node IDs that were
> > > set in the first loop.
> > > 
> > > Remove redundant traversal over memblock.reserved and update
> > > reserved_nodemask while iterating over numa_meminfo.
> > > 
> > > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > > ---
> > >  arch/x86/mm/numa.c | 26 ++++++++++----------------
> > >  1 file changed, 10 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > > index 8ee952038c80..4078abd33938 100644
> > > --- a/arch/x86/mm/numa.c
> > > +++ b/arch/x86/mm/numa.c
> > > @@ -498,31 +498,25 @@ static void __init numa_clear_kernel_node_hotplug(void)
> > >  	 * and use those ranges to set the nid in memblock.reserved.
> > >  	 * This will split up the memblock regions along node
> > >  	 * boundaries and will set the node IDs as well.
> > > +	 *
> > > +	 * The nid will also be set in reserved_nodemask which is later
> > > +	 * used to clear MEMBLOCK_HOTPLUG flag.
> > > +	 *
> > > +	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
> > > +	 *   numa_meminfo might not include all memblock.reserved
> > > +	 *   memory ranges, because quirks such as trim_snb_memory()
> > > +	 *   reserve specific pages for Sandy Bridge graphics.
> > > +	 *   These ranges will remain with nid == MAX_NUMNODES. ]
> > >  	 */
> > >  	for (i = 0; i < numa_meminfo.nr_blks; i++) {
> > >  		struct numa_memblk *mb = numa_meminfo.blk + i;
> > >  		int ret;
> > >  
> > >  		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
> > > +		node_set(mb->nid, reserved_nodemask);
> > 
> > Really? This will set all node id into reserved_nodemask. But in the
> > current code, it's setting nid into memblock reserved region which
> > interleaves with numa_memoinfo, then get those nid and set it in
> > reserved_nodemask. This is so different, with my understanding. Please
> > correct me if I am wrong.
> 
> You are right, I've missed the intersections of numa_meminfo with
> memblock.reserved.
> 
> x86 interaction with membock is so, hmm, interesting...

Yeah, numa_clear_kernel_node_hotplug() intends to find out any memory node
which has reserved memory, then make it as unmovable. Setting all node
id into reserved_nodemask will break the use case of hot removing hotpluggable
boot memory after system bootup.


^ permalink raw reply

* Re: [PATCH 14/15] x86/numa: remove redundant iteration over memblock.reserved
From: Mike Rapoport @ 2020-07-28 14:15 UTC (permalink / raw)
  To: Baoquan He
  Cc: linux-sh, Peter Zijlstra, Dave Hansen, linux-mips, Max Filippov,
	Paul Mackerras, sparclinux, linux-riscv, Will Deacon,
	Stafford Horne, Marek Szyprowski, linux-s390, linux-c6x-dev,
	Yoshinori Sato, x86, Russell King, Mike Rapoport,
	clang-built-linux, Ingo Molnar, Catalin Marinas, uclinux-h8-devel,
	linux-xtensa, openrisc, Borislav Petkov, Andy Lutomirski,
	Paul Walmsley, Thomas Gleixner, linux-arm-kernel, Michal Simek,
	linux-mm, linuxppc-dev, linux-kernel, iommu, Palmer Dabbelt,
	Andrew Morton, Christoph Hellwig
In-Reply-To: <20200728110254.GA14854@MiWiFi-R3L-srv>

On Tue, Jul 28, 2020 at 07:02:54PM +0800, Baoquan He wrote:
> On 07/28/20 at 08:11am, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > numa_clear_kernel_node_hotplug() function first traverses numa_meminfo
> > regions to set node ID in memblock.reserved and than traverses
> > memblock.reserved to update reserved_nodemask to include node IDs that were
> > set in the first loop.
> > 
> > Remove redundant traversal over memblock.reserved and update
> > reserved_nodemask while iterating over numa_meminfo.
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  arch/x86/mm/numa.c | 26 ++++++++++----------------
> >  1 file changed, 10 insertions(+), 16 deletions(-)
> > 
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 8ee952038c80..4078abd33938 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -498,31 +498,25 @@ static void __init numa_clear_kernel_node_hotplug(void)
> >  	 * and use those ranges to set the nid in memblock.reserved.
> >  	 * This will split up the memblock regions along node
> >  	 * boundaries and will set the node IDs as well.
> > +	 *
> > +	 * The nid will also be set in reserved_nodemask which is later
> > +	 * used to clear MEMBLOCK_HOTPLUG flag.
> > +	 *
> > +	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
> > +	 *   numa_meminfo might not include all memblock.reserved
> > +	 *   memory ranges, because quirks such as trim_snb_memory()
> > +	 *   reserve specific pages for Sandy Bridge graphics.
> > +	 *   These ranges will remain with nid == MAX_NUMNODES. ]
> >  	 */
> >  	for (i = 0; i < numa_meminfo.nr_blks; i++) {
> >  		struct numa_memblk *mb = numa_meminfo.blk + i;
> >  		int ret;
> >  
> >  		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
> > +		node_set(mb->nid, reserved_nodemask);
> 
> Really? This will set all node id into reserved_nodemask. But in the
> current code, it's setting nid into memblock reserved region which
> interleaves with numa_memoinfo, then get those nid and set it in
> reserved_nodemask. This is so different, with my understanding. Please
> correct me if I am wrong.

You are right, I've missed the intersections of numa_meminfo with
memblock.reserved.

x86 interaction with membock is so, hmm, interesting...
 
> Thanks
> Baoquan
> 
> >  		WARN_ON_ONCE(ret);
> >  	}
> >  
> > -	/*
> > -	 * Now go over all reserved memblock regions, to construct a
> > -	 * node mask of all kernel reserved memory areas.
> > -	 *
> > -	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
> > -	 *   numa_meminfo might not include all memblock.reserved
> > -	 *   memory ranges, because quirks such as trim_snb_memory()
> > -	 *   reserve specific pages for Sandy Bridge graphics. ]
> > -	 */
> > -	for_each_memblock(reserved, mb_region) {
> > -		int nid = memblock_get_region_node(mb_region);
> > -
> > -		if (nid != MAX_NUMNODES)
> > -			node_set(nid, reserved_nodemask);
> > -	}
> > -
> >  	/*
> >  	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
> >  	 * belonging to the reserved node mask.
> > -- 
> > 2.26.2
> > 
> > 
> 

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [RESEND PATCH v5 08/11] ppc64/kexec_file: setup backup region for kdump kernel
From: Michael Ellerman @ 2020-07-28 14:11 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Eric Biederman, Thiago Jung Bauermann, Dave Young, Vivek Goyal
In-Reply-To: <159579235754.5790.5203600072984600891.stgit@hbathini>

Hari Bathini <hbathini@linux.ibm.com> writes:
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index a5c1442590b2..88408b17a7f6 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -697,6 +699,69 @@ static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
>  	return ret;
>  }
>  
> +/**
> + * load_backup_segment - Locate a memory hole to place the backup region.
> + * @image:               Kexec image.
> + * @kbuf:                Buffer contents and memory parameters.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +static int load_backup_segment(struct kimage *image, struct kexec_buf *kbuf)
> +{
> +	void *buf;
> +	int ret;
> +
> +	/* Setup a segment for backup region */
> +	buf = vzalloc(BACKUP_SRC_SIZE);

This worried me initially, because we can't copy from physically
discontiguous pages in real mode.

But as you explained this buffer is not used for copying.

I think if you move the large comment below up here, it would be
clearer.


> diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
> index 464af8e8a4cb..d4b52961f592 100644
> --- a/arch/powerpc/purgatory/trampoline_64.S
> +++ b/arch/powerpc/purgatory/trampoline_64.S
> @@ -43,14 +44,38 @@ master:
>  	mr	%r17,%r3	/* save cpu id to r17 */
>  	mr	%r15,%r4	/* save physical address in reg15 */
>  
> +	bl	0f		/* Work out where we're running */
> +0:	mflr	%r18

I know you just moved it, but this should use:

	bcl	20, 31, $+4
	mflr	%r18

Which is a special form of branch and link that doesn't unbalance the
link stack in the chip.

> +	/*
> +	 * Copy BACKUP_SRC_SIZE bytes from BACKUP_SRC_START to
> +	 * backup_start 8 bytes at a time.
> +	 *
> +	 * Use r3 = dest, r4 = src, r5 = size, r6 = count
> +	 */
> +	ld	%r3,(backup_start - 0b)(%r18)
> +	cmpdi	%cr0,%r3,0

I prefer spaces or tabs between arguments, eg:

	cmpdi	%cr0, %r3, 0

> +	beq	80f		/* skip if there is no backup region */

Local labels will make this clearer I think. eg:

	beq	.Lskip_copy

> +	lis	%r5,BACKUP_SRC_SIZE@h
> +	ori	%r5,%r5,BACKUP_SRC_SIZE@l
> +	cmpdi	%cr0,%r5,0
> +	beq	80f		/* skip if copy size is zero */
> +	lis	%r4,BACKUP_SRC_START@h
> +	ori	%r4,%r4,BACKUP_SRC_START@l
> +	li	%r6,0
> +70:

.Lcopy_loop:

> +	ldx	%r0,%r6,%r4
> +	stdx	%r0,%r6,%r3
> +	addi	%r6,%r6,8
> +	cmpld	%cr0,%r6,%r5
> +	blt	70b

	blt	.Lcopy_loop

> +

.Lskip_copy:

> +80:
>  	or	%r3,%r3,%r3	/* ok now to high priority, lets boot */
>  	lis	%r6,0x1
>  	mtctr	%r6		/* delay a bit for slaves to catch up */
>  	bdnz	.		/* before we overwrite 0-100 again */


cheers

^ permalink raw reply

* Re: [RESEND PATCH v5 07/11] ppc64/kexec_file: enable early kernel's OPAL calls
From: Michael Ellerman @ 2020-07-28 13:46 UTC (permalink / raw)
  To: Hari Bathini, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Mimi Zohar, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Sourabh Jain, lkml, linuxppc-dev,
	Eric Biederman, Thiago Jung Bauermann, Dave Young, Vivek Goyal
In-Reply-To: <159579233676.5790.10701756666641782647.stgit@hbathini>

Hari Bathini <hbathini@linux.ibm.com> writes:
> Kernel built with CONFIG_PPC_EARLY_DEBUG_OPAL enabled expects r8 & r9
> to be filled with OPAL base & entry addresses respectively. Setting
> these registers allows the kernel to perform OPAL calls before the
> device tree is parsed.

I'm not convinced we want to do this.

If we do it becomes part of the kexec ABI and we have to honour it into
the future.

And in practice there are no non-development kernels built with OPAL early
debugging enabled, so it's not clear it actually helps anyone other than
developers.

cheers

> v4 -> v5:
> * New patch. Updated opal_base & opal_entry values in r8 & r9 respectively.
>   This change was part of the below dropped patch in v4:
>     - https://lore.kernel.org/patchwork/patch/1275667/
>
>
>  arch/powerpc/kexec/file_load_64.c      |   16 ++++++++++++++++
>  arch/powerpc/purgatory/trampoline_64.S |   15 +++++++++++++++
>  2 files changed, 31 insertions(+)
>
> diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
> index 8df085a22fd7..a5c1442590b2 100644
> --- a/arch/powerpc/kexec/file_load_64.c
> +++ b/arch/powerpc/kexec/file_load_64.c
> @@ -713,6 +713,8 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
>  			  const void *fdt, unsigned long kernel_load_addr,
>  			  unsigned long fdt_load_addr)
>  {
> +	struct device_node *dn = NULL;
> +	uint64_t val;
>  	int ret;
>  
>  	ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
> @@ -735,9 +737,23 @@ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
>  			goto out;
>  	}
>  
> +	/* Setup OPAL base & entry values */
> +	dn = of_find_node_by_path("/ibm,opal");
> +	if (dn) {
> +		of_property_read_u64(dn, "opal-base-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_base", &val,
> +						     sizeof(val), false);
> +		if (ret)
> +			goto out;
> +
> +		of_property_read_u64(dn, "opal-entry-address", &val);
> +		ret = kexec_purgatory_get_set_symbol(image, "opal_entry", &val,
> +						     sizeof(val), false);
> +	}
>  out:
>  	if (ret)
>  		pr_err("Failed to setup purgatory symbols");
> +	of_node_put(dn);
>  	return ret;
>  }
>  
> diff --git a/arch/powerpc/purgatory/trampoline_64.S b/arch/powerpc/purgatory/trampoline_64.S
> index a5a83c3f53e6..464af8e8a4cb 100644
> --- a/arch/powerpc/purgatory/trampoline_64.S
> +++ b/arch/powerpc/purgatory/trampoline_64.S
> @@ -61,6 +61,10 @@ master:
>  	li	%r4,28
>  	STWX_BE	%r17,%r3,%r4	/* Store my cpu as __be32 at byte 28 */
>  1:
> +	/* Load opal base and entry values in r8 & r9 respectively */
> +	ld	%r8,(opal_base - 0b)(%r18)
> +	ld	%r9,(opal_entry - 0b)(%r18)
> +
>  	/* load the kernel address */
>  	ld	%r4,(kernel - 0b)(%r18)
>  
> @@ -102,6 +106,17 @@ dt_offset:
>  	.8byte  0x0
>  	.size dt_offset, . - dt_offset
>  
> +	.balign 8
> +	.globl opal_base
> +opal_base:
> +	.8byte  0x0
> +	.size opal_base, . - opal_base
> +
> +	.balign 8
> +	.globl opal_entry
> +opal_entry:
> +	.8byte  0x0
> +	.size opal_entry, . - opal_entry
>  
>  	.data
>  	.balign 8

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox