LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH 2/2] PCI/AER: Log correctable errors as warning, not error
From: Bjorn Helgaas @ 2020-07-09 22:06 UTC (permalink / raw)
  To: Matt Jolly
  Cc: Sam Bobroff, linux-pci, linux-kernel, Oliver O'Halloran,
	Bjorn Helgaas, linuxppc-dev
In-Reply-To: <20200708001401.405749-2-helgaas@kernel.org>

On Tue, Jul 07, 2020 at 07:14:01PM -0500, Bjorn Helgaas wrote:
> From: Matt Jolly <Kangie@footclan.ninja>
> 
> PCIe correctable errors are recovered by hardware with no need for software
> intervention (PCIe r5.0, sec 6.2.2.1).
> 
> Reduce the log level of correctable errors from KERN_ERR to KERN_WARNING.
> 
> The bug reports below are for correctable error logging.  This doesn't fix
> the cause of those reports, but it may make the messages less alarming.
> 
> [bhelgaas: commit log, use pci_printk() to avoid code duplication]
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=201517
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196183
> Link: https://lore.kernel.org/r/20200618155511.16009-1-Kangie@footclan.ninja
> Signed-off-by: Matt Jolly <Kangie@footclan.ninja>
> Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>

I applied both of these to pci/error for v5.9.

> ---
>  drivers/pci/pcie/aer.c | 25 +++++++++++++++----------
>  1 file changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 9176c8a968b9..ca886bf91fd9 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -673,20 +673,23 @@ static void __aer_print_error(struct pci_dev *dev,
>  {
>  	const char **strings;
>  	unsigned long status = info->status & ~info->mask;
> -	const char *errmsg;
> +	const char *level, *errmsg;
>  	int i;
>  
> -	if (info->severity == AER_CORRECTABLE)
> +	if (info->severity == AER_CORRECTABLE) {
>  		strings = aer_correctable_error_string;
> -	else
> +		level = KERN_WARNING;
> +	} else {
>  		strings = aer_uncorrectable_error_string;
> +		level = KERN_ERR;
> +	}
>  
>  	for_each_set_bit(i, &status, 32) {
>  		errmsg = strings[i];
>  		if (!errmsg)
>  			errmsg = "Unknown Error Bit";
>  
> -		pci_err(dev, "   [%2d] %-22s%s\n", i, errmsg,
> +		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
>  				info->first_error == i ? " (First)" : "");
>  	}
>  	pci_dev_aer_stats_incr(dev, info);
> @@ -696,6 +699,7 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  {
>  	int layer, agent;
>  	int id = ((dev->bus->number << 8) | dev->devfn);
> +	const char *level;
>  
>  	if (!info->status) {
>  		pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
> @@ -706,13 +710,14 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
>  	layer = AER_GET_LAYER_ERROR(info->severity, info->status);
>  	agent = AER_GET_AGENT(info->severity, info->status);
>  
> -	pci_err(dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> -		aer_error_severity_string[info->severity],
> -		aer_error_layer[layer], aer_agent_string[agent]);
> +	level = (info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR;
> +
> +	pci_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n",
> +		   aer_error_severity_string[info->severity],
> +		   aer_error_layer[layer], aer_agent_string[agent]);
>  
> -	pci_err(dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> -		dev->vendor, dev->device,
> -		info->status, info->mask);
> +	pci_printk(level, dev, "  device [%04x:%04x] error status/mask=%08x/%08x\n",
> +		   dev->vendor, dev->device, info->status, info->mask);
>  
>  	__aer_print_error(dev, info);
>  
> -- 
> 2.25.1
> 

^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Paul Menzel @ 2020-07-09 21:08 UTC (permalink / raw)
  To: Catalin Marinas; +Cc: linuxppc-dev
In-Reply-To: <20200709175705.GD6579@gaia>

Dear Catalin,


Am 09.07.20 um 19:57 schrieb Catalin Marinas:
> On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
>> Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
>> file does not contain more information.
>>
>>> $ dmesg
>>> […] > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering kernel.perf_event_max_sample_rate to 77750
>>> [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
>>> [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
>>> [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
>>> [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
>>> [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
>>> [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
>>> [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
>>> [61608.697728] Can't find PMC that caused IRQ
>>> [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
>>> [92849.380672] FS-Cache: Loaded
>>> [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
>>> [92849.595974] NFS: Registering the id_resolver key type
>>> [92849.596000] Key type id_resolver registered
>>> [92849.596000] Key type id_legacy registered
>>> [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [106904.323471] Can't find PMC that caused IRQ
>>> [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
>>> $ sudo cat /sys/kernel/debug/kmemleak
> 
> When they are no longer present, they are most likely false positives.

How can this be? Shouldn’t the false positive also be logged in 
`/sys/kernel/debug/kmemleak`?

> Was this triggered during boot? Or under some workload?

 From the timestamps it looks like under some load.


Kind regards,

Paul

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 20:57 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709204609.GQ3598@gate.crashing.org>

----- On Jul 9, 2020, at 4:46 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> On Thu, Jul 09, 2020 at 01:56:19PM -0400, Mathieu Desnoyers wrote:
>> > Just to make sure I understand your recommendation. So rather than
>> > hard coding r17 as the temporary registers, we could explicitly
>> > declare the temporary register as a C variable, pass it as an
>> > input operand to the inline asm, and then refer to it by operand
>> > name in the macros using it. This way the compiler would be free
>> > to perform its own register allocation.
>> > 
>> > If that is what you have in mind, then yes, I think it makes a
>> > lot of sense.
>> 
>> Except that asm goto have this limitation with gcc: those cannot
>> have any output operand, only inputs, clobbers and target labels.
>> We cannot modify a temporary register received as input operand. So I don't
>> see how to get a temporary register allocated by the compiler considering
>> this limitation.
> 
> Heh, yet another reason not to obfuscate your inline asm: it didn't
> register this is asm goto.
> 
> A clobber is one way, yes (those *are* allowed in asm goto).  Another
> way is to not actually change that register: move the original value
> back into there at the end of the asm!  (That isn't always easy to do,
> it depends on your code).  So something like
> 
>	long start = ...;
>	long tmp = start;
>	asm("stuff that modifies %0; ...; mr %0,%1" : : "r"(tmp), "r"(start));
> 
> is just fine: %0 isn't actually modified at all, as far as GCC is
> concerned, and this isn't lying to it!

It appears to be at the cost of adding one extra instruction on the fast-path
to restore the register to its original value. I'll leave Boqun whom authored
the original rseq-ppc code to figure out what works best performance-wise
(when he finds time).

Thanks for the pointers!

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 20:46 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1682947575.7422.1594317379612.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 01:56:19PM -0400, Mathieu Desnoyers wrote:
> > Just to make sure I understand your recommendation. So rather than
> > hard coding r17 as the temporary registers, we could explicitly
> > declare the temporary register as a C variable, pass it as an
> > input operand to the inline asm, and then refer to it by operand
> > name in the macros using it. This way the compiler would be free
> > to perform its own register allocation.
> > 
> > If that is what you have in mind, then yes, I think it makes a
> > lot of sense.
> 
> Except that asm goto have this limitation with gcc: those cannot
> have any output operand, only inputs, clobbers and target labels.
> We cannot modify a temporary register received as input operand. So I don't
> see how to get a temporary register allocated by the compiler considering
> this limitation.

Heh, yet another reason not to obfuscate your inline asm: it didn't
register this is asm goto.

A clobber is one way, yes (those *are* allowed in asm goto).  Another
way is to not actually change that register: move the original value
back into there at the end of the asm!  (That isn't always easy to do,
it depends on your code).  So something like

	long start = ...;
	long tmp = start;
	asm("stuff that modifies %0; ...; mr %0,%1" : : "r"(tmp), "r"(start));

is just fine: %0 isn't actually modified at all, as far as GCC is
concerned, and this isn't lying to it!


Segher

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 20:31 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1584179170.7410.1594316576293.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 01:42:56PM -0400, Mathieu Desnoyers wrote:
> > That works fine then, for a testcase.  Using r17 is not a great idea for
> > performance (it increases the active register footprint, and causes more
> > registers to be saved in the prologue of the functions, esp. on older
> > compilers), and it is easier to just let the compiler choose a good
> > register to use.  But maybe you want to see r17 in the generated
> > testcases, as eyecatcher or something, dunno :-)
> 
> Just to make sure I understand your recommendation. So rather than
> hard coding r17 as the temporary registers, we could explicitly
> declare the temporary register as a C variable, pass it as an
> input operand to the inline asm, and then refer to it by operand
> name in the macros using it. This way the compiler would be free
> to perform its own register allocation.
> 
> If that is what you have in mind, then yes, I think it makes a
> lot of sense.

You write to it as well, so an inout register ("+r" or such).  And yes,
you use a local var for it (like "long tmp;").  And then you can refer
to it like anything else in your asm, like "%3" or like
"%[a_long_name]"; and the compiler sees it as any other register,
exactly.


Segher

^ permalink raw reply

* Re: [PATCH 11/20] Documentation: leds/ledtrig-transient: eliminate duplicated word
From: Jacek Anaszewski @ 2020-07-09 20:01 UTC (permalink / raw)
  To: Randy Dunlap, linux-kernel
  Cc: kvm, linux-doc, David Airlie, kgdb-bugreport, linux-fpga,
	Liviu Dudau, dri-devel, linux-mips, Paul Cercueil, keyrings,
	Paul Mackerras, linux-i2c, Pavel Machek, Srinivas Pandruvada,
	Mihail Atanassov, linux-leds, linux-s390, Daniel Thompson,
	linux-scsi, Jonathan Corbet, Masahiro Yamada, Matthew Wilcox,
	Halil Pasic, Jarkko Sakkinen, James Wang, linux-input,
	Mali DP Maintainers, Derek Kiernan, Dragan Cvetic, Wu Hao,
	Tony Krowiak, linux-kbuild, James E.J. Bottomley, Jiri Kosina,
	Hannes Reinecke, linux-block, Thomas Bogendoerfer, Dan Murphy,
	linux-mm, Dan Williams, Andrew Morton, Mimi Zohar, Jens Axboe,
	Michal Marek, Martin K. Petersen, Pierre Morel, Douglas Anderson,
	Wolfram Sang, Daniel Vetter, Jason Wessel, Paolo Bonzini,
	linux-integrity, linuxppc-dev, Mike Rapoport
In-Reply-To: <20200707180414.10467-12-rdunlap@infradead.org>

On 7/7/20 8:04 PM, Randy Dunlap wrote:
> Drop the doubled word "for".
> 
> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: linux-doc@vger.kernel.org
> Cc: Jacek Anaszewski <jacek.anaszewski@gmail.com>
> Cc: Pavel Machek <pavel@ucw.cz>
> Cc: Dan Murphy <dmurphy@ti.com>
> Cc: linux-leds@vger.kernel.org
> ---
>   Documentation/leds/ledtrig-transient.rst |    2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- linux-next-20200701.orig/Documentation/leds/ledtrig-transient.rst
> +++ linux-next-20200701/Documentation/leds/ledtrig-transient.rst
> @@ -157,7 +157,7 @@ repeat the following step as needed::
>   	echo 1 > activate - start timer = duration to run once
>   	echo none > trigger
>   
> -This trigger is intended to be used for for the following example use cases:
> +This trigger is intended to be used for the following example use cases:
>   
>    - Control of vibrate (phones, tablets etc.) hardware by user space app.
>    - Use of LED by user space app as activity indicator.
> 

Acked-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>

-- 
Best regards,
Jacek Anaszewski

^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Catalin Marinas @ 2020-07-09 17:57 UTC (permalink / raw)
  To: Paul Menzel; +Cc: linuxppc-dev
In-Reply-To: <070dd6b7-1ee6-8090-8973-1eb0240f6948@molgen.mpg.de>

On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
> Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
> file does not contain more information.
> 
> > $ dmesg
> > […] > [48662.953323] perf: interrupt took too long (2570 > 2500),
> > lowering
> kernel.perf_event_max_sample_rate to 77750
> > [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
> > [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
> > [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
> > [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
> > [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
> > [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
> > [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
> > [61608.697728] Can't find PMC that caused IRQ
> > [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
> > [92849.380672] FS-Cache: Loaded
> > [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
> > [92849.595974] NFS: Registering the id_resolver key type
> > [92849.596000] Key type id_resolver registered
> > [92849.596000] Key type id_legacy registered
> > [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [106904.323471] Can't find PMC that caused IRQ
> > [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > $ sudo cat /sys/kernel/debug/kmemleak

When they are no longer present, they are most likely false positives.
Was this triggered during boot? Or under some workload?

-- 
Catalin

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 17:56 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1584179170.7410.1594316576293.JavaMail.zimbra@efficios.com>

----- On Jul 9, 2020, at 1:42 PM, Mathieu Desnoyers mathieu.desnoyers@efficios.com wrote:

> ----- On Jul 9, 2020, at 1:37 PM, Segher Boessenkool segher@kernel.crashing.org
> wrote:
> 
>> On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
>>> > What protects r17 *after* this asm statement?
>>> 
>>> As discussed in the other leg of the thread (with the code example),
>>> r17 is in the clobber list of all asm statements using this macro, and
>>> is used as a temporary register within each inline asm.
>> 
>> That works fine then, for a testcase.  Using r17 is not a great idea for
>> performance (it increases the active register footprint, and causes more
>> registers to be saved in the prologue of the functions, esp. on older
>> compilers), and it is easier to just let the compiler choose a good
>> register to use.  But maybe you want to see r17 in the generated
>> testcases, as eyecatcher or something, dunno :-)
> 
> Just to make sure I understand your recommendation. So rather than
> hard coding r17 as the temporary registers, we could explicitly
> declare the temporary register as a C variable, pass it as an
> input operand to the inline asm, and then refer to it by operand
> name in the macros using it. This way the compiler would be free
> to perform its own register allocation.
> 
> If that is what you have in mind, then yes, I think it makes a
> lot of sense.

Except that asm goto have this limitation with gcc: those cannot
have any output operand, only inputs, clobbers and target labels.
We cannot modify a temporary register received as input operand. So I don't
see how to get a temporary register allocated by the compiler considering
this limitation.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [Bug 208197] OF: /pci@f2000000/mac-io@17/gpio@50/...: could not find phandle
From: bugzilla-daemon @ 2020-07-09 17:50 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208197-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208197

--- Comment #9 from Erhard F. (erhard_f@mailbox.org) ---
(In reply to Michael Ellerman from comment #7)
> I couldn't really make sense of your bisect log, it doesn't have any
> good/bad commits in it.
> 
> Can you attach the output of "git bisect log".
Yea sorry, my fault... I thought e.g. with "git bisect bad | tee -a
~/bisect.log" bisect.log generated via tee would have the same output as with
"git bisect log" but I was wrong. 

Please find the correct one attached now. I had to restart the bisect as I
already resetted the original one.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* [Bug 208197] OF: /pci@f2000000/mac-io@17/gpio@50/...: could not find phandle
From: bugzilla-daemon @ 2020-07-09 17:44 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208197-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208197

Erhard F. (erhard_f@mailbox.org) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
 Attachment #290097|0                           |1
        is obsolete|                            |

--- Comment #8 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 290191
  --> https://bugzilla.kernel.org/attachment.cgi?id=290191&action=edit
bisect.log

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 17:42 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709173712.GL3598@gate.crashing.org>

----- On Jul 9, 2020, at 1:37 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
>> > What protects r17 *after* this asm statement?
>> 
>> As discussed in the other leg of the thread (with the code example),
>> r17 is in the clobber list of all asm statements using this macro, and
>> is used as a temporary register within each inline asm.
> 
> That works fine then, for a testcase.  Using r17 is not a great idea for
> performance (it increases the active register footprint, and causes more
> registers to be saved in the prologue of the functions, esp. on older
> compilers), and it is easier to just let the compiler choose a good
> register to use.  But maybe you want to see r17 in the generated
> testcases, as eyecatcher or something, dunno :-)

Just to make sure I understand your recommendation. So rather than
hard coding r17 as the temporary registers, we could explicitly
declare the temporary register as a C variable, pass it as an
input operand to the inline asm, and then refer to it by operand
name in the macros using it. This way the compiler would be free
to perform its own register allocation.

If that is what you have in mind, then yes, I think it makes a
lot of sense.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 17:37 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <1769596686.6365.1594302227962.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 09:43:47AM -0400, Mathieu Desnoyers wrote:
> > What protects r17 *after* this asm statement?
> 
> As discussed in the other leg of the thread (with the code example),
> r17 is in the clobber list of all asm statements using this macro, and
> is used as a temporary register within each inline asm.

That works fine then, for a testcase.  Using r17 is not a great idea for
performance (it increases the active register footprint, and causes more
registers to be saved in the prologue of the functions, esp. on older
compilers), and it is easier to just let the compiler choose a good
register to use.  But maybe you want to see r17 in the generated
testcases, as eyecatcher or something, dunno :-)

Segher

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Segher Boessenkool @ 2020-07-09 17:31 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <429958629.6348.1594301598584.JavaMail.zimbra@efficios.com>

On Thu, Jul 09, 2020 at 09:33:18AM -0400, Mathieu Desnoyers wrote:
> > The way this all uses r17 will likely not work reliably.
> 
> r17 is only used as a temporary register within the inline assembler, and it is
> in the clobber list. In which scenario would it not work reliably ?

This isn't clear at all, that is the problem.

> > The way multiple asm statements are used seems to have missing
> > dependencies between the statements.
> 
> I'm not sure I follow here. Note that we are injecting the CPP macros into
> a single inline asm statement as strings.

Yeah...  more trickiness.

> > And done macro-mess this, you want to be able to debug it, and you need
> > other people to be able to read it!
> 
> I understand that looking at macros can be cumbersome from the perspective
> of a reviewer only interested in a single architecture,

No, from the perspective of *any* reviewer.

> However, from my perspective, as a maintainer who must maintain similar code
> for x86 32/64, powerpc 32/64, arm, aarch64, s390, s390x, mips 32/64, and likely
> other architectures in the future, the macros abstracting 32-bit and 64-bit
> allow to eliminate code duplication for each architecture with 32-bit and 64-bit
> variants, which is better for maintainability.

IMNSHO it is MUCH better to just have simple separate implementations
for each.  They differ in *all* details.

Or have static inline functions, with proper dependencies, instead of
nasty text macros.

But it's your code, do what you want :-)


Segher

^ permalink raw reply

* [PATCH v2] powerpc/pseries: Avoid using addr_to_pfn in realmode
From: Ganesh Goudar @ 2020-07-09 16:39 UTC (permalink / raw)
  To: mpe, linuxppc-dev; +Cc: mahesh, Ganesh Goudar, npiggin, aneesh.kumar

When an UE or memory error exception is encountered the MCE handler
tries to find the pfn using addr_to_pfn() which takes effective
address as an argument, later pfn is used to poison the page where
memory error occurred, recent rework in this area made addr_to_pfn
to run in realmode, which can be fatal as it may try to access
memory outside RMO region.

To fix this use addr_to_pfn after switching to virtual mode.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
V2: Leave bare metal code and save_mce_event as is.
---
 arch/powerpc/platforms/pseries/ras.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f3736fcd98fc..def875815e92 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -610,16 +610,8 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 		if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED)
 			eaddr = be64_to_cpu(mce_log->effective_address);
 
-		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED) {
+		if (mce_log->sub_err_type & UE_LOGICAL_ADDR_PROVIDED)
 			paddr = be64_to_cpu(mce_log->logical_address);
-		} else if (mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
-			unsigned long pfn;
-
-			pfn = addr_to_pfn(regs, eaddr);
-			if (pfn != ULONG_MAX)
-				paddr = pfn << PAGE_SHIFT;
-		}
-
 		break;
 	case MC_ERROR_TYPE_SLB:
 		mce_err.error_type = MCE_ERROR_TYPE_SLB;
@@ -725,6 +717,16 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 	 *       SLB multihit is done by now.
 	 */
 	mtmsr(mfmsr() | MSR_IR | MSR_DR);
+
+	/* Use addr_to_pfn after switching to virtual mode */
+	if (!paddr && error_type == MC_ERROR_TYPE_UE &&
+	    mce_log->sub_err_type & UE_EFFECTIVE_ADDR_PROVIDED) {
+		unsigned long pfn;
+
+		pfn = addr_to_pfn(regs, eaddr);
+		if (pfn != ULONG_MAX)
+			paddr = pfn << PAGE_SHIFT;
+	}
 	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
 			&mce_err, regs->nip, eaddr, paddr);
 
-- 
2.17.2


^ permalink raw reply related

* Re: [PATCH v3 5/6] powerpc/pseries: implement paravirt qspinlocks for SPLPAR
From: Waiman Long @ 2020-07-09 16:06 UTC (permalink / raw)
  To: Michael Ellerman, Nicholas Piggin, linuxppc-dev
  Cc: linux-arch, Peter Zijlstra, Boqun Feng, linux-kernel, kvm-ppc,
	virtualization, Ingo Molnar, Will Deacon
In-Reply-To: <874kqhvu1v.fsf@mpe.ellerman.id.au>

On 7/9/20 6:53 AM, Michael Ellerman wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>   arch/powerpc/include/asm/paravirt.h           | 28 ++++++++
>>   arch/powerpc/include/asm/qspinlock.h          | 66 +++++++++++++++++++
>>   arch/powerpc/include/asm/qspinlock_paravirt.h |  7 ++
>>   arch/powerpc/platforms/pseries/Kconfig        |  5 ++
>>   arch/powerpc/platforms/pseries/setup.c        |  6 +-
>>   include/asm-generic/qspinlock.h               |  2 +
> Another ack?
>
I am OK with adding the #ifdef around queued_spin_lock().

Acked-by: Waiman Long <longman@redhat.com>

>> diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
>> index 7a8546660a63..f2d51f929cf5 100644
>> --- a/arch/powerpc/include/asm/paravirt.h
>> +++ b/arch/powerpc/include/asm/paravirt.h
>> @@ -45,6 +55,19 @@ static inline void yield_to_preempted(int cpu, u32 yield_count)
>>   {
>>   	___bad_yield_to_preempted(); /* This would be a bug */
>>   }
>> +
>> +extern void ___bad_yield_to_any(void);
>> +static inline void yield_to_any(void)
>> +{
>> +	___bad_yield_to_any(); /* This would be a bug */
>> +}
> Why do we do that rather than just not defining yield_to_any() at all
> and letting the build fail on that?
>
> There's a condition somewhere that we know will false at compile time
> and drop the call before linking?
>
>> diff --git a/arch/powerpc/include/asm/qspinlock_paravirt.h b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> new file mode 100644
>> index 000000000000..750d1b5e0202
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/qspinlock_paravirt.h
>> @@ -0,0 +1,7 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +#ifndef __ASM_QSPINLOCK_PARAVIRT_H
>> +#define __ASM_QSPINLOCK_PARAVIRT_H
> _ASM_POWERPC_QSPINLOCK_PARAVIRT_H please.
>
>> +
>> +EXPORT_SYMBOL(__pv_queued_spin_unlock);
> Why's that in a header? Should that (eventually) go with the generic implementation?
The PV qspinlock implementation is not that generic at the moment. Even 
though native qspinlock is used by a number of archs, PV qspinlock is 
only currently used in x86. This is certainly an area that needs 
improvement.
>> diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
>> index 24c18362e5ea..756e727b383f 100644
>> --- a/arch/powerpc/platforms/pseries/Kconfig
>> +++ b/arch/powerpc/platforms/pseries/Kconfig
>> @@ -25,9 +25,14 @@ config PPC_PSERIES
>>   	select SWIOTLB
>>   	default y
>>   
>> +config PARAVIRT_SPINLOCKS
>> +	bool
>> +	default n
> default n is the default.
>
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index 2db8469e475f..747a203d9453 100644
>> --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -771,8 +771,12 @@ static void __init pSeries_setup_arch(void)
>>   	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>   		vpa_init(boot_cpuid);
>>   
>> -		if (lppaca_shared_proc(get_lppaca()))
>> +		if (lppaca_shared_proc(get_lppaca())) {
>>   			static_branch_enable(&shared_processor);
>> +#ifdef CONFIG_PARAVIRT_SPINLOCKS
>> +			pv_spinlocks_init();
>> +#endif
>> +		}
> We could avoid the ifdef with this I think?
>
> diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
> index 434615f1d761..6ec72282888d 100644
> --- a/arch/powerpc/include/asm/spinlock.h
> +++ b/arch/powerpc/include/asm/spinlock.h
> @@ -10,5 +10,9 @@
>   #include <asm/simple_spinlock.h>
>   #endif
>
> +#ifndef CONFIG_PARAVIRT_SPINLOCKS
> +static inline void pv_spinlocks_init(void) { }
> +#endif
> +
>   #endif /* __KERNEL__ */
>   #endif /* __ASM_SPINLOCK_H */
>
>
> cheers
>
We don't really need to do a pv_spinlocks_init() if pv_kick() isn't 
supported.

Cheers,
Longman


^ permalink raw reply

* Re: [PATCH 10/20] Documentation: kbuild/kconfig-language: eliminate duplicated word
From: Masahiro Yamada @ 2020-07-09 15:31 UTC (permalink / raw)
  To: Randy Dunlap
  Cc: kvm, open list:DOCUMENTATION, David Airlie, kgdb-bugreport,
	linux-fpga, Liviu Dudau, dri-devel, Douglas Anderson,
	Paul Cercueil, keyrings, Paul Mackerras, linux-i2c, Pavel Machek,
	Srinivas Pandruvada, Mihail Atanassov, linux-leds, linux-s390,
	Daniel Thompson, linux-scsi, Jonathan Corbet, Matthew Wilcox,
	Halil Pasic, Jarkko Sakkinen, James Wang, linux-input,
	Mali DP Maintainers, Derek Kiernan, linux-mips, Dragan Cvetic,
	Wu Hao, Tony Krowiak, Linux Kbuild mailing list,
	James E.J. Bottomley, Jiri Kosina, Hannes Reinecke, linux-block,
	Thomas Bogendoerfer, Jacek Anaszewski, linux-mm, Dan Williams,
	Andrew Morton, Mimi Zohar, Jens Axboe, Michal Marek,
	Martin K. Petersen, Pierre Morel, Linux Kernel Mailing List,
	Wolfram Sang, Daniel Vetter, Jason Wessel, Paolo Bonzini,
	linux-integrity, linuxppc-dev, Mike Rapoport, Dan Murphy
In-Reply-To: <20200707180414.10467-11-rdunlap@infradead.org>

On Wed, Jul 8, 2020 at 3:06 AM Randy Dunlap <rdunlap@infradead.org> wrote:
>
> Drop the doubled word "the".
>
> Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: linux-doc@vger.kernel.org
> Cc: Masahiro Yamada <masahiroy@kernel.org>



I guess this series will go in via the doc sub-system.

If so, please feel free to add:

Acked-by: Masahiro Yamada <masahiroy@kernel.org>






> Cc: Michal Marek <michal.lkml@markovi.net>
> Cc: linux-kbuild@vger.kernel.org
> ---
>  Documentation/kbuild/kconfig-language.rst |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> --- linux-next-20200701.orig/Documentation/kbuild/kconfig-language.rst
> +++ linux-next-20200701/Documentation/kbuild/kconfig-language.rst
> @@ -681,7 +681,7 @@ translate Kconfig logic into boolean for
>  find dead code / features (always inactive), 114 dead features were found in
>  Linux using this methodology [1]_ (Section 8: Threats to validity).
>
> -Confirming this could prove useful as Kconfig stands as one of the the leading
> +Confirming this could prove useful as Kconfig stands as one of the leading
>  industrial variability modeling languages [1]_ [2]_. Its study would help
>  evaluate practical uses of such languages, their use was only theoretical
>  and real world requirements were not well understood. As it stands though



--
Best Regards
Masahiro Yamada

^ permalink raw reply

* /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Paul Menzel @ 2020-07-09 14:37 UTC (permalink / raw)
  To: Catalin Marinas, Michael Ellerman; +Cc: linuxppc-dev

Dear Linux folks,


Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, 
the file does not contain more information.

> $ dmesg
> […] > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering 
kernel.perf_event_max_sample_rate to 77750
> [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
> [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
> [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
> [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
> [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
> [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
> [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
> [61608.697728] Can't find PMC that caused IRQ
> [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
> [92849.380672] FS-Cache: Loaded
> [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
> [92849.595974] NFS: Registering the id_resolver key type
> [92849.596000] Key type id_resolver registered
> [92849.596000] Key type id_legacy registered
> [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> [106904.323471] Can't find PMC that caused IRQ
> [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> $ sudo cat /sys/kernel/debug/kmemleak
> $


Kind regards,

Paul

^ permalink raw reply

* Re: [PATCH v2 2/2] papr/scm: Add bad memory ranges to nvdimm bad ranges
From: Christophe Leroy @ 2020-07-09 13:55 UTC (permalink / raw)
  To: Santosh Sivaraj, linuxppc-dev
  Cc: Ganesh Goudar, Vaibhav Jain, Oliver, Mahesh Salgaonkar,
	Aneesh Kumar K.V
In-Reply-To: <20200709135142.721504-2-santosh@fossix.org>



Le 09/07/2020 à 15:51, Santosh Sivaraj a écrit :
> Subscribe to the MCE notification and add the physical address which
> generated a memory error to nvdimm bad range.
> 
> Reviewed-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

> ---
>   arch/powerpc/platforms/pseries/papr_scm.c | 96 ++++++++++++++++++++++-
>   1 file changed, 95 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index 9c569078a09fd..90729029ca010 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -13,9 +13,11 @@
>   #include <linux/platform_device.h>
>   #include <linux/delay.h>
>   #include <linux/seq_buf.h>
> +#include <linux/nd.h>
>   
>   #include <asm/plpar_wrappers.h>
>   #include <asm/papr_pdsm.h>
> +#include <asm/mce.h>
>   
>   #define BIND_ANY_ADDR (~0ul)
>   
> @@ -80,6 +82,7 @@ struct papr_scm_priv {
>   	struct resource res;
>   	struct nd_region *region;
>   	struct nd_interleave_set nd_set;
> +	struct list_head region_list;
>   
>   	/* Protect dimm health data from concurrent read/writes */
>   	struct mutex health_mutex;
> @@ -91,6 +94,9 @@ struct papr_scm_priv {
>   	u64 health_bitmap;
>   };
>   
> +LIST_HEAD(papr_nd_regions);
> +DEFINE_MUTEX(papr_ndr_lock);
> +
>   static int drc_pmem_bind(struct papr_scm_priv *p)
>   {
>   	unsigned long ret[PLPAR_HCALL_BUFSIZE];
> @@ -759,6 +765,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>   		dev_info(dev, "Region registered with target node %d and online node %d",
>   			 target_nid, online_nid);
>   
> +	mutex_lock(&papr_ndr_lock);
> +	list_add_tail(&p->region_list, &papr_nd_regions);
> +	mutex_unlock(&papr_ndr_lock);
> +
>   	return 0;
>   
>   err:	nvdimm_bus_unregister(p->bus);
> @@ -766,6 +776,68 @@ err:	nvdimm_bus_unregister(p->bus);
>   	return -ENXIO;
>   }
>   
> +static void papr_scm_add_badblock(struct nd_region *region,
> +				  struct nvdimm_bus *bus, u64 phys_addr)
> +{
> +	u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
> +
> +	if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
> +		pr_err("Bad block registration for 0x%llx failed\n", phys_addr);
> +		return;
> +	}
> +
> +	pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
> +		 aligned_addr, aligned_addr + L1_CACHE_BYTES);
> +
> +	nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
> +}
> +
> +static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
> +			 void *data)
> +{
> +	struct machine_check_event *evt = data;
> +	struct papr_scm_priv *p;
> +	u64 phys_addr;
> +	bool found = false;
> +
> +	if (evt->error_type != MCE_ERROR_TYPE_UE)
> +		return NOTIFY_DONE;
> +
> +	if (list_empty(&papr_nd_regions))
> +		return NOTIFY_DONE;
> +
> +	/*
> +	 * The physical address obtained here is PAGE_SIZE aligned, so get the
> +	 * exact address from the effective address
> +	 */
> +	phys_addr = evt->u.ue_error.physical_address +
> +			(evt->u.ue_error.effective_address & ~PAGE_MASK);
> +
> +	if (!evt->u.ue_error.physical_address_provided ||
> +	    !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
> +		return NOTIFY_DONE;
> +
> +	/* mce notifier is called from a process context, so mutex is safe */
> +	mutex_lock(&papr_ndr_lock);
> +	list_for_each_entry(p, &papr_nd_regions, region_list) {
> +		if (phys_addr >= p->res.start && phys_addr <= p->res.end) {
> +			found = true;
> +			break;
> +		}
> +	}
> +
> +	if (found)
> +		papr_scm_add_badblock(p->region, p->bus, phys_addr);
> +
> +	mutex_unlock(&papr_ndr_lock);
> +
> +	return found ? NOTIFY_OK : NOTIFY_DONE;
> +}
> +
> +static struct notifier_block mce_ue_nb = {
> +	.notifier_call = handle_mce_ue
> +};
> +
>   static int papr_scm_probe(struct platform_device *pdev)
>   {
>   	struct device_node *dn = pdev->dev.of_node;
> @@ -866,6 +938,10 @@ static int papr_scm_remove(struct platform_device *pdev)
>   {
>   	struct papr_scm_priv *p = platform_get_drvdata(pdev);
>   
> +	mutex_lock(&papr_ndr_lock);
> +	list_del(&p->region_list);
> +	mutex_unlock(&papr_ndr_lock);
> +
>   	nvdimm_bus_unregister(p->bus);
>   	drc_pmem_unbind(p);
>   	kfree(p->bus_desc.provider_name);
> @@ -888,7 +964,25 @@ static struct platform_driver papr_scm_driver = {
>   	},
>   };
>   
> -module_platform_driver(papr_scm_driver);
> +static int __init papr_scm_init(void)
> +{
> +	int ret;
> +
> +	ret = platform_driver_register(&papr_scm_driver);
> +	if (!ret)
> +		mce_register_notifier(&mce_ue_nb);
> +
> +	return ret;
> +}
> +module_init(papr_scm_init);
> +
> +static void __exit papr_scm_exit(void)
> +{
> +	mce_unregister_notifier(&mce_ue_nb);
> +	platform_driver_unregister(&papr_scm_driver);
> +}
> +module_exit(papr_scm_exit);
> +
>   MODULE_DEVICE_TABLE(of, papr_scm_match);
>   MODULE_LICENSE("GPL");
>   MODULE_AUTHOR("IBM Corporation");
> 

^ permalink raw reply

* Re: [PATCH v2 1/2] powerpc/mce: Add MCE notification chain
From: Christophe Leroy @ 2020-07-09 13:54 UTC (permalink / raw)
  To: Santosh Sivaraj, linuxppc-dev
  Cc: Ganesh Goudar, Vaibhav Jain, Oliver, Mahesh Salgaonkar,
	Aneesh Kumar K.V
In-Reply-To: <20200709135142.721504-1-santosh@fossix.org>



Le 09/07/2020 à 15:51, Santosh Sivaraj a écrit :
> Introduce notification chain which lets us know about uncorrected memory
> errors(UE). This would help prospective users in pmem or nvdimm subsystem
> to track bad blocks for better handling of persistent memory allocations.
> 
> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>

Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>

> ---
>   arch/powerpc/include/asm/mce.h |  2 ++
>   arch/powerpc/kernel/mce.c      | 15 +++++++++++++++
>   2 files changed, 17 insertions(+)
> 
> v2: Address comments from Christophe.
> 
> RESEND: Sending the two patches together so the dependencies are clear. The
> earlier patch reviews are here [1]; rebase the patches on top on 5.8-rc4
> 
> [1]: https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganeshgr@linux.ibm.com/
> 
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 376a395daf329..7bdd0cd4f2de0 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -220,6 +220,8 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
>   unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>   extern void mce_common_process_ue(struct pt_regs *regs,
>   				  struct mce_error_info *mce_err);
> +int mce_register_notifier(struct notifier_block *nb);
> +int mce_unregister_notifier(struct notifier_block *nb);
>   #ifdef CONFIG_PPC_BOOK3S_64
>   void flush_and_reload_slb(void);
>   #endif /* CONFIG_PPC_BOOK3S_64 */
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index fd90c0eda2290..b7b3ed4e61937 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -49,6 +49,20 @@ static struct irq_work mce_ue_event_irq_work = {
>   
>   DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
>   
> +static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
> +
> +int mce_register_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_register(&mce_notifier_list, nb);
> +}
> +EXPORT_SYMBOL_GPL(mce_register_notifier);
> +
> +int mce_unregister_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
> +}
> +EXPORT_SYMBOL_GPL(mce_unregister_notifier);
> +
>   static void mce_set_error_info(struct machine_check_event *mce,
>   			       struct mce_error_info *mce_err)
>   {
> @@ -278,6 +292,7 @@ static void machine_process_ue_event(struct work_struct *work)
>   	while (__this_cpu_read(mce_ue_count) > 0) {
>   		index = __this_cpu_read(mce_ue_count) - 1;
>   		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
> +		blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
>   #ifdef CONFIG_MEMORY_FAILURE
>   		/*
>   		 * This should probably queued elsewhere, but
> 

^ permalink raw reply

* [PATCH v2 2/2] papr/scm: Add bad memory ranges to nvdimm bad ranges
From: Santosh Sivaraj @ 2020-07-09 13:51 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Oliver, Aneesh Kumar K.V, Santosh Sivaraj, Mahesh Salgaonkar,
	Ganesh Goudar, Vaibhav Jain
In-Reply-To: <20200709135142.721504-1-santosh@fossix.org>

Subscribe to the MCE notification and add the physical address which
generated a memory error to nvdimm bad range.

Reviewed-by: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
---
 arch/powerpc/platforms/pseries/papr_scm.c | 96 ++++++++++++++++++++++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 9c569078a09fd..90729029ca010 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -13,9 +13,11 @@
 #include <linux/platform_device.h>
 #include <linux/delay.h>
 #include <linux/seq_buf.h>
+#include <linux/nd.h>
 
 #include <asm/plpar_wrappers.h>
 #include <asm/papr_pdsm.h>
+#include <asm/mce.h>
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -80,6 +82,7 @@ struct papr_scm_priv {
 	struct resource res;
 	struct nd_region *region;
 	struct nd_interleave_set nd_set;
+	struct list_head region_list;
 
 	/* Protect dimm health data from concurrent read/writes */
 	struct mutex health_mutex;
@@ -91,6 +94,9 @@ struct papr_scm_priv {
 	u64 health_bitmap;
 };
 
+LIST_HEAD(papr_nd_regions);
+DEFINE_MUTEX(papr_ndr_lock);
+
 static int drc_pmem_bind(struct papr_scm_priv *p)
 {
 	unsigned long ret[PLPAR_HCALL_BUFSIZE];
@@ -759,6 +765,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 		dev_info(dev, "Region registered with target node %d and online node %d",
 			 target_nid, online_nid);
 
+	mutex_lock(&papr_ndr_lock);
+	list_add_tail(&p->region_list, &papr_nd_regions);
+	mutex_unlock(&papr_ndr_lock);
+
 	return 0;
 
 err:	nvdimm_bus_unregister(p->bus);
@@ -766,6 +776,68 @@ err:	nvdimm_bus_unregister(p->bus);
 	return -ENXIO;
 }
 
+static void papr_scm_add_badblock(struct nd_region *region,
+				  struct nvdimm_bus *bus, u64 phys_addr)
+{
+	u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
+
+	if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
+		pr_err("Bad block registration for 0x%llx failed\n", phys_addr);
+		return;
+	}
+
+	pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
+		 aligned_addr, aligned_addr + L1_CACHE_BYTES);
+
+	nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
+}
+
+static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
+			 void *data)
+{
+	struct machine_check_event *evt = data;
+	struct papr_scm_priv *p;
+	u64 phys_addr;
+	bool found = false;
+
+	if (evt->error_type != MCE_ERROR_TYPE_UE)
+		return NOTIFY_DONE;
+
+	if (list_empty(&papr_nd_regions))
+		return NOTIFY_DONE;
+
+	/*
+	 * The physical address obtained here is PAGE_SIZE aligned, so get the
+	 * exact address from the effective address
+	 */
+	phys_addr = evt->u.ue_error.physical_address +
+			(evt->u.ue_error.effective_address & ~PAGE_MASK);
+
+	if (!evt->u.ue_error.physical_address_provided ||
+	    !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
+		return NOTIFY_DONE;
+
+	/* mce notifier is called from a process context, so mutex is safe */
+	mutex_lock(&papr_ndr_lock);
+	list_for_each_entry(p, &papr_nd_regions, region_list) {
+		if (phys_addr >= p->res.start && phys_addr <= p->res.end) {
+			found = true;
+			break;
+		}
+	}
+
+	if (found)
+		papr_scm_add_badblock(p->region, p->bus, phys_addr);
+
+	mutex_unlock(&papr_ndr_lock);
+
+	return found ? NOTIFY_OK : NOTIFY_DONE;
+}
+
+static struct notifier_block mce_ue_nb = {
+	.notifier_call = handle_mce_ue
+};
+
 static int papr_scm_probe(struct platform_device *pdev)
 {
 	struct device_node *dn = pdev->dev.of_node;
@@ -866,6 +938,10 @@ static int papr_scm_remove(struct platform_device *pdev)
 {
 	struct papr_scm_priv *p = platform_get_drvdata(pdev);
 
+	mutex_lock(&papr_ndr_lock);
+	list_del(&p->region_list);
+	mutex_unlock(&papr_ndr_lock);
+
 	nvdimm_bus_unregister(p->bus);
 	drc_pmem_unbind(p);
 	kfree(p->bus_desc.provider_name);
@@ -888,7 +964,25 @@ static struct platform_driver papr_scm_driver = {
 	},
 };
 
-module_platform_driver(papr_scm_driver);
+static int __init papr_scm_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&papr_scm_driver);
+	if (!ret)
+		mce_register_notifier(&mce_ue_nb);
+
+	return ret;
+}
+module_init(papr_scm_init);
+
+static void __exit papr_scm_exit(void)
+{
+	mce_unregister_notifier(&mce_ue_nb);
+	platform_driver_unregister(&papr_scm_driver);
+}
+module_exit(papr_scm_exit);
+
 MODULE_DEVICE_TABLE(of, papr_scm_match);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("IBM Corporation");
-- 
2.26.2


^ permalink raw reply related

* [PATCH v2 1/2] powerpc/mce: Add MCE notification chain
From: Santosh Sivaraj @ 2020-07-09 13:51 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Oliver, Aneesh Kumar K.V, Santosh Sivaraj, Mahesh Salgaonkar,
	Ganesh Goudar, Vaibhav Jain

Introduce notification chain which lets us know about uncorrected memory
errors(UE). This would help prospective users in pmem or nvdimm subsystem
to track bad blocks for better handling of persistent memory allocations.

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 arch/powerpc/include/asm/mce.h |  2 ++
 arch/powerpc/kernel/mce.c      | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

v2: Address comments from Christophe.

RESEND: Sending the two patches together so the dependencies are clear. The
earlier patch reviews are here [1]; rebase the patches on top on 5.8-rc4

[1]: https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganeshgr@linux.ibm.com/

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 376a395daf329..7bdd0cd4f2de0 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -220,6 +220,8 @@ extern void machine_check_print_event_info(struct machine_check_event *evt,
 unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
 extern void mce_common_process_ue(struct pt_regs *regs,
 				  struct mce_error_info *mce_err);
+int mce_register_notifier(struct notifier_block *nb);
+int mce_unregister_notifier(struct notifier_block *nb);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index fd90c0eda2290..b7b3ed4e61937 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -49,6 +49,20 @@ static struct irq_work mce_ue_event_irq_work = {
 
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
+
+int mce_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&mce_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(mce_register_notifier);
+
+int mce_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -278,6 +292,7 @@ static void machine_process_ue_event(struct work_struct *work)
 	while (__this_cpu_read(mce_ue_count) > 0) {
 		index = __this_cpu_read(mce_ue_count) - 1;
 		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+		blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
 #ifdef CONFIG_MEMORY_FAILURE
 		/*
 		 * This should probably queued elsewhere, but
-- 
2.26.2


^ permalink raw reply related

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 13:43 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709001837.GD3598@gate.crashing.org>

----- On Jul 8, 2020, at 8:18 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> On Wed, Jul 08, 2020 at 08:01:23PM -0400, Mathieu Desnoyers wrote:
>> > > #define RSEQ_ASM_OP_CMPEQ(var, expect, label)
>> > > \
>> > >                 LOAD_WORD "%%r17, %[" __rseq_str(var) "]\n\t"                   \
>> > 
>> > The way this hardcodes r17 *will* break, btw.  The compiler will not
>> > likely want to use r17 as long as your code (after inlining etc.!) stays
>> > small, but there is Murphy's law.
>> 
>> r17 is in the clobber list, so it should be ok.
> 
> What protects r17 *after* this asm statement?

As discussed in the other leg of the thread (with the code example),
r17 is in the clobber list of all asm statements using this macro, and
is used as a temporary register within each inline asm.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* Re: Failure to build librseq on ppc
From: Mathieu Desnoyers @ 2020-07-09 13:33 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Boqun Feng, linuxppc-dev, Michael Jeanson
In-Reply-To: <20200709001003.GB3598@gate.crashing.org>

----- On Jul 8, 2020, at 8:10 PM, Segher Boessenkool segher@kernel.crashing.org wrote:

> Hi!
> 
> On Wed, Jul 08, 2020 at 10:00:01AM -0400, Mathieu Desnoyers wrote:
[...]
> 
>> -#define STORE_WORD     "std "
>> -#define LOAD_WORD      "ld "
>> -#define LOADX_WORD     "ldx "
>> +#define STORE_WORD(arg)        "std%U[" __rseq_str(arg) "]%X[" __rseq_str(arg)
>> "] "    /* To memory ("m" constraint) */
>> +#define LOAD_WORD(arg) "lwd%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] "
>> /* From memory ("m" constraint) */
> 
> That cannot work (you typoed "ld" here).

Indeed, I noticed it before pushing to master (lwd -> ld).

> 
> Some more advice about this code, pretty generic stuff:

Let's take an example to support the discussion here. I'm taking it from
master branch (after a cleanup changing e.g. LOAD_WORD into RSEQ_LOAD_LONG).
So for powerpc32 we have (code edited to remove testing instrumentation):

#define __rseq_str_1(x) #x
#define __rseq_str(x)           __rseq_str_1(x)

#define RSEQ_STORE_LONG(arg)    "stw%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] "    /* To memory ("m" constraint) */
#define RSEQ_STORE_INT(arg)     RSEQ_STORE_LONG(arg)                                    /* To memory ("m" constraint) */
#define RSEQ_LOAD_LONG(arg)     "lwz%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] "    /* From memory ("m" constraint) */
#define RSEQ_LOAD_INT(arg)      RSEQ_LOAD_LONG(arg)                                     /* From memory ("m" constraint) */
#define RSEQ_LOADX_LONG         "lwzx "                                                 /* From base register ("b" constraint) */
#define RSEQ_CMP_LONG           "cmpw "

#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags,                          \
                        start_ip, post_commit_offset, abort_ip)                 \
                ".pushsection __rseq_cs, \"aw\"\n\t"                            \
                ".balign 32\n\t"                                                \
                __rseq_str(label) ":\n\t"                                       \
                ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t"      \
                /* 32-bit only supported on BE */                               \
                ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
                ".popsection\n\t"                                       \
                ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t"          \
                ".long 0x0, " __rseq_str(label) "b\n\t"                 \
                ".popsection\n\t"

/*
 * Exit points of a rseq critical section consist of all instructions outside
 * of the critical section where a critical section can either branch to or
 * reach through the normal course of its execution. The abort IP and the
 * post-commit IP are already part of the __rseq_cs section and should not be
 * explicitly defined as additional exit points. Knowing all exit points is
 * useful to assist debuggers stepping over the critical section.
 */
#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip)                           \
                ".pushsection __rseq_exit_point_array, \"aw\"\n\t"              \
                /* 32-bit only supported on BE */                               \
                ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
                ".popsection\n\t"

#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs)                        \
                "lis %%r17, (" __rseq_str(cs_label) ")@ha\n\t"                  \
                "addi %%r17, %%r17, (" __rseq_str(cs_label) ")@l\n\t"           \
                RSEQ_STORE_INT(rseq_cs) "%%r17, %[" __rseq_str(rseq_cs) "]\n\t" \
                __rseq_str(label) ":\n\t"

#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip)        \
                __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip,              \
                                        (post_commit_ip - start_ip), abort_ip)

#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label)                      \
                RSEQ_LOAD_INT(current_cpu_id) "%%r17, %[" __rseq_str(current_cpu_id) "]\n\t" \
                "cmpw cr7, %[" __rseq_str(cpu_id) "], %%r17\n\t"                \
                "bne- cr7, " __rseq_str(label) "\n\t"

#define RSEQ_ASM_DEFINE_ABORT(label, abort_label)                               \
                ".pushsection __rseq_failure, \"ax\"\n\t"                       \
                ".long " __rseq_str(RSEQ_SIG) "\n\t"                            \
                __rseq_str(label) ":\n\t"                                       \
                "b %l[" __rseq_str(abort_label) "]\n\t"                         \
                ".popsection\n\t"

#define RSEQ_ASM_OP_CMPEQ(var, expect, label)                                   \
                RSEQ_LOAD_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t"         \
                RSEQ_CMP_LONG "cr7, %%r17, %[" __rseq_str(expect) "]\n\t"               \
                "bne- cr7, " __rseq_str(label) "\n\t"

#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label)                  \
                RSEQ_STORE_LONG(var) "%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n\t" \
                __rseq_str(post_commit_label) ":\n\t"

static inline __attribute__((always_inline))
int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
{
        __asm__ __volatile__ goto (
                RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
                RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])

                /* Start rseq by storing table entry pointer into rseq_cs. */
                RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
                /* cmp cpuid */
                RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
                /* cmp @v equal to @expect */
                RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])

                /* final store */
                RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
                RSEQ_ASM_DEFINE_ABORT(4, abort)

                : /* gcc asm goto does not allow outputs */
                : [cpu_id]              "r" (cpu),
                  [current_cpu_id]      "m" (__rseq_abi.cpu_id),
                  [rseq_cs]             "m" (__rseq_abi.rseq_cs),
                  [v]                   "m" (*v),
                  [expect]              "r" (expect),
                  [newv]                "r" (newv)
                : "memory", "cc", "r17"
                : abort, cmpfail
        );
        return 0;
abort:
        RSEQ_INJECT_FAILED
        return -1;
cmpfail:
        return 1;
}

> 
> The way this all uses r17 will likely not work reliably.

r17 is only used as a temporary register within the inline assembler, and it is
in the clobber list. In which scenario would it not work reliably ?

> The way multiple asm statements are used seems to have missing
> dependencies between the statements.

I'm not sure I follow here. Note that we are injecting the CPP macros into
a single inline asm statement as strings.

> 
> Don't try to work *against* the compiler.  You will not win.
> 
> Alternatively, write assembler code, if that is what you actually want
> to do?  Not C code.
>
> And done macro-mess this, you want to be able to debug it, and you need
> other people to be able to read it!

I understand that looking at macros can be cumbersome from the perspective
of a reviewer only interested in a single architecture,

However, from my perspective, as a maintainer who must maintain similar code
for x86 32/64, powerpc 32/64, arm, aarch64, s390, s390x, mips 32/64, and likely
other architectures in the future, the macros abstracting 32-bit and 64-bit
allow to eliminate code duplication for each architecture with 32-bit and 64-bit
variants, which is better for maintainability.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply

* [PATCH v3 4/4] powerpc/mm/radix: Create separate mappings for hot-plugged memory
From: Aneesh Kumar K.V @ 2020-07-09 13:19 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K.V, Bharata B Rao
In-Reply-To: <20200709131925.922266-1-aneesh.kumar@linux.ibm.com>

To enable memory unplug without splitting kernel page table
mapping, we force the max mapping size to the LMB size. LMB
size is the unit in which hypervisor will do memory add/remove
operation.

Pseries systems supports max LMB size of 256MB. Hence on pseries,
we now end up mapping memory with 2M page size instead of 1G. To improve
that we want hypervisor to hint the kernel about the hotplug
memory range. That was added that as part of

commit b6eca183e23e ("powerpc/kernel: Enables memory
hot-remove after reboot on pseries guests")

But PowerVM doesn't provide that hint yet. Once we get PowerVM
updated, we can then force the 2M mapping only to hot-pluggable
memory region using memblock_is_hotpluggable(). Till then
let's depend on LMB size for finding the mapping page size
for linear range.

With this change KVM guest will also be doing linear mapping with
2M page size.

The actual TLB benefit of mapping guest page table entries with
hugepage size can only be materialized if the partition scoped
entries are also using the same or higher page size. A guest using
1G hugetlbfs backing guest memory can have a performance impact with
the above change.

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  5 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c | 81 ++++++++++++++++++++----
 arch/powerpc/platforms/powernv/setup.c   | 10 ++-
 3 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 5393a535240c..15aae924f41c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -82,6 +82,11 @@ extern unsigned int mmu_pid_bits;
 /* Base PID to allocate from */
 extern unsigned int mmu_base_pid;
 
+/*
+ * memory block size used with radix translation.
+ */
+extern unsigned int __ro_after_init radix_mem_block_size;
+
 #define PRTB_SIZE_SHIFT	(mmu_pid_bits + 4)
 #define PRTB_ENTRIES	(1ul << mmu_pid_bits)
 
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d5a01b9aadc9..bba45fc0b7b2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/string_helpers.h>
+#include <linux/memory.h>
 
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
@@ -33,6 +34,7 @@
 
 unsigned int mmu_pid_bits;
 unsigned int mmu_base_pid;
+unsigned int radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 			unsigned long region_start, unsigned long region_end)
@@ -265,6 +267,7 @@ static unsigned long next_boundary(unsigned long addr, unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long start,
 					     unsigned long end,
+					     unsigned long max_mapping_size,
 					     int nid, pgprot_t _prot)
 {
 	unsigned long vaddr, addr, mapping_size = 0;
@@ -278,6 +281,8 @@ static int __meminit create_physical_mapping(unsigned long start,
 		int rc;
 
 		gap = next_boundary(addr, end) - addr;
+		if (gap > max_mapping_size)
+			gap = max_mapping_size;
 		previous_size = mapping_size;
 		prev_exec = exec;
 
@@ -328,8 +333,9 @@ static void __init radix_init_pgtable(void)
 
 	/* We don't support slb for radix */
 	mmu_slb_size = 0;
+
 	/*
-	 * Create the linear mapping, using standard page size for now
+	 * Create the linear mapping
 	 */
 	for_each_memblock(memory, reg) {
 		/*
@@ -345,6 +351,7 @@ static void __init radix_init_pgtable(void)
 
 		WARN_ON(create_physical_mapping(reg->base,
 						reg->base + reg->size,
+						radix_mem_block_size,
 						-1, PAGE_KERNEL));
 	}
 
@@ -485,6 +492,47 @@ static int __init radix_dt_scan_page_sizes(unsigned long node,
 	return 1;
 }
 
+static int __init probe_memory_block_size(unsigned long node, const char *uname, int
+					  depth, void *data)
+{
+	unsigned long *mem_block_size = (unsigned long *)data;
+	const __be64 *prop;
+	int len;
+
+	if (depth != 1)
+		return 0;
+
+	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
+		return 0;
+
+	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
+	if (!prop || len < sizeof(__be64))
+		/*
+		 * Nothing in the device tree
+		 */
+		*mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+	else
+		*mem_block_size = be64_to_cpup(prop);
+	return 1;
+}
+
+static unsigned long radix_memory_block_size(void)
+{
+	unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
+
+	/*
+	 * OPAL firmware feature is set by now. Hence we are ok
+	 * to test OPAL feature.
+	 */
+	if (firmware_has_feature(FW_FEATURE_OPAL))
+		mem_block_size = 1UL * 1024 * 1024 * 1024;
+	else
+		of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
+
+	return mem_block_size;
+}
+
+
 void __init radix__early_init_devtree(void)
 {
 	int rc;
@@ -493,17 +541,27 @@ void __init radix__early_init_devtree(void)
 	 * Try to find the available page sizes in the device-tree
 	 */
 	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
-	if (rc != 0)  /* Found */
-		goto found;
+	if (!rc) {
+		/*
+		 * No page size details found in device tree.
+		 * Let's assume we have page 4k and 64k support
+		 */
+		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
+		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
+
+		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
+		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
+	}
+
 	/*
-	 * let's assume we have page 4k and 64k support
+	 * Max mapping size used when mapping pages. We don't use
+	 * ppc_md.memory_block_size() here because this get called
+	 * early and we don't have machine probe called yet. Also
+	 * the pseries implementation only check for ibm,lmb-size.
+	 * All hypervisor supporting radix do expose that device
+	 * tree node.
 	 */
-	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
-	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
-
-	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
-	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
-found:
+	radix_mem_block_size = radix_memory_block_size();
 	return;
 }
 
@@ -855,7 +913,8 @@ int __meminit radix__create_section_mapping(unsigned long start,
 		return -1;
 	}
 
-	return create_physical_mapping(__pa(start), __pa(end), nid, prot);
+	return create_physical_mapping(__pa(start), __pa(end),
+				       radix_mem_block_size, nid, prot);
 }
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 3bc188da82ba..7fcb88623081 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -399,7 +399,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
 {
-	return 256UL * 1024 * 1024;
+	/*
+	 * We map the kernel linear region with 1GB large pages on radix. For
+	 * memory hot unplug to work our memory block size must be at least
+	 * this size.
+	 */
+	if (radix_enabled())
+		return radix_mem_block_size;
+	else
+		return 256UL * 1024 * 1024;
 }
 #endif
 
-- 
2.26.2


^ permalink raw reply related

* [PATCH v3 3/4] powerpc/mm/radix: Remove split_kernel_mapping()
From: Aneesh Kumar K.V @ 2020-07-09 13:19 UTC (permalink / raw)
  To: linuxppc-dev, mpe; +Cc: Aneesh Kumar K . V, Bharata B Rao
In-Reply-To: <20200709131925.922266-1-aneesh.kumar@linux.ibm.com>

From: Bharata B Rao <bharata@linux.ibm.com>

We split the page table mapping on memory unplug if the
linear range was mapped with huge page mapping (for ex: 1G)
The page table splitting code has a few issues:

1. Recursive locking
--------------------
Memory unplug path takes cpu_hotplug_lock and calls stop_machine()
for splitting the mappings. However stop_machine() takes
cpu_hotplug_lock again causing deadlock.

2. BUG: sleeping function called from in_atomic() context
---------------------------------------------------------
Memory unplug path (remove_pagetable) takes init_mm.page_table_lock
spinlock and later calls stop_machine() which does wait_for_completion()

3. Bad unlock unbalance
-----------------------
Memory unplug path takes init_mm.page_table_lock spinlock and calls
stop_machine(). The stop_machine thread function runs in a different
thread context (migration thread) which tries to release and reaquire
ptl. Releasing ptl from a different thread than which acquired it
causes bad unlock unbalance.

These problems can be avoided if we avoid mapping hot-plugged memory
with 1G mapping, thereby removing the need for splitting them during
unplug. The kernel always make sure the minimum unplug request is
SUBSECTION_SIZE for device memory and SECTION_SIZE for regular memory.

In preparation for such a change remove page table splitting support.

This essentially is a revert of
commit 4dd5f8a99e791 ("powerpc/mm/radix: Split linear mapping on hot-unplug")

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 95 +++++-------------------
 1 file changed, 19 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 46ad2da3087a..d5a01b9aadc9 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -15,7 +15,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/string_helpers.h>
-#include <linux/stop_machine.h>
 
 #include <asm/pgalloc.h>
 #include <asm/mmu_context.h>
@@ -722,32 +721,6 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
 	p4d_clear(p4d);
 }
 
-struct change_mapping_params {
-	pte_t *pte;
-	unsigned long start;
-	unsigned long end;
-	unsigned long aligned_start;
-	unsigned long aligned_end;
-};
-
-static int __meminit stop_machine_change_mapping(void *data)
-{
-	struct change_mapping_params *params =
-			(struct change_mapping_params *)data;
-
-	if (!data)
-		return -1;
-
-	spin_unlock(&init_mm.page_table_lock);
-	pte_clear(&init_mm, params->aligned_start, params->pte);
-	create_physical_mapping(__pa(params->aligned_start),
-				__pa(params->start), -1, PAGE_KERNEL);
-	create_physical_mapping(__pa(params->end), __pa(params->aligned_end),
-				-1, PAGE_KERNEL);
-	spin_lock(&init_mm.page_table_lock);
-	return 0;
-}
-
 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 			     unsigned long end)
 {
@@ -776,52 +749,6 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 	}
 }
 
-/*
- * clear the pte and potentially split the mapping helper
- */
-static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
-				unsigned long size, pte_t *pte)
-{
-	unsigned long mask = ~(size - 1);
-	unsigned long aligned_start = addr & mask;
-	unsigned long aligned_end = addr + size;
-	struct change_mapping_params params;
-	bool split_region = false;
-
-	if ((end - addr) < size) {
-		/*
-		 * We're going to clear the PTE, but not flushed
-		 * the mapping, time to remap and flush. The
-		 * effects if visible outside the processor or
-		 * if we are running in code close to the
-		 * mapping we cleared, we are in trouble.
-		 */
-		if (overlaps_kernel_text(aligned_start, addr) ||
-			overlaps_kernel_text(end, aligned_end)) {
-			/*
-			 * Hack, just return, don't pte_clear
-			 */
-			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
-				  "text, not splitting\n", addr, end);
-			return;
-		}
-		split_region = true;
-	}
-
-	if (split_region) {
-		params.pte = pte;
-		params.start = addr;
-		params.end = end;
-		params.aligned_start = addr & ~(size - 1);
-		params.aligned_end = min_t(unsigned long, aligned_end,
-				(unsigned long)__va(memblock_end_of_DRAM()));
-		stop_machine(stop_machine_change_mapping, &params, NULL);
-		return;
-	}
-
-	pte_clear(&init_mm, addr, pte);
-}
-
 static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			     unsigned long end)
 {
@@ -837,7 +764,12 @@ static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			continue;
 
 		if (pmd_is_leaf(*pmd)) {
-			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
+			if (!IS_ALIGNED(addr, PMD_SIZE) ||
+			    !IS_ALIGNED(next, PMD_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+			pte_clear(&init_mm, addr, (pte_t *)pmd);
 			continue;
 		}
 
@@ -862,7 +794,12 @@ static void remove_pud_table(pud_t *pud_start, unsigned long addr,
 			continue;
 
 		if (pud_is_leaf(*pud)) {
-			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
+			if (!IS_ALIGNED(addr, PUD_SIZE) ||
+			    !IS_ALIGNED(next, PUD_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+			pte_clear(&init_mm, addr, (pte_t *)pud);
 			continue;
 		}
 
@@ -890,7 +827,13 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end)
 			continue;
 
 		if (p4d_is_leaf(*p4d)) {
-			split_kernel_mapping(addr, end, P4D_SIZE, (pte_t *)p4d);
+			if (!IS_ALIGNED(addr, P4D_SIZE) ||
+			    !IS_ALIGNED(next, P4D_SIZE)) {
+				WARN_ONCE(1, "%s: unaligned range\n", __func__);
+				continue;
+			}
+
+			pte_clear(&init_mm, addr, (pte_t *)pgd);
 			continue;
 		}
 
-- 
2.26.2


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox