LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v2 1/2] powerpc/perf: init pmu from core-book3s
From: Christophe Leroy @ 2019-04-29  5:38 UTC (permalink / raw)
  To: Madhavan Srinivasan, mpe; +Cc: linuxppc-dev
In-Reply-To: <1556506368-29329-1-git-send-email-maddy@linux.vnet.ibm.com>



Le 29/04/2019 à 04:52, Madhavan Srinivasan a écrit :
> Currenty pmu driver file for each ppc64 generation processor
> has a __init call in itself. Refactor the code by moving the
> __init call to core-books.c. This also clean's up compat mode
> pmu driver registration.

Can you explain the advantage of doing so ?
For me it makes more sense to have independent drivers with their own
init call.


> 
> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
> Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
> ---
> Changelog v1:
> - Added "internal.h" file and moved the extern definitions to that file
> 
>   arch/powerpc/perf/core-book3s.c | 28 ++++++++++++++++++++++++++++
>   arch/powerpc/perf/internal.h    | 16 ++++++++++++++++
>   arch/powerpc/perf/power5+-pmu.c |  4 +---
>   arch/powerpc/perf/power5-pmu.c  |  4 +---
>   arch/powerpc/perf/power6-pmu.c  |  4 +---
>   arch/powerpc/perf/power7-pmu.c  |  4 +---
>   arch/powerpc/perf/power8-pmu.c  |  3 +--
>   arch/powerpc/perf/power9-pmu.c  |  3 +--
>   arch/powerpc/perf/ppc970-pmu.c  |  4 +---
>   9 files changed, 51 insertions(+), 19 deletions(-)
>   create mode 100644 arch/powerpc/perf/internal.h
> 
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index b0723002a396..a96f9420139c 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -22,6 +22,10 @@
>   #include <asm/ptrace.h>
>   #include <asm/code-patching.h>
>   
> +#ifdef CONFIG_PPC64

Can we avoid that CONFIG_PPC64 ifdef ? Why isn't it compatible with PPC32 ?

> +#include "internal.h"
> +#endif
> +
>   #define BHRB_MAX_ENTRIES	32
>   #define BHRB_TARGET		0x0000000000000002
>   #define BHRB_PREDICTION		0x0000000000000001
> @@ -2294,3 +2298,27 @@ int register_power_pmu(struct power_pmu *pmu)
>   			  power_pmu_prepare_cpu, NULL);
>   	return 0;
>   }
> +
> +#ifdef CONFIG_PPC64

Same, why PPC64 ?

> +static int __init init_ppc64_pmu(void)
> +{
> +	/* run through all the pmu drivers one at a time */
> +	if (!init_power5_pmu())
> +		return 0;
> +	else if (!init_power5p_pmu())
> +		return 0;
> +	else if (!init_power6_pmu())
> +		return 0;
> +	else if (!init_power7_pmu())
> +		return 0;
> +	else if (!init_power8_pmu())
> +		return 0;
> +	else if (!init_power9_pmu())
> +		return 0;
> +	else if (!init_ppc970_pmu())
> +		return 0;
> +	else
> +		return -ENODEV;
> +}
> +early_initcall(init_ppc64_pmu);
> +#endif
> diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h
> new file mode 100644
> index 000000000000..e54d524d4283
> --- /dev/null
> +++ b/arch/powerpc/perf/internal.h
> @@ -0,0 +1,16 @@
> +/*
> + * Copyright 2019 Madhavan Srinivasan, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +extern int init_ppc970_pmu(void);
> +extern int init_power5_pmu(void);
> +extern int init_power5p_pmu(void);
> +extern int init_power6_pmu(void);
> +extern int init_power7_pmu(void);
> +extern int init_power8_pmu(void);
> +extern int init_power9_pmu(void);

The 'extern' keyword is pointless here, please remove it (checkpatch --strict
probably told you so).


Christophe


> diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c
> index 0526dac66007..9aa803504cb2 100644
> --- a/arch/powerpc/perf/power5+-pmu.c
> +++ b/arch/powerpc/perf/power5+-pmu.c
> @@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = {
>   	.cache_events		= &power5p_cache_events,
>   };
>   
> -static int __init init_power5p_pmu(void)
> +int init_power5p_pmu(void)
>   {
>   	if (!cur_cpu_spec->oprofile_cpu_type ||
>   	    (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+")
> @@ -686,5 +686,3 @@ static int __init init_power5p_pmu(void)
>   
>   	return register_power_pmu(&power5p_pmu);
>   }
> -
> -early_initcall(init_power5p_pmu);
> diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c
> index 4dc99f9f7962..30cb13d081a9 100644
> --- a/arch/powerpc/perf/power5-pmu.c
> +++ b/arch/powerpc/perf/power5-pmu.c
> @@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = {
>   	.flags			= PPMU_HAS_SSLOT,
>   };
>   
> -static int __init init_power5_pmu(void)
> +int init_power5_pmu(void)
>   {
>   	if (!cur_cpu_spec->oprofile_cpu_type ||
>   	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
> @@ -626,5 +626,3 @@ static int __init init_power5_pmu(void)
>   
>   	return register_power_pmu(&power5_pmu);
>   }
> -
> -early_initcall(init_power5_pmu);
> diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
> index 9c9d646b68a1..80ec48632cfe 100644
> --- a/arch/powerpc/perf/power6-pmu.c
> +++ b/arch/powerpc/perf/power6-pmu.c
> @@ -540,7 +540,7 @@ static struct power_pmu power6_pmu = {
>   	.cache_events		= &power6_cache_events,
>   };
>   
> -static int __init init_power6_pmu(void)
> +int init_power6_pmu(void)
>   {
>   	if (!cur_cpu_spec->oprofile_cpu_type ||
>   	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6"))
> @@ -548,5 +548,3 @@ static int __init init_power6_pmu(void)
>   
>   	return register_power_pmu(&power6_pmu);
>   }
> -
> -early_initcall(init_power6_pmu);
> diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
> index 6dbae9884ec4..bb6efd5d2530 100644
> --- a/arch/powerpc/perf/power7-pmu.c
> +++ b/arch/powerpc/perf/power7-pmu.c
> @@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = {
>   	.cache_events		= &power7_cache_events,
>   };
>   
> -static int __init init_power7_pmu(void)
> +int init_power7_pmu(void)
>   {
>   	if (!cur_cpu_spec->oprofile_cpu_type ||
>   	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
> @@ -456,5 +456,3 @@ static int __init init_power7_pmu(void)
>   
>   	return register_power_pmu(&power7_pmu);
>   }
> -
> -early_initcall(init_power7_pmu);
> diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
> index d12a2db26353..bcc3409a06de 100644
> --- a/arch/powerpc/perf/power8-pmu.c
> +++ b/arch/powerpc/perf/power8-pmu.c
> @@ -379,7 +379,7 @@ static struct power_pmu power8_pmu = {
>   	.bhrb_nr		= 32,
>   };
>   
> -static int __init init_power8_pmu(void)
> +int init_power8_pmu(void)
>   {
>   	int rc;
>   
> @@ -399,4 +399,3 @@ static int __init init_power8_pmu(void)
>   
>   	return 0;
>   }
> -early_initcall(init_power8_pmu);
> diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
> index 030544e35959..3a31ac6f4805 100644
> --- a/arch/powerpc/perf/power9-pmu.c
> +++ b/arch/powerpc/perf/power9-pmu.c
> @@ -437,7 +437,7 @@ static struct power_pmu power9_pmu = {
>   	.bhrb_nr		= 32,
>   };
>   
> -static int __init init_power9_pmu(void)
> +int init_power9_pmu(void)
>   {
>   	int rc = 0;
>   	unsigned int pvr = mfspr(SPRN_PVR);
> @@ -467,4 +467,3 @@ static int __init init_power9_pmu(void)
>   
>   	return 0;
>   }
> -early_initcall(init_power9_pmu);
> diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c
> index 8b6a8a36fa38..1d3370914022 100644
> --- a/arch/powerpc/perf/ppc970-pmu.c
> +++ b/arch/powerpc/perf/ppc970-pmu.c
> @@ -490,7 +490,7 @@ static struct power_pmu ppc970_pmu = {
>   	.flags			= PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING,
>   };
>   
> -static int __init init_ppc970_pmu(void)
> +int init_ppc970_pmu(void)
>   {
>   	if (!cur_cpu_spec->oprofile_cpu_type ||
>   	    (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970")
> @@ -499,5 +499,3 @@ static int __init init_ppc970_pmu(void)
>   
>   	return register_power_pmu(&ppc970_pmu);
>   }
> -
> -early_initcall(init_ppc970_pmu);
> 

^ permalink raw reply

* Re: [PATCHv2] kernel/crash: make parse_crashkernel()'s return value more indicant
From: Dave Young @ 2019-04-29  5:04 UTC (permalink / raw)
  To: Pingfan Liu
  Cc: Rich Felker, linux-ia64, Julien Thierry, Yangtao Li,
	Palmer Dabbelt, Heiko Carstens, Stefan Agner, linux-mips,
	Paul Mackerras, H. Peter Anvin, Thomas Gleixner, Logan Gunthorpe,
	linux-s390, Florian Fainelli, Yoshinori Sato, linux-sh, x86,
	Russell King, Ingo Molnar, Hari Bathini, Catalin Marinas,
	James Hogan, Fenghua Yu, Tony Luck, Will Deacon, Johannes Weiner,
	Borislav Petkov, David Hildenbrand, linux-arm-kernel, Jens Axboe,
	Matthias Brugger, Baoquan He, Ard Biesheuvel, Robin Murphy, kexec,
	LKML, Ralf Baechle, Thomas Bogendoerfer, Paul Burton,
	Greg Kroah-Hartman, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, Greg Hackmann
In-Reply-To: <CAFgQCTszGixzH5ZrwOzjbp7W91Wxo3XvA+EeEx0ErVVcYMr0FA@mail.gmail.com>

On 04/29/19 at 12:48pm, Pingfan Liu wrote:
> On Mon, Apr 29, 2019 at 11:04 AM Pingfan Liu <kernelfans@gmail.com> wrote:
> >
> > On Sun, Apr 28, 2019 at 4:37 PM Dave Young <dyoung@redhat.com> wrote:
> > >
> > > On 04/25/19 at 04:20pm, Pingfan Liu wrote:
> > > > On Wed, Apr 24, 2019 at 4:31 PM Matthias Brugger <mbrugger@suse.com> wrote:
> > > > >
> > > > >
> > > > [...]
> > > > > > @@ -139,6 +141,8 @@ static int __init parse_crashkernel_simple(char *cmdline,
> > > > > >               pr_warn("crashkernel: unrecognized char: %c\n", *cur);
> > > > > >               return -EINVAL;
> > > > > >       }
> > > > > > +     if (*crash_size == 0)
> > > > > > +             return -EINVAL;
> > > > >
> > > > > This covers the case where I pass an argument like "crashkernel=0M" ?
> > > > > Can't we fix that by using kstrtoull() in memparse and check if the return value
> > > > > is < 0? In that case we could return without updating the retptr and we will be
> > > > > fine.
> > > > >
> > > > It seems that kstrtoull() treats 0M as invalid parameter, while
> > > > simple_strtoull() does not.
> > > >
> > > > If changed like your suggestion, then all the callers of memparse()
> > > > will treats 0M as invalid parameter. This affects many components
> > > > besides kexec.  Not sure this can be done or not.
> > >
> > > simple_strtoull is obsolete, move to kstrtoull is the right way.
> > >
> > > $ git grep memparse|wc
> > >     158     950   10479
> > >
> > > Except some documentation/tools etc there are still a log of callers
> > > which directly use the return value as the ull number without error
> > > checking.
> > >
> > > So it would be good to mark memparse as obsolete as well in
> > > lib/cmdline.c, and introduce a new function eg. kmemparse() to use
> > > kstrtoull,  and return a real error code, and save the size in an
> > > argument like &size.  Then update X86 crashkernel code to use it.
> > >
> > Thank for your good suggestion.
> >
> Go through the v5.0 kernel code, I think it will be a huge job.
> 
> The difference between unsigned long long simple_strtoull(const char
> *cp, char **endp, unsigned int base) and int _kstrtoull(const char *s,
> unsigned int base, unsigned long long *res) is bigger than expected,
> especially the output parameter @res. Many references to
> memparse(const char *ptr, char **retptr) rely on @retptr to work. A
> typical example from arch/x86/kernel/e820.c
>         mem_size = memparse(p, &p);
>         if (p == oldp)
>                 return -EINVAL;
> 
>         userdef = 1;
>         if (*p == '@') {  <----------- here
>                 start_at = memparse(p+1, &p);
>                 e820__range_add(start_at, mem_size, E820_TYPE_RAM);
>         } else if (*p == '#') {
>                 start_at = memparse(p+1, &p);
>                 e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
>         } else if (*p == '$') {
>                 start_at = memparse(p+1, &p);
>                 e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
>         }
> 
> So we need to resolve the prototype of kstrtoull() firstly, and maybe
> kstrtouint() etc too. All of them have lots of references in kernel.
> 
> Any idea about this?


It is not only this place but a lot of other places; I think there is no
hurry to fix them all at one time.

As we discussed, just do it according to my previous reply: mark memparse as
obsolete, create a new function that uses kstrtoull, and use it in the
crashkernel code first.

Thanks
Dave

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: Daniel Axtens @ 2019-04-29  4:54 UTC (permalink / raw)
  To: Matthew Garrett, Andrew Donnellan
  Cc: Linux API, cmr, James Morris, Linux Kernel Mailing List,
	David Howells, LSM List, Andy Lutomirski, linuxppc-dev
In-Reply-To: <87wojdy8ro.fsf@dja-thinkpad.axtens.net>

Hi, 

>>> I'm thinking about whether we should lock down the powerpc xmon debug
>>> monitor - intuitively, I think the answer is yes if for no other reason
>>> than Least Astonishment, when lockdown is enabled you probably don't
>>> expect xmon to keep letting you access kernel memory.
>>
>> The original patchset contained a sysrq hotkey to allow physically
>> present users to disable lockdown, so I'm not super concerned about
>> this case - I could definitely be convinced otherwise, though.

So Mimi contacted me offlist and very helpfully provided me with a much
better and less confused justification for disabling xmon in lockdown:

On x86, physical presence (== console access) is a trigger to
disable/enable lockdown mode.

In lockdown mode, you're not supposed to be able to modify memory. xmon
allows you to modify memory, and therefore shouldn't be allowed in
lockdown.

So, if you can disable lockdown on the console that's probably OK, but
it should be specifically disabling lockdown, not randomly editing
memory with xmon.

Regards,
Daniel

^ permalink raw reply

* Re: [PATCHv2] kernel/crash: make parse_crashkernel()'s return value more indicant
From: Pingfan Liu @ 2019-04-29  4:48 UTC (permalink / raw)
  To: Dave Young
  Cc: Rich Felker, linux-ia64, Julien Thierry, Yangtao Li,
	Palmer Dabbelt, Heiko Carstens, Stefan Agner, linux-mips,
	Paul Mackerras, H. Peter Anvin, Thomas Gleixner, Logan Gunthorpe,
	linux-s390, Florian Fainelli, Yoshinori Sato, linux-sh, x86,
	Russell King, Ingo Molnar, Hari Bathini, Catalin Marinas,
	James Hogan, Fenghua Yu, Tony Luck, Will Deacon, Johannes Weiner,
	Borislav Petkov, David Hildenbrand, linux-arm-kernel, Jens Axboe,
	Matthias Brugger, Baoquan He, Ard Biesheuvel, Robin Murphy, kexec,
	LKML, Ralf Baechle, Thomas Bogendoerfer, Paul Burton,
	Greg Kroah-Hartman, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, Greg Hackmann
In-Reply-To: <CAFgQCTvQezGM7xgY2Q1RSUiQ7wLdxtUAeztrO3AqDfjx8f2kdg@mail.gmail.com>

On Mon, Apr 29, 2019 at 11:04 AM Pingfan Liu <kernelfans@gmail.com> wrote:
>
> On Sun, Apr 28, 2019 at 4:37 PM Dave Young <dyoung@redhat.com> wrote:
> >
> > On 04/25/19 at 04:20pm, Pingfan Liu wrote:
> > > On Wed, Apr 24, 2019 at 4:31 PM Matthias Brugger <mbrugger@suse.com> wrote:
> > > >
> > > >
> > > [...]
> > > > > @@ -139,6 +141,8 @@ static int __init parse_crashkernel_simple(char *cmdline,
> > > > >               pr_warn("crashkernel: unrecognized char: %c\n", *cur);
> > > > >               return -EINVAL;
> > > > >       }
> > > > > +     if (*crash_size == 0)
> > > > > +             return -EINVAL;
> > > >
> > > > This covers the case where I pass an argument like "crashkernel=0M" ?
> > > > Can't we fix that by using kstrtoull() in memparse and check if the return value
> > > > is < 0? In that case we could return without updating the retptr and we will be
> > > > fine.
> > > >
> > > It seems that kstrtoull() treats 0M as invalid parameter, while
> > > simple_strtoull() does not.
> > >
> > > If changed like your suggestion, then all the callers of memparse()
> > > will treats 0M as invalid parameter. This affects many components
> > > besides kexec.  Not sure this can be done or not.
> >
> > simple_strtoull is obsolete, move to kstrtoull is the right way.
> >
> > $ git grep memparse|wc
> >     158     950   10479
> >
> > Except some documentation/tools etc there are still a log of callers
> > which directly use the return value as the ull number without error
> > checking.
> >
> > So it would be good to mark memparse as obsolete as well in
> > lib/cmdline.c, and introduce a new function eg. kmemparse() to use
> > kstrtoull,  and return a real error code, and save the size in an
> > argument like &size.  Then update X86 crashkernel code to use it.
> >
> Thank for your good suggestion.
>
Going through the v5.0 kernel code, I think it will be a huge job.

The difference between unsigned long long simple_strtoull(const char
*cp, char **endp, unsigned int base) and int _kstrtoull(const char *s,
unsigned int base, unsigned long long *res) is bigger than expected,
especially the output parameter @res. Many references to
memparse(const char *ptr, char **retptr) rely on @retptr to work. A
typical example from arch/x86/kernel/e820.c
        mem_size = memparse(p, &p);
        if (p == oldp)
                return -EINVAL;

        userdef = 1;
        if (*p == '@') {  <----------- here
                start_at = memparse(p+1, &p);
                e820__range_add(start_at, mem_size, E820_TYPE_RAM);
        } else if (*p == '#') {
                start_at = memparse(p+1, &p);
                e820__range_add(start_at, mem_size, E820_TYPE_ACPI);
        } else if (*p == '$') {
                start_at = memparse(p+1, &p);
                e820__range_add(start_at, mem_size, E820_TYPE_RESERVED);
        }

So we need to resolve the prototype of kstrtoull() first, and maybe
kstrtouint() etc. too. All of them have lots of references in the kernel.

Any idea about this?

Thanks,
Pingfan

^ permalink raw reply

* Re: [PATCHv2] kernel/crash: make parse_crashkernel()'s return value more indicant
From: Pingfan Liu @ 2019-04-29  3:04 UTC (permalink / raw)
  To: Dave Young
  Cc: Rich Felker, linux-ia64, Julien Thierry, Yangtao Li,
	Palmer Dabbelt, Heiko Carstens, Stefan Agner, linux-mips,
	Paul Mackerras, H. Peter Anvin, Thomas Gleixner, Logan Gunthorpe,
	linux-s390, Florian Fainelli, Yoshinori Sato, linux-sh, x86,
	Russell King, Ingo Molnar, Hari Bathini, Catalin Marinas,
	James Hogan, Fenghua Yu, Tony Luck, Will Deacon, Johannes Weiner,
	Borislav Petkov, David Hildenbrand, linux-arm-kernel, Jens Axboe,
	Matthias Brugger, Baoquan He, Ard Biesheuvel, Robin Murphy, kexec,
	LKML, Ralf Baechle, Thomas Bogendoerfer, Paul Burton,
	Greg Kroah-Hartman, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, Greg Hackmann
In-Reply-To: <20190428083710.GA11981@dhcp-128-65.nay.redhat.com>

On Sun, Apr 28, 2019 at 4:37 PM Dave Young <dyoung@redhat.com> wrote:
>
> On 04/25/19 at 04:20pm, Pingfan Liu wrote:
> > On Wed, Apr 24, 2019 at 4:31 PM Matthias Brugger <mbrugger@suse.com> wrote:
> > >
> > >
> > [...]
> > > > @@ -139,6 +141,8 @@ static int __init parse_crashkernel_simple(char *cmdline,
> > > >               pr_warn("crashkernel: unrecognized char: %c\n", *cur);
> > > >               return -EINVAL;
> > > >       }
> > > > +     if (*crash_size == 0)
> > > > +             return -EINVAL;
> > >
> > > This covers the case where I pass an argument like "crashkernel=0M" ?
> > > Can't we fix that by using kstrtoull() in memparse and check if the return value
> > > is < 0? In that case we could return without updating the retptr and we will be
> > > fine.
> > >
> > It seems that kstrtoull() treats 0M as invalid parameter, while
> > simple_strtoull() does not.
> >
> > If changed like your suggestion, then all the callers of memparse()
> > will treats 0M as invalid parameter. This affects many components
> > besides kexec.  Not sure this can be done or not.
>
> simple_strtoull is obsolete, move to kstrtoull is the right way.
>
> $ git grep memparse|wc
>     158     950   10479
>
> Except some documentation/tools etc there are still a log of callers
> which directly use the return value as the ull number without error
> checking.
>
> So it would be good to mark memparse as obsolete as well in
> lib/cmdline.c, and introduce a new function eg. kmemparse() to use
> kstrtoull,  and return a real error code, and save the size in an
> argument like &size.  Then update X86 crashkernel code to use it.
>
Thanks for your good suggestion.

Regards,
Pingfan

^ permalink raw reply

* [PATCH v2 2/2] powerpc/perf: Add generic compat mode pmu driver
From: Madhavan Srinivasan @ 2019-04-29  2:52 UTC (permalink / raw)
  To: mpe; +Cc: Madhavan Srinivasan, linuxppc-dev
In-Reply-To: <1556506368-29329-1-git-send-email-maddy@linux.vnet.ibm.com>

Most of the power processor generation performance monitoring
unit (PMU) driver code is bundled in the kernel and one of those
is enabled/registered based on the oprofile_cpu_type check at
the boot.

But things get a little tricky in case of a "compat" mode boot.
IBM POWER System Server based processors have a compatibility
mode feature, which, simply put, means an Nth generation processor
(let's say POWER8) will act and appear in a mode consistent
with an earlier generation (N-1) processor (that is, POWER7).
In this "compat" mode boot, the kernel modifies the
"oprofile_cpu_type" to be Nth generation (POWER8). If the Nth
generation pmu driver (POWER8) is bundled, it gets registered.

The key dependency here is having distro support for the latest
processor's performance monitoring. This patch adds
a generic "compat-mode" performance monitoring driver to
be registered in the absence of a powernv platform specific pmu driver.

The driver supports "cycles", "instruction" and "branch-miss" events.
"0x100F0" is used as the event code for "cycles", "0x00002"
as the event code for "instruction" events and "0x400F6"
as the event code for "branch miss". These are architected events
defined as part of the ISA. A new file called "generic-compat-pmu.c" is
created to contain the driver specific code. The base raw event
code format is modeled on PPMU_ARCH_207S.

Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
Changelog v1:
- Updated architected event opcodes
- included branch miss with architected event opcode

 arch/powerpc/perf/Makefile             |   3 +-
 arch/powerpc/perf/core-book3s.c        |   2 +-
 arch/powerpc/perf/generic-compat-pmu.c | 245 +++++++++++++++++++++++++++++++++
 arch/powerpc/perf/internal.h           |   1 +
 4 files changed, 249 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/perf/generic-compat-pmu.c

diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile
index ab26df5bacb9..c155dcbb8691 100644
--- a/arch/powerpc/perf/Makefile
+++ b/arch/powerpc/perf/Makefile
@@ -5,7 +5,8 @@ obj-$(CONFIG_PERF_EVENTS)	+= callchain.o perf_regs.o
 obj-$(CONFIG_PPC_PERF_CTRS)	+= core-book3s.o bhrb.o
 obj64-$(CONFIG_PPC_PERF_CTRS)	+= ppc970-pmu.o power5-pmu.o \
 				   power5+-pmu.o power6-pmu.o power7-pmu.o \
-				   isa207-common.o power8-pmu.o power9-pmu.o
+				   isa207-common.o power8-pmu.o power9-pmu.o \
+				   generic-compat-pmu.o
 obj32-$(CONFIG_PPC_PERF_CTRS)	+= mpc7450-pmu.o
 
 obj-$(CONFIG_PPC_POWERNV)	+= imc-pmu.o
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index a96f9420139c..a66fb9c01c9e 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2318,7 +2318,7 @@ static int __init init_ppc64_pmu(void)
 	else if (!init_ppc970_pmu())
 		return 0;
 	else
-		return -ENODEV;
+		return init_generic_compat_pmu();
 }
 early_initcall(init_ppc64_pmu);
 #endif
diff --git a/arch/powerpc/perf/generic-compat-pmu.c b/arch/powerpc/perf/generic-compat-pmu.c
new file mode 100644
index 000000000000..9c2d4bbc5c87
--- /dev/null
+++ b/arch/powerpc/perf/generic-compat-pmu.c
@@ -0,0 +1,245 @@
+/*
+ * Performance counter support.
+ *
+ * Copyright 2019 Madhavan Srinivasan, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or later version.
+ */
+
+#define pr_fmt(fmt)	"generic-compat-pmu: " fmt
+
+#include "isa207-common.h"
+
+/*
+ * Raw event encoding:
+ *
+ *        60        56        52        48        44        40        36        32
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *
+ *        28        24        20        16        12         8         4         0
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *                                 [ pmc ]   [unit ]   [ ]   m   [    pmcxsel    ]
+ *                                                     |     |
+ *                                                     |     *- mark
+ *                                                     |
+ *                                                     |
+ *                                                     *- combine
+ *
+ * Below uses IBM bit numbering.
+ *
+ * MMCR1[x:y] = unit    (PMCxUNIT)
+ * MMCR1[24]   = pmc1combine[0]
+ * MMCR1[25]   = pmc1combine[1]
+ * MMCR1[26]   = pmc2combine[0]
+ * MMCR1[27]   = pmc2combine[1]
+ * MMCR1[28]   = pmc3combine[0]
+ * MMCR1[29]   = pmc3combine[1]
+ * MMCR1[30]   = pmc4combine[0]
+ * MMCR1[31]   = pmc4combine[1]
+ *
+ */
+
+/*
+ * Some power9 event codes.
+ */
+#define EVENT(_name, _code)	_name = _code,
+
+enum {
+EVENT(PM_CYC,					0x100F0)
+EVENT(PM_INST_CMPL,				0x00002)
+EVENT(PM_BR_MPRED_CMPL,				0x400F6)
+};
+
+#undef EVENT
+
+GENERIC_EVENT_ATTR(cpu-cycles,			PM_CYC);
+GENERIC_EVENT_ATTR(instructions,		PM_INST_CMPL);
+GENERIC_EVENT_ATTR(branch-misses,               PM_BR_MPRED_CMPL);
+
+static struct attribute *generic_compat_events_attr[] = {
+	GENERIC_EVENT_PTR(PM_CYC),
+	GENERIC_EVENT_PTR(PM_INST_CMPL),
+	GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
+	NULL
+};
+
+static struct attribute_group generic_compat_pmu_events_group = {
+	.name = "events",
+	.attrs = generic_compat_events_attr,
+};
+
+PMU_FORMAT_ATTR(event,		"config:0-19");
+PMU_FORMAT_ATTR(pmcxsel,	"config:0-7");
+PMU_FORMAT_ATTR(mark,		"config:8");
+PMU_FORMAT_ATTR(combine,	"config:10-11");
+PMU_FORMAT_ATTR(unit,		"config:12-15");
+PMU_FORMAT_ATTR(pmc,		"config:16-19");
+
+static struct attribute *generic_compat_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	&format_attr_pmcxsel.attr,
+	&format_attr_mark.attr,
+	&format_attr_combine.attr,
+	&format_attr_unit.attr,
+	&format_attr_pmc.attr,
+	NULL,
+};
+
+static struct attribute_group generic_compat_pmu_format_group = {
+	.name = "format",
+	.attrs = generic_compat_pmu_format_attr,
+};
+
+static const struct attribute_group *generic_compat_pmu_attr_groups[] = {
+	&generic_compat_pmu_format_group,
+	&generic_compat_pmu_events_group,
+	NULL,
+};
+
+static int compat_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] =			PM_CYC,
+	[PERF_COUNT_HW_INSTRUCTIONS] =			PM_INST_CMPL,
+	[PERF_COUNT_HW_BRANCH_MISSES] =                 PM_BR_MPRED_CMPL,
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int generic_compat_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[ C(L1D) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(L1I) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(LL) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+	},
+	[ C(DTLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(BPU) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0,
+			[ C(RESULT_MISS)   ] = 0,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+	[ C(NODE) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS)   ] = -1,
+		},
+	},
+};
+
+#undef C
+
+static struct power_pmu generic_compat_pmu = {
+	.name			= "GENERIC_COMPAT",
+	.n_counter		= MAX_PMU_COUNTERS,
+	.add_fields		= ISA207_ADD_FIELDS,
+	.test_adder		= ISA207_TEST_ADDER,
+	.compute_mmcr		= isa207_compute_mmcr,
+	.get_constraint		= isa207_get_constraint,
+	.disable_pmc		= isa207_disable_pmc,
+	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
+	.n_generic		= ARRAY_SIZE(compat_generic_events),
+	.generic_events		= compat_generic_events,
+	.cache_events		= &generic_compat_cache_events,
+	.attr_groups		= generic_compat_pmu_attr_groups,
+};
+
+int init_generic_compat_pmu(void)
+{
+	int rc = 0;
+
+	rc = register_power_pmu(&generic_compat_pmu);
+	if (rc)
+		return rc;
+
+	/* Tell userspace that EBB is supported */
+	cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_EBB;
+
+	return 0;
+}
diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h
index e54d524d4283..185a40d1adff 100644
--- a/arch/powerpc/perf/internal.h
+++ b/arch/powerpc/perf/internal.h
@@ -14,3 +14,4 @@ extern int init_power6_pmu(void);
 extern int init_power7_pmu(void);
 extern int init_power8_pmu(void);
 extern int init_power9_pmu(void);
+extern int init_generic_compat_pmu(void);
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 1/2] powerpc/perf: init pmu from core-book3s
From: Madhavan Srinivasan @ 2019-04-29  2:52 UTC (permalink / raw)
  To: mpe; +Cc: Madhavan Srinivasan, linuxppc-dev

Currently the pmu driver file for each ppc64 generation processor
has an __init call in itself. Refactor the code by moving the
__init call to core-book3s.c. This also cleans up compat mode
pmu driver registration.

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
---
Changelog v1:
- Added "internal.h" file and moved the extern definitions to that file

 arch/powerpc/perf/core-book3s.c | 28 ++++++++++++++++++++++++++++
 arch/powerpc/perf/internal.h    | 16 ++++++++++++++++
 arch/powerpc/perf/power5+-pmu.c |  4 +---
 arch/powerpc/perf/power5-pmu.c  |  4 +---
 arch/powerpc/perf/power6-pmu.c  |  4 +---
 arch/powerpc/perf/power7-pmu.c  |  4 +---
 arch/powerpc/perf/power8-pmu.c  |  3 +--
 arch/powerpc/perf/power9-pmu.c  |  3 +--
 arch/powerpc/perf/ppc970-pmu.c  |  4 +---
 9 files changed, 51 insertions(+), 19 deletions(-)
 create mode 100644 arch/powerpc/perf/internal.h

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index b0723002a396..a96f9420139c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -22,6 +22,10 @@
 #include <asm/ptrace.h>
 #include <asm/code-patching.h>
 
+#ifdef CONFIG_PPC64
+#include "internal.h"
+#endif
+
 #define BHRB_MAX_ENTRIES	32
 #define BHRB_TARGET		0x0000000000000002
 #define BHRB_PREDICTION		0x0000000000000001
@@ -2294,3 +2298,27 @@ int register_power_pmu(struct power_pmu *pmu)
 			  power_pmu_prepare_cpu, NULL);
 	return 0;
 }
+
+#ifdef CONFIG_PPC64
+static int __init init_ppc64_pmu(void)
+{
+	/* run through all the pmu drivers one at a time */
+	if (!init_power5_pmu())
+		return 0;
+	else if (!init_power5p_pmu())
+		return 0;
+	else if (!init_power6_pmu())
+		return 0;
+	else if (!init_power7_pmu())
+		return 0;
+	else if (!init_power8_pmu())
+		return 0;
+	else if (!init_power9_pmu())
+		return 0;
+	else if (!init_ppc970_pmu())
+		return 0;
+	else
+		return -ENODEV;
+}
+early_initcall(init_ppc64_pmu);
+#endif
diff --git a/arch/powerpc/perf/internal.h b/arch/powerpc/perf/internal.h
new file mode 100644
index 000000000000..e54d524d4283
--- /dev/null
+++ b/arch/powerpc/perf/internal.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2019 Madhavan Srinivasan, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+extern int init_ppc970_pmu(void);
+extern int init_power5_pmu(void);
+extern int init_power5p_pmu(void);
+extern int init_power6_pmu(void);
+extern int init_power7_pmu(void);
+extern int init_power8_pmu(void);
+extern int init_power9_pmu(void);
diff --git a/arch/powerpc/perf/power5+-pmu.c b/arch/powerpc/perf/power5+-pmu.c
index 0526dac66007..9aa803504cb2 100644
--- a/arch/powerpc/perf/power5+-pmu.c
+++ b/arch/powerpc/perf/power5+-pmu.c
@@ -677,7 +677,7 @@ static struct power_pmu power5p_pmu = {
 	.cache_events		= &power5p_cache_events,
 };
 
-static int __init init_power5p_pmu(void)
+int init_power5p_pmu(void)
 {
 	if (!cur_cpu_spec->oprofile_cpu_type ||
 	    (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5+")
@@ -686,5 +686,3 @@ static int __init init_power5p_pmu(void)
 
 	return register_power_pmu(&power5p_pmu);
 }
-
-early_initcall(init_power5p_pmu);
diff --git a/arch/powerpc/perf/power5-pmu.c b/arch/powerpc/perf/power5-pmu.c
index 4dc99f9f7962..30cb13d081a9 100644
--- a/arch/powerpc/perf/power5-pmu.c
+++ b/arch/powerpc/perf/power5-pmu.c
@@ -618,7 +618,7 @@ static struct power_pmu power5_pmu = {
 	.flags			= PPMU_HAS_SSLOT,
 };
 
-static int __init init_power5_pmu(void)
+int init_power5_pmu(void)
 {
 	if (!cur_cpu_spec->oprofile_cpu_type ||
 	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power5"))
@@ -626,5 +626,3 @@ static int __init init_power5_pmu(void)
 
 	return register_power_pmu(&power5_pmu);
 }
-
-early_initcall(init_power5_pmu);
diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
index 9c9d646b68a1..80ec48632cfe 100644
--- a/arch/powerpc/perf/power6-pmu.c
+++ b/arch/powerpc/perf/power6-pmu.c
@@ -540,7 +540,7 @@ static struct power_pmu power6_pmu = {
 	.cache_events		= &power6_cache_events,
 };
 
-static int __init init_power6_pmu(void)
+int init_power6_pmu(void)
 {
 	if (!cur_cpu_spec->oprofile_cpu_type ||
 	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power6"))
@@ -548,5 +548,3 @@ static int __init init_power6_pmu(void)
 
 	return register_power_pmu(&power6_pmu);
 }
-
-early_initcall(init_power6_pmu);
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 6dbae9884ec4..bb6efd5d2530 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -445,7 +445,7 @@ static struct power_pmu power7_pmu = {
 	.cache_events		= &power7_cache_events,
 };
 
-static int __init init_power7_pmu(void)
+int init_power7_pmu(void)
 {
 	if (!cur_cpu_spec->oprofile_cpu_type ||
 	    strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7"))
@@ -456,5 +456,3 @@ static int __init init_power7_pmu(void)
 
 	return register_power_pmu(&power7_pmu);
 }
-
-early_initcall(init_power7_pmu);
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index d12a2db26353..bcc3409a06de 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -379,7 +379,7 @@ static struct power_pmu power8_pmu = {
 	.bhrb_nr		= 32,
 };
 
-static int __init init_power8_pmu(void)
+int init_power8_pmu(void)
 {
 	int rc;
 
@@ -399,4 +399,3 @@ static int __init init_power8_pmu(void)
 
 	return 0;
 }
-early_initcall(init_power8_pmu);
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 030544e35959..3a31ac6f4805 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -437,7 +437,7 @@ static struct power_pmu power9_pmu = {
 	.bhrb_nr		= 32,
 };
 
-static int __init init_power9_pmu(void)
+int init_power9_pmu(void)
 {
 	int rc = 0;
 	unsigned int pvr = mfspr(SPRN_PVR);
@@ -467,4 +467,3 @@ static int __init init_power9_pmu(void)
 
 	return 0;
 }
-early_initcall(init_power9_pmu);
diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c
index 8b6a8a36fa38..1d3370914022 100644
--- a/arch/powerpc/perf/ppc970-pmu.c
+++ b/arch/powerpc/perf/ppc970-pmu.c
@@ -490,7 +490,7 @@ static struct power_pmu ppc970_pmu = {
 	.flags			= PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING,
 };
 
-static int __init init_ppc970_pmu(void)
+int init_ppc970_pmu(void)
 {
 	if (!cur_cpu_spec->oprofile_cpu_type ||
 	    (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/970")
@@ -499,5 +499,3 @@ static int __init init_ppc970_pmu(void)
 
 	return register_power_pmu(&ppc970_pmu);
 }
-
-early_initcall(init_ppc970_pmu);
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH v8 05/20] KVM: PPC: Book3S HV: Remove pmd_is_leaf()
From: Paul Mackerras @ 2019-04-29  2:05 UTC (permalink / raw)
  To: Steven Price
  Cc: Mark Rutland, Peter Zijlstra, Dave Hansen, Will Deacon, linux-mm,
	H. Peter Anvin, Liang, Kan, x86, Ingo Molnar, Catalin Marinas,
	Arnd Bergmann, kvm-ppc, Jérôme Glisse, Borislav Petkov,
	Andy Lutomirski, Thomas Gleixner, linux-arm-kernel,
	Ard Biesheuvel, linux-kernel, James Morse, Andrew Morton,
	linuxppc-dev
In-Reply-To: <20190403141627.11664-6-steven.price@arm.com>

On Wed, Apr 03, 2019 at 03:16:12PM +0100, Steven Price wrote:
> Since pmd_large() is now always available, pmd_is_leaf() is redundant.
> Replace all uses with calls to pmd_large().

NAK.  I don't want to do this, because pmd_is_leaf() is purely about
the guest page tables (the "partition-scoped" radix tree which
specifies the guest physical to host physical translation), not about
anything to do with the Linux process page tables.  The guest page
tables have the same format as the Linux process page tables, but they
are managed separately.

If it makes things clearer, I could rename it to "guest_pmd_is_leaf()"
or something similar.

Paul.

^ permalink raw reply

* Re: [PATCH V32 01/27] Add the ability to lock down access to the running kernel image
From: Daniel Axtens @ 2019-04-29  0:06 UTC (permalink / raw)
  To: Matthew Garrett, Andrew Donnellan
  Cc: Linux API, cmr, James Morris, Linux Kernel Mailing List,
	David Howells, LSM List, Andy Lutomirski, linuxppc-dev
In-Reply-To: <CACdnJus9AhAAYs-R94BH7HDuuQfXjgdhdqUR6Pvk9mxbuPx1=Q@mail.gmail.com>

Matthew Garrett <mjg59@google.com> writes:

> On Tue, Apr 16, 2019 at 1:40 AM Andrew Donnellan
> <andrew.donnellan@au1.ibm.com> wrote:
>> I'm thinking about whether we should lock down the powerpc xmon debug
>> monitor - intuitively, I think the answer is yes if for no other reason
>> than Least Astonishment, when lockdown is enabled you probably don't
>> expect xmon to keep letting you access kernel memory.
>
> The original patchset contained a sysrq hotkey to allow physically
> present users to disable lockdown, so I'm not super concerned about
> this case - I could definitely be convinced otherwise, though.

So currently (and I'm pretty new to this as I've only recently rejoined
IBM) we aren't considering access to the console to be sufficient to
assert physical presence on bare-metal server-class Power machines. The
short argument for this is that with IPMI and BMCs, a server's console
isn't what it used to be. Our console is also a bit different to x86:
we don't generally have bios configuration screens on the console.

In your example, a sysrq key would allow you to disable lockdown after
the system has booted. On Power though, we use Linux as a bootloader
(Petitboot: https://github.com/open-power/petitboot) so being able to
disable lockdown there allows an IPMI-connected user to prevent a signed
kernel being loaded in the first place. I don't know if this is
_actually_ worse, but it certainly feels worse.

There are of course some arguments against our approach. I'm aware of
some of them. I'm also very open to being told that not equating console
access with physical access is fundamentally silly or broken and that we
should rethink things.

Regards,
Daniel

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.1-6 tag
From: pr-tracker-bot @ 2019-04-28 18:05 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: aik, linuxppc-dev, Linus Torvalds, linux-kernel
In-Reply-To: <87ftq2aa9u.fsf@concordia.ellerman.id.au>

The pull request you sent on Sun, 28 Apr 2019 16:55:57 +1000:

> https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git tags/powerpc-5.1-6

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/0d82044e1b7e5497c2177abd39b31e9ba27be8b7

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker

^ permalink raw reply

* Re: [PATCH 40/41] drivers: tty: serial: helper for setting mmio range
From: Andy Shevchenko @ 2019-04-28 15:39 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: lorenzo.pieralisi, linux-ia64, linux-serial, andrew, gregkh,
	sudeep.holla, liviu.dudau, linux-kernel, vz, linux, sparclinux,
	khilman, macro, slemieux.tyco, matthias.bgg, jacmet,
	linux-amlogic, linux-mips, linuxppc-dev, davem
In-Reply-To: <1556369542-13247-41-git-send-email-info@metux.net>

On Sat, Apr 27, 2019 at 02:52:21PM +0200, Enrico Weigelt, metux IT consult wrote:
> Introduce a little helpers for settings the mmio range from an
> struct resource or start/len parameters with less code.
> (also setting iotype to UPIO_MEM)
> 
> Also converting drivers to use these new helpers as well as
> fetching mapsize field instead of using hardcoded values.
> (the runtime overhead of that should be negligible)
> 
> The idea is moving to a consistent scheme, so later common
> calls like request+ioremap combination can be done by generic
> helpers.

> --- a/drivers/tty/serial/8250/8250_exar.c
> +++ b/drivers/tty/serial/8250/8250_exar.c
> @@ -134,8 +134,10 @@ static int default_setup(struct exar8250 *priv, struct pci_dev *pcidev,
>  	const struct exar8250_board *board = priv->board;
>  	unsigned int bar = 0;
>  
> -	port->port.iotype = UPIO_MEM;
> -	port->port.mapbase = pci_resource_start(pcidev, bar) + offset;
> +	uart_memres_set_start_len(&port->port,
> +				  pci_resource_start(pcidev, bar) + offset,
> +				  pci_resource_len(pcidev, bar));
> +

I don't see how it's better.
Moreover, the size argument seems wrong here.

> +		uart_memres_set_start_len(
> +			&port,
> +			FRODO_BASE + FRODO_APCI_OFFSET(1), 0);

Please avoid such splitting; the first parameter fits well on the line above.

>  		port.uartclk = HPDCA_BAUD_BASE * 16;
> -		port.mapbase = (pa + UART_OFFSET);
> +
> +		uart_memres_set_start_len(&port, (pa + UART_OFFSET));
>  		port.membase = (char *)(port.mapbase + DIO_VIRADDRBASE);
>  		port.regshift = 1;
>  		port.irq = DIO_IPL(pa + DIO_VIRADDRBASE);

Here...

>  	uart.port.flags = UPF_SKIP_TEST | UPF_SHARE_IRQ | UPF_BOOT_AUTOCONF;
>  	uart.port.irq = d->ipl;
>  	uart.port.uartclk = HPDCA_BAUD_BASE * 16;
> -	uart.port.mapbase = (d->resource.start + UART_OFFSET);
> +	uart_memres_set_start_len(&uart.port,
> +				  (d->resource.start + UART_OFFSET),
> +				  resource_size(&d->resource));
>  	uart.port.membase = (char *)(uart.port.mapbase + DIO_VIRADDRBASE);
>  	uart.port.regshift = 1;
>  	uart.port.dev = &d->dev;

...and here, and maybe in other places, you split the assignments to the
members into two parts. It is better to call your function before or after
these blocks of assignments.

> -			uport->mapsize	= ZS_CHAN_IO_SIZE;
> -			uport->mapbase	= dec_kn_slot_base +
> -					  zs_parms.scc[chip] +
> -					  (side ^ ZS_CHAN_B) * ZS_CHAN_IO_SIZE;
> +
> +			uart_memres_set_start_len(dec_kn_slot_base +
> +						    zs_parms.scc[chip] +
> +						    (side ^ ZS_CHAN_B) *
> +							ZS_CHAN_IO_SIZE,
> +						  ZS_CHAN_IO_SIZE);

This looks hard to read. Think of temporary variables and better formatting
style.

>  /*
> + * set physical io range from struct resource
> + * if resource is NULL, clear the fields
> + * also set the iotype to UPIO_MEM

Something is wrong with the punctuation and style. Please use proper casing
and split this into sentences.

> + */

Shouldn't be kernel-doc formatted?

> +static inline void uart_memres_set_res(struct uart_port *port,

Perhaps better name can be found.
Especially taking into account that it handles IO / MMIO here.

> +				       struct resource *res)
> +{
> +	if (!res) {

It should return an error in such case.

> +		port->mapsize = 0;
> +		port->mapbase = 0;
> +		port->iobase = 0;
> +		return;
> +	}
> +
> +	if (resource_type(res) == IORESOURCE_IO) {
> +		port->iotype = UPIO_PORT;
> +		port->iobase = resource->start;
> +		return;
> +	}
> +
> +	uart->mapbase = res->start;
> +	uart->mapsize = resource_size(res);

> +	uart->iotype  = UPIO_MEM;

Only one type? Why type is even set here?

> +}
> +
> +/*
> + * set physical io range by start address and length
> + * if resource is NULL, clear the fields
> + * also set the iotype to UPIO_MEM

Should be fixed as told above.

> + */

> +static inline void uart_memres_set_start_len(struct uart_driver *uart,
> +					     resource_size_t start,
> +					     resource_size_t len)

The comment doesn't tell why this is needed when we have one for struct
resource.

> +{
> +	uart->mapbase = start;
> +	uart->mapsize = len;

> +	uart->iotype  = UPIO_MEM;

Only one type?

> +}
> +
> +/*

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [PATCH 37/41] drivers: tty: serial: 8250: simplify io resource size computation
From: Andy Shevchenko @ 2019-04-28 15:21 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: lorenzo.pieralisi, linux-ia64, linux-serial, andrew, gregkh,
	sudeep.holla, liviu.dudau, linux-kernel, vz, linux, sparclinux,
	khilman, macro, slemieux.tyco, matthias.bgg, jacmet,
	linux-amlogic, linux-mips, linuxppc-dev, davem
In-Reply-To: <1556369542-13247-38-git-send-email-info@metux.net>

On Sat, Apr 27, 2019 at 02:52:18PM +0200, Enrico Weigelt, metux IT consult wrote:
> Simpily io resource size computation by setting mapsize field.
> 
> Some of the special cases handled by serial8250_port_size() can be
> simplified by putting this data to corresponding platform data
> or probe function.


> --- a/drivers/tty/serial/8250/8250.h
> +++ b/drivers/tty/serial/8250/8250.h
> @@ -105,6 +105,7 @@ struct serial8250_config {
>  
>  #define SERIAL8250_PORT(_base, _irq) SERIAL8250_PORT_FLAGS(_base, _irq, 0)
>  

> +#define SERIAL_RT2880_IOSIZE	0x100

And why this is in the header file and not in corresponding C one?

> diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
> index d09af4c..51d6076 100644
> --- a/drivers/tty/serial/8250/8250_port.c
> +++ b/drivers/tty/serial/8250/8250_port.c
> @@ -2833,11 +2833,7 @@ unsigned int serial8250_port_size(struct uart_8250_port *pt)
>  {
>  	if (pt->port.mapsize)
>  		return pt->port.mapsize;
> -	if (pt->port.iotype == UPIO_AU) {
> -		if (pt->port.type == PORT_RT2880)
> -			return 0x100;
> -		return 0x1000;
> -	}
> +
>  	if (is_omap1_8250(pt))
>  		return 0x16 << pt->port.regshift;

This is good. We definitely need to get rid of custom stuff in generic
8250_port, etc.

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [PATCH 36/41] drivers: tty: serial: 8250: store mmio resource size in port struct
From: Andy Shevchenko @ 2019-04-28 15:18 UTC (permalink / raw)
  To: Enrico Weigelt, metux IT consult
  Cc: lorenzo.pieralisi, linux-ia64, linux-serial, andrew, gregkh,
	sudeep.holla, liviu.dudau, linux-kernel, vz, linux, sparclinux,
	khilman, macro, slemieux.tyco, matthias.bgg, jacmet,
	linux-amlogic, linux-mips, linuxppc-dev, davem
In-Reply-To: <1556369542-13247-37-git-send-email-info@metux.net>

On Sat, Apr 27, 2019 at 02:52:17PM +0200, Enrico Weigelt, metux IT consult wrote:
> The io resource size is currently recomputed when it's needed but this
> actually needs to be done once (or drivers could specify fixed values)

io -> IO

> 
> Simplify that by doing this computation only once and storing the result
> into the mapsize field. serial8250_register_8250_port() is now called
> only once on driver init, the previous call sites now just fetch the
> value from the mapsize field.

Do I understand correctly that this has no side effects?

Which hardware did you test this on?

> @@ -979,6 +979,9 @@ int serial8250_register_8250_port(struct uart_8250_port *up)
>  	if (up->port.uartclk == 0)
>  		return -EINVAL;
>  
> +	/* compute the mapsize in case the driver didn't specify one */
> +	up->mapsize = serial8250_port_size(up);

I don't know all quirks in 8250 drivers by heart, though can you guarantee that
at this point the device reports correct IO resource size? (If I'm not mistaken
some broken hardware needs some magic to be done before card can be properly
handled)

> -	unsigned int size = serial8250_port_size(up);
>  	struct uart_port *port = &up->port;

> -	int ret = 0;

This and Co is a separate change that can be done in its own patch.

> +			port->membase = ioremap_nocache(port->mapbase,
> +							port->mapsize);

You may increase readability by introducing temporary variables

	... mapbase = port->mapbase;
	... mapsize = port->mapsize;
	...
	port->membase = ioremap_nocache(mapbase, mapsize);
	...

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* [PATCH v10 2/2] powerpc/64s: KVM update for reimplement book3s idle code in C
From: Nicholas Piggin @ 2019-04-28 11:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gautham R . Shenoy, kvm-ppc, Nicholas Piggin
In-Reply-To: <20190428114515.32683-1-npiggin@gmail.com>

This is the KVM update to the new idle code. A few improvements:

- Idle sleepers now always return to caller rather than branch out
  to KVM first.
- This allows optimisations like very fast return to caller when no
  state has been lost.
- KVM no longer requires nap_state_lost because it controls NVGPR
  save/restore itself on the way in and out.
- The heavy idle wakeup KVM request check can be moved out of the
  normal host idle code and into the not-performance-critical offline
  code.
- KVM nap code now returns from where it is called, which makes the
  flow a bit easier to follow.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/paca.h         |   1 -
 arch/powerpc/kernel/asm-offsets.c       |   1 -
 arch/powerpc/kernel/exceptions-64s.S    |  14 ++-
 arch/powerpc/kernel/idle_book3s.S       |  22 -----
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 118 ++++++++++++++----------
 arch/powerpc/platforms/powernv/idle.c   |  15 +++
 arch/powerpc/xmon/xmon.c                |   3 -
 7 files changed, 92 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index e55dedd7ee3e..245d11a71784 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -174,7 +174,6 @@ struct paca_struct {
 	u8 io_sync;			/* writel() needs spin_unlock sync */
 	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
 	u8 pmcregs_in_use;		/* pseries puts this in lppaca */
 #endif
 	u64 sprg_vdso;			/* Saved user-visible sprg */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index a14a77bc4243..8e02444e9d3d 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -271,7 +271,6 @@ int main(void)
 	OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
 	OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime);
 	OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save);
-	OFFSET(PACA_NAPSTATELOST, paca_struct, nap_state_lost);
 	OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso);
 #else /* CONFIG_PPC64 */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 671c256da167..6b86055e5251 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -122,6 +122,8 @@ EXC_VIRT_NONE(0x4000, 0x100)
 	rlwinm.	r10,r10,47-31,30,31 ;					\
 	beq-	1f ;							\
 	cmpwi	cr1,r10,2 ;						\
+	mfspr	r3,SPRN_SRR1 ;						\
+	bltlr	cr1 ;	/* no state loss, return to idle caller */	\
 	BRANCH_TO_C000(r10, system_reset_idle_common) ;			\
 1:									\
 	KVMTEST_PR(n) ;							\
@@ -145,12 +147,10 @@ TRAMP_KVM(PACA_EXNMI, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
-	mfspr	r3,SPRN_SRR1
-#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	/* this bltlr could  be moved before the branch_to, and the
-	 * branch_to could maybe go straight to idle_return */
-	bltlr	cr1	/* no state loss, return to idle caller */
-#endif
+	/*
+	 * This must be a direct branch (without linker branch stub) because
+	 * we can not use TOC at this point as r2 may not be restored yet.
+	 */
 	b	idle_return_gpr_loss
 #endif
 
@@ -443,9 +443,7 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	mtlr	r4
 	rlwinm	r10,r3,47-31,30,31
 	cmpwi	cr1,r10,2
-#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	bltlr	cr1	/* no state loss, return to idle caller */
-#endif
 	b	idle_return_gpr_loss
 #endif
 	/*
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 0fb2eb731a29..2dfbd5d5b932 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -19,9 +19,6 @@
 #include <asm/asm-offsets.h>
 #include <asm/ppc-opcode.h>
 #include <asm/cpuidle.h>
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-#include <asm/kvm_book3s_asm.h>
-#endif
 
 /*
  * Desired PSSCR in r3
@@ -93,25 +90,6 @@ _GLOBAL(isa300_idle_stop_mayloss)
  * a simple blr instead).
  */
 _GLOBAL(idle_return_gpr_loss)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	lbz	r0,HSTATE_HWTHREAD_STATE(r13)
-	cmpwi	r0,KVM_HWTHREAD_IN_KERNEL
-	beq	0f
-	li	r0,KVM_HWTHREAD_IN_KERNEL
-	stb	r0,HSTATE_HWTHREAD_STATE(r13)
-	/* Order setting hwthread_state vs. testing hwthread_req */
-	sync
-0:	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
-	cmpwi	r0,0
-	beq	1f
-	b	kvm_start_guest
-1:
-	lbz	r0,PACA_NAPSTATELOST(r13)
-	cmpwi	r0,0
-	bne	2f
-	bltlr	cr1
-2:
-#endif
 	ld	r1,PACAR1(r13)
 	ld	r4,-8*19(r1)
 	ld	r5,-8*20(r1)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 58d0f1ba845d..f66191d8f841 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -35,6 +35,7 @@
 #include <asm/thread_info.h>
 #include <asm/asm-compat.h>
 #include <asm/feature-fixups.h>
+#include <asm/cpuidle.h>
 
 /* Sign-extend HDEC if not on POWER9 */
 #define EXTEND_HDEC(reg)			\
@@ -45,6 +46,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 /* Values in HSTATE_NAPPING(r13) */
 #define NAPPING_CEDE	1
 #define NAPPING_NOVCPU	2
+#define NAPPING_UNSPLIT	3
 
 /* Stack frame offsets for kvmppc_hv_entry */
 #define SFS			208
@@ -290,17 +292,19 @@ kvm_novcpu_exit:
 	b	kvmhv_switch_to_host
 
 /*
- * We come in here when wakened from nap mode.
- * Relocation is off and most register values are lost.
- * r13 points to the PACA.
+ * We come in here when wakened from Linux offline idle code.
+ * Relocation is off
  * r3 contains the SRR1 wakeup value, SRR1 is trashed.
  */
-	.globl	kvm_start_guest
-kvm_start_guest:
-	/* Set runlatch bit the minute you wake up from nap */
-	mfspr	r0, SPRN_CTRLF
-	ori 	r0, r0, 1
-	mtspr	SPRN_CTRLT, r0
+_GLOBAL(idle_kvm_start_guest)
+	ld	r4,PACAEMERGSP(r13)
+	mfcr	r5
+	mflr	r0
+	std	r1,0(r4)
+	std	r5,8(r4)
+	std	r0,16(r4)
+	subi	r1,r4,STACK_FRAME_OVERHEAD
+	SAVE_NVGPRS(r1)
 
 	/*
 	 * Could avoid this and pass it through in r3. For now,
@@ -308,27 +312,23 @@ kvm_start_guest:
 	 */
 	mtspr	SPRN_SRR1,r3
 
-	ld	r2,PACATOC(r13)
-
 	li	r0,0
 	stb	r0,PACA_FTRACE_ENABLED(r13)
 
 	li	r0,KVM_HWTHREAD_IN_KVM
 	stb	r0,HSTATE_HWTHREAD_STATE(r13)
 
-	/* NV GPR values from power7_idle() will no longer be valid */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-
-	/* were we napping due to cede? */
+	/* kvm cede / napping does not come through here */
 	lbz	r0,HSTATE_NAPPING(r13)
-	cmpwi	r0,NAPPING_CEDE
-	beq	kvm_end_cede
-	cmpwi	r0,NAPPING_NOVCPU
-	beq	kvm_novcpu_wakeup
+	twnei	r0,0
+
+	b	1f
 
-	ld	r1,PACAEMERGSP(r13)
-	subi	r1,r1,STACK_FRAME_OVERHEAD
+kvm_unsplit_wakeup:
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+
+1:
 
 	/*
 	 * We weren't napping due to cede, so this must be a secondary
@@ -437,21 +437,25 @@ kvm_no_guest:
 	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r3, 0
 	bne	54f
-/*
- * We jump to pnv_wakeup_loss, which will return to the caller
- * of power7_nap in the powernv cpu offline loop.  The value we
- * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss
- * requires SRR1 in r12.
- */
+
+	/*
+	 * Jump to idle_return_gpr_loss, which returns to the
+	 * idle_kvm_start_guest caller.
+	 */
 	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR, r4
-	li	r3, 0
-	/* set up cr3 and r3 for return */
-	cmpdi	cr3, r3, 0
+	/* set up r3 for return */
 	mfspr	r3,SPRN_SRR1
-	b	idle_return_gpr_loss
+	REST_NVGPRS(r1)
+	addi	r1, r1, STACK_FRAME_OVERHEAD
+	ld	r0, 16(r1)
+	ld	r5, 8(r1)
+	ld	r1, 0(r1)
+	mtlr	r0
+	mtcr	r5
+	blr
 
 53:	HMT_LOW
 	ld	r5, HSTATE_KVM_VCORE(r13)
@@ -536,6 +540,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	lbz	r0, KVM_SPLIT_DO_NAP(r3)
 	cmpwi	r0, 0
 	beq	57f
+	li	r3, NAPPING_UNSPLIT
+	stb	r3, HSTATE_NAPPING(r13)
 	li	r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
 	mfspr	r5, SPRN_LPCR
 	rlwimi	r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
@@ -2656,6 +2662,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 	lis	r3, LPCR_PECEDP@h	/* Do wake on privileged doorbell */
 
+	/* Go back to host stack */
+	ld	r1, HSTATE_HOST_R1(r13)
+
 	/*
 	 * Take a nap until a decrementer or external or doobell interrupt
 	 * occurs, with PECE1 and PECE0 set in LPCR.
@@ -2684,26 +2693,42 @@ BEGIN_FTR_SECTION
 	 *		requested level = 0 (just stop dispatching)
 	 */
 	lis	r3, (PSSCR_EC | PSSCR_ESL)@h
-	mtspr	SPRN_PSSCR, r3
 	/* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
 	li	r4, LPCR_PECE_HVEE@higher
 	sldi	r4, r4, 32
 	or	r5, r5, r4
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+FTR_SECTION_ELSE
+	li	r3, PNV_THREAD_NAP
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 	mtspr	SPRN_LPCR,r5
 	isync
-	li	r0, 0
-	std	r0, HSTATE_SCRATCH0(r13)
-	ptesync
-	ld	r0, HSTATE_SCRATCH0(r13)
-1:	cmpd	r0, r0
-	bne	1b
+
 BEGIN_FTR_SECTION
-	nap
+	bl	isa300_idle_stop_mayloss
 FTR_SECTION_ELSE
-	PPC_STOP
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
-	b	.
+	bl	isa206_idle_insn_mayloss
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+	mfspr	r0, SPRN_CTRLF
+	ori	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
+
+	mtspr	SPRN_SRR1, r3
+
+	li	r0, 0
+	stb	r0, PACA_FTRACE_ENABLED(r13)
+
+	li	r0, KVM_HWTHREAD_IN_KVM
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+
+	lbz	r0, HSTATE_NAPPING(r13)
+	cmpwi	r0, NAPPING_CEDE
+	beq	kvm_end_cede
+	cmpwi	r0, NAPPING_NOVCPU
+	beq	kvm_novcpu_wakeup
+	cmpwi	r0, NAPPING_UNSPLIT
+	beq	kvm_unsplit_wakeup
+	twi	31,0,0 /* Nap state must not be zero */
 
 33:	mr	r4, r3
 	li	r3, 0
@@ -2711,12 +2736,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 	b	34f
 
 kvm_end_cede:
+	/* Woken by external or decrementer interrupt */
+
 	/* get vcpu pointer */
 	ld	r4, HSTATE_KVM_VCPU(r13)
 
-	/* Woken by external or decrementer interrupt */
-	ld	r1, HSTATE_HOST_R1(r13)
-
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	addi	r3, r4, VCPU_TB_RMINTR
 	bl	kvmhv_accumulate_time
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 182112fd845a..c9133f7908ca 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -505,6 +505,8 @@ static unsigned long power7_idle_insn(unsigned long type)
 	return srr1;
 }
 
+extern unsigned long idle_kvm_start_guest(unsigned long srr1);
+
 #ifdef CONFIG_HOTPLUG_CPU
 static unsigned long power7_offline(void)
 {
@@ -536,6 +538,14 @@ static unsigned long power7_offline(void)
 	srr1 = power7_idle_insn(power7_offline_type);
 	__ppc64_runlatch_on();
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	smp_mb();
+	if (local_paca->kvm_hstate.hwthread_req)
+		srr1 = idle_kvm_start_guest(srr1);
+#endif
+
 	mtmsr(MSR_KERNEL);
 
 	return srr1;
@@ -823,6 +833,11 @@ static unsigned long power9_offline_stop(unsigned long psscr)
 	srr1 = power9_idle_stop(psscr, false);
 	__ppc64_runlatch_on();
 
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	smp_mb();
+	if (local_paca->kvm_hstate.hwthread_req)
+		srr1 = idle_kvm_start_guest(srr1);
 	mtmsr(MSR_KERNEL);
 #endif
 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 77197110e900..e583ed3f6b93 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2431,9 +2431,6 @@ static void dump_one_paca(int cpu)
 	DUMP(p, irq_happened, "%#-*x");
 	DUMP(p, io_sync, "%#-*x");
 	DUMP(p, irq_work_pending, "%#-*x");
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	DUMP(p, nap_state_lost, "%#-*x");
-#endif
 	DUMP(p, sprg_vdso, "%#-*llx");
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-- 
2.20.1


^ permalink raw reply related

* [PATCH v10 1/2] powerpc/64s: reimplement book3s idle code in C
From: Nicholas Piggin @ 2019-04-28 11:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gautham R . Shenoy, kvm-ppc, Nicholas Piggin
In-Reply-To: <20190428114515.32683-1-npiggin@gmail.com>

Reimplement Book3S idle code in C, moving POWER7/8/9 implementation
specific HV idle code to the powernv platform code.

Book3S assembly stubs are kept in common code and used only to save
the stack frame and non-volatile GPRs before executing architected
idle instructions, and restoring the stack and reloading GPRs then
returning to C after waking from idle.

The complex logic dealing with threads and subcores, locking, SPRs,
HMIs, timebase resync, etc., is all done in C which makes it more
maintainable.

This is not a strict 1:1 translation to C code, there are some
significant differences:

- Idle wakeup no longer uses the ->cpu_restore call to reinit SPRs,
  but saves and restores them itself.

- The optimisation where EC=ESL=0 idle modes did not have to save GPRs
  or change MSR is restored, because it's now simple to do. ESL=1
  sleeps that do not lose GPRs can use this optimization too.

- KVM secondary entry and cede is now more of a call/return style
  rather than branchy. nap_state_lost is not required because KVM
  always returns via NVGPR restoring path.

- KVM secondary wakeup from offline sequence is moved entirely into
  the offline wakeup, which avoids a hwsync in the normal idle wakeup
  path.

Performance measured with context switch ping-pong on different
threads or cores is possibly improved by a small amount, 1-3% depending
on stop state and core vs thread test, for shallow states. For deep
states the difference is in the noise compared with other latencies.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Notes:
- I've broken the KVM code into the second patch just for review. The
  first patch makes KVM kind-of work with its existing design in the
  new idle code.

  The problem is the kvm_start_guest case branches out from the low
  level asm, which now no longer fixes up SPRs. Guests will somewhat
  run with deep idle states disabled for offline.

  Rather than a significant rework of the code to make old-KVM work
  with new-idle code that would need testing and review and then gets
  undone by the next patch, I have just broken it up like this for
  hopefully easier review of the KVM parts. Patches can be squashed
  together before upstream merge.

- P9 restores some of the PMU SPRs, but not others, and P8 only zeroes
  them. There are improvements to be made to SPR save/restore policies and
  documentation, but this first pass tries to keep things as they were.
  More work likely needs to be done here.

Left to do:
- Test actual POWER7 hardware. Will check about that tomorrow.

Since v9:
- Fix ELFv1 build error.
- Port to powerpc next.
- Test POWER8 and POWER9 online/offline
- Test POWER8 and POWER9 KVM guests with threads, combinations of MMU
  modes in guest and host, indep threads mode, etc.
---
 arch/powerpc/include/asm/cpuidle.h       |   19 +-
 arch/powerpc/include/asm/paca.h          |   41 +-
 arch/powerpc/include/asm/processor.h     |    9 +-
 arch/powerpc/include/asm/reg.h           |    8 +-
 arch/powerpc/kernel/asm-offsets.c        |   17 -
 arch/powerpc/kernel/exceptions-64s.S     |   25 +-
 arch/powerpc/kernel/idle_book3s.S        | 1097 +++-------------------
 arch/powerpc/kernel/setup-common.c       |    4 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |    6 +-
 arch/powerpc/platforms/powernv/idle.c    |  887 +++++++++++++----
 arch/powerpc/platforms/powernv/subcore.c |    2 +-
 arch/powerpc/xmon/xmon.c                 |   25 +-
 12 files changed, 941 insertions(+), 1199 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 43e5f31fe64d..9844b3ded187 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -27,10 +27,11 @@
  * the THREAD_WINKLE_BITS are set, which indicate which threads have not
  * yet woken from the winkle state.
  */
-#define PNV_CORE_IDLE_LOCK_BIT			0x10000000
+#define NR_PNV_CORE_IDLE_LOCK_BIT		28
+#define PNV_CORE_IDLE_LOCK_BIT			(1ULL << NR_PNV_CORE_IDLE_LOCK_BIT)
 
+#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT	16
 #define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
-#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT	0x00080000
 #define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
 #define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
@@ -68,16 +69,6 @@
 #define ERR_DEEP_STATE_ESL_MISMATCH	-2
 
 #ifndef __ASSEMBLY__
-/* Additional SPRs that need to be saved/restored during stop */
-struct stop_sprs {
-	u64 pid;
-	u64 ldbar;
-	u64 fscr;
-	u64 hfscr;
-	u64 mmcr1;
-	u64 mmcr2;
-	u64 mmcra;
-};
 
 #define PNV_IDLE_NAME_LEN    16
 struct pnv_idle_states_t {
@@ -92,10 +83,6 @@ struct pnv_idle_states_t {
 
 extern struct pnv_idle_states_t *pnv_idle_states;
 extern int nr_pnv_idle_states;
-extern u32 pnv_fastsleep_workaround_at_entry[];
-extern u32 pnv_fastsleep_workaround_at_exit[];
-
-extern u64 pnv_first_deep_stop_state;
 
 unsigned long pnv_cpu_offline(unsigned int cpu);
 int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index e843bc5d1a0f..e55dedd7ee3e 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -173,8 +173,8 @@ struct paca_struct {
 	u8 irq_happened;		/* irq happened while soft-disabled */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
 	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
-	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
 	u8 pmcregs_in_use;		/* pseries puts this in lppaca */
 #endif
 	u64 sprg_vdso;			/* Saved user-visible sprg */
@@ -183,23 +183,28 @@ struct paca_struct {
 #endif
 
 #ifdef CONFIG_PPC_POWERNV
-	/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
-	u32 *core_idle_state_ptr;
-	u8 thread_idle_state;		/* PNV_THREAD_RUNNING/NAP/SLEEP	*/
-	/* Mask to indicate thread id in core */
-	u8 thread_mask;
-	/* Mask to denote subcore sibling threads */
-	u8 subcore_sibling_mask;
-	/* Flag to request this thread not to stop */
-	atomic_t dont_stop;
-	/* The PSSCR value that the kernel requested before going to stop */
-	u64 requested_psscr;
-
-	/*
-	 * Save area for additional SPRs that need to be
-	 * saved/restored during cpuidle stop.
-	 */
-	struct stop_sprs stop_sprs;
+	/* PowerNV idle fields */
+	/* PNV_CORE_IDLE_* bits, all siblings work on thread 0 paca */
+	unsigned long idle_state;
+	union {
+		/* P7/P8 specific fields */
+		struct {
+			/* PNV_THREAD_RUNNING/NAP/SLEEP	*/
+			u8 thread_idle_state;
+			/* Mask to denote subcore sibling threads */
+			u8 subcore_sibling_mask;
+		};
+
+		/* P9 specific fields */
+		struct {
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+			/* The PSSCR value that the kernel requested before going to stop */
+			u64 requested_psscr;
+			/* Flag to request this thread not to stop */
+			atomic_t dont_stop;
+#endif
+		};
+	};
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 540949b397d4..706ac5df546f 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -414,14 +414,17 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
 }
 #endif
 
+/* asm stubs */
+extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
+extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
+extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
+
 extern unsigned long cpuidle_disable;
 enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;	/* set if nap mode can be used in idle loop */
-extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/
+
 extern void power7_idle_type(unsigned long type);
-extern unsigned long power9_idle_stop(unsigned long psscr_val);
-extern unsigned long power9_offline_stop(unsigned long psscr_val);
 extern void power9_idle_type(unsigned long stop_psscr_val,
 			      unsigned long stop_psscr_mask);
 
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c5b2aff0ce8e..10caa145f98b 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -168,6 +168,7 @@
 #define PSSCR_ESL		0x00200000 /* Enable State Loss */
 #define PSSCR_SD		0x00400000 /* Status Disable */
 #define PSSCR_PLS	0xf000000000000000 /* Power-saving Level Status */
+#define PSSCR_PLS_SHIFT	60
 #define PSSCR_GUEST_VIS	0xf0000000000003ffUL /* Guest-visible PSSCR fields */
 #define PSSCR_FAKE_SUSPEND	0x00000400 /* Fake-suspend bit (P9 DD2.2) */
 #define PSSCR_FAKE_SUSPEND_LG	10	   /* Fake-suspend bit position */
@@ -758,10 +759,9 @@
 #define	  SRR1_WAKERESET	0x00100000 /* System reset */
 #define   SRR1_WAKEHDBELL	0x000c0000 /* Hypervisor doorbell on P8 */
 #define	  SRR1_WAKESTATE	0x00030000 /* Powersave exit mask [46:47] */
-#define	  SRR1_WS_DEEPEST	0x00030000 /* Some resources not maintained,
-					  * may not be recoverable */
-#define	  SRR1_WS_DEEPER	0x00020000 /* Some resources not maintained */
-#define	  SRR1_WS_DEEP		0x00010000 /* All resources maintained */
+#define	  SRR1_WS_HVLOSS	0x00030000 /* HV resources not maintained */
+#define	  SRR1_WS_GPRLOSS	0x00020000 /* GPRs not maintained */
+#define	  SRR1_WS_NOLOSS	0x00010000 /* All resources maintained */
 #define   SRR1_PROGTM		0x00200000 /* TM Bad Thing */
 #define   SRR1_PROGFPE		0x00100000 /* Floating Point Enabled */
 #define   SRR1_PROGILL		0x00080000 /* Illegal instruction */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 60b82198de7c..a14a77bc4243 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -773,23 +773,6 @@ int main(void)
 	OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl);
 #endif
 
-#ifdef CONFIG_PPC_POWERNV
-	OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr);
-	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
-	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
-	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
-	OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
-	OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
-#define STOP_SPR(x, f)	OFFSET(x, paca_struct, stop_sprs.f)
-	STOP_SPR(STOP_PID, pid);
-	STOP_SPR(STOP_LDBAR, ldbar);
-	STOP_SPR(STOP_FSCR, fscr);
-	STOP_SPR(STOP_HFSCR, hfscr);
-	STOP_SPR(STOP_MMCR1, mmcr1);
-	STOP_SPR(STOP_MMCR2, mmcr2);
-	STOP_SPR(STOP_MMCRA, mmcra);
-#endif
-
 	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
 	DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index bedd89438827..671c256da167 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -121,7 +121,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
 	mfspr	r10,SPRN_SRR1 ;						\
 	rlwinm.	r10,r10,47-31,30,31 ;					\
 	beq-	1f ;							\
-	cmpwi	cr3,r10,2 ;						\
+	cmpwi	cr1,r10,2 ;						\
 	BRANCH_TO_C000(r10, system_reset_idle_common) ;			\
 1:									\
 	KVMTEST_PR(n) ;							\
@@ -145,8 +145,13 @@ TRAMP_KVM(PACA_EXNMI, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
-	mfspr	r12,SPRN_SRR1
-	b	pnv_powersave_wakeup
+	mfspr	r3,SPRN_SRR1
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* this bltlr could be moved before the branch_to, and the
+	 * branch_to could maybe go straight to idle_return */
+	bltlr	cr1	/* no state loss, return to idle caller */
+#endif
+	b	idle_return_gpr_loss
 #endif
 
 /*
@@ -429,17 +434,19 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
 	 * Then decrement MCE nesting after finishing with the stack.
 	 */
 	ld	r3,_MSR(r1)
+	ld	r4,_LINK(r1)
 
 	lhz	r11,PACA_IN_MCE(r13)
 	subi	r11,r11,1
 	sth	r11,PACA_IN_MCE(r13)
 
-	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
-	/* Recoverability could be improved by reducing the use of SRR1. */
-	li	r11,0
-	mtmsrd	r11,1
-
-	b	pnv_powersave_wakeup_mce
+	mtlr	r4
+	rlwinm	r10,r3,47-31,30,31
+	cmpwi	cr1,r10,2
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	bltlr	cr1	/* no state loss, return to idle caller */
+#endif
+	b	idle_return_gpr_loss
 #endif
 	/*
 	 * Handle machine check early in real mode. We come here with
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 4a860d3b9229..0fb2eb731a29 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -1,539 +1,98 @@
 /*
- *  This file contains idle entry/exit functions for POWER7,
- *  POWER8 and POWER9 CPUs.
+ *  Copyright 2018, IBM Corporation.
  *
  *  This program is free software; you can redistribute it and/or
  *  modify it under the terms of the GNU General Public License
  *  as published by the Free Software Foundation; either version
  *  2 of the License, or (at your option) any later version.
+ *
+ *  This file contains general idle entry/exit functions to save
+ *  and restore stack and NVGPRs which allows C code to call idle
+ *  states that lose GPRs, and it will return transparently with
+ *  SRR1 wakeup reason return value.
+ *
+ *  The platform / CPU caller must ensure SPRs and any other non-GPR
+ *  state is saved and restored correctly, handle KVM, interrupts, etc.
  */
 
-#include <linux/threads.h>
-#include <asm/processor.h>
-#include <asm/page.h>
-#include <asm/cputable.h>
-#include <asm/thread_info.h>
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/ppc-opcode.h>
-#include <asm/hw_irq.h>
-#include <asm/kvm_book3s_asm.h>
-#include <asm/opal.h>
 #include <asm/cpuidle.h>
-#include <asm/exception-64s.h>
-#include <asm/book3s/64/mmu-hash.h>
-#include <asm/mmu.h>
-#include <asm/asm-compat.h>
-#include <asm/feature-fixups.h>
-
-#undef DEBUG
-
-/*
- * Use unused space in the interrupt stack to save and restore
- * registers for winkle support.
- */
-#define _MMCR0	GPR0
-#define _SDR1	GPR3
-#define _PTCR	GPR3
-#define _RPR	GPR4
-#define _SPURR	GPR5
-#define _PURR	GPR6
-#define _TSCR	GPR7
-#define _DSCR	GPR8
-#define _AMOR	GPR9
-#define _WORT	GPR10
-#define _WORC	GPR11
-#define _LPCR	GPR12
-
-#define PSSCR_EC_ESL_MASK_SHIFTED          (PSSCR_EC | PSSCR_ESL) >> 16
-
-	.text
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+#include <asm/kvm_book3s_asm.h>
+#endif
 
 /*
- * Used by threads before entering deep idle states. Saves SPRs
- * in interrupt stack frame
- */
-save_sprs_to_stack:
-	/*
-	 * Note all register i.e per-core, per-subcore or per-thread is saved
-	 * here since any thread in the core might wake up first
-	 */
-BEGIN_FTR_SECTION
-	/*
-	 * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
-	 * SDR1 here
-	 */
-	mfspr	r3,SPRN_PTCR
-	std	r3,_PTCR(r1)
-	mfspr	r3,SPRN_LPCR
-	std	r3,_LPCR(r1)
-FTR_SECTION_ELSE
-	mfspr	r3,SPRN_SDR1
-	std	r3,_SDR1(r1)
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
-	mfspr	r3,SPRN_RPR
-	std	r3,_RPR(r1)
-	mfspr	r3,SPRN_SPURR
-	std	r3,_SPURR(r1)
-	mfspr	r3,SPRN_PURR
-	std	r3,_PURR(r1)
-	mfspr	r3,SPRN_TSCR
-	std	r3,_TSCR(r1)
-	mfspr	r3,SPRN_DSCR
-	std	r3,_DSCR(r1)
-	mfspr	r3,SPRN_AMOR
-	std	r3,_AMOR(r1)
-	mfspr	r3,SPRN_WORT
-	std	r3,_WORT(r1)
-	mfspr	r3,SPRN_WORC
-	std	r3,_WORC(r1)
-/*
- * On POWER9, there are idle states such as stop4, invoked via cpuidle,
- * that lose hypervisor resources. In such cases, we need to save
- * additional SPRs before entering those idle states so that they can
- * be restored to their older values on wakeup from the idle state.
+ * Desired PSSCR in r3
  *
- * On POWER8, the only such deep idle state is winkle which is used
- * only in the context of CPU-Hotplug, where these additional SPRs are
- * reinitiazed to a sane value. Hence there is no need to save/restore
- * these SPRs.
- */
-BEGIN_FTR_SECTION
-	blr
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-
-power9_save_additional_sprs:
-	mfspr	r3, SPRN_PID
-	mfspr	r4, SPRN_LDBAR
-	std	r3, STOP_PID(r13)
-	std	r4, STOP_LDBAR(r13)
-
-	mfspr	r3, SPRN_FSCR
-	mfspr	r4, SPRN_HFSCR
-	std	r3, STOP_FSCR(r13)
-	std	r4, STOP_HFSCR(r13)
-
-	mfspr	r3, SPRN_MMCRA
-	mfspr	r4, SPRN_MMCR0
-	std	r3, STOP_MMCRA(r13)
-	std	r4, _MMCR0(r1)
-
-	mfspr	r3, SPRN_MMCR1
-	mfspr	r4, SPRN_MMCR2
-	std	r3, STOP_MMCR1(r13)
-	std	r4, STOP_MMCR2(r13)
-	blr
-
-power9_restore_additional_sprs:
-	ld	r3,_LPCR(r1)
-	ld	r4, STOP_PID(r13)
-	mtspr	SPRN_LPCR,r3
-	mtspr	SPRN_PID, r4
-
-	ld	r3, STOP_LDBAR(r13)
-	ld	r4, STOP_FSCR(r13)
-	mtspr	SPRN_LDBAR, r3
-	mtspr	SPRN_FSCR, r4
-
-	ld	r3, STOP_HFSCR(r13)
-	ld	r4, STOP_MMCRA(r13)
-	mtspr	SPRN_HFSCR, r3
-	mtspr	SPRN_MMCRA, r4
-
-	ld	r3, _MMCR0(r1)
-	ld	r4, STOP_MMCR1(r13)
-	mtspr	SPRN_MMCR0, r3
-	mtspr	SPRN_MMCR1, r4
-
-	ld	r3, STOP_MMCR2(r13)
-	ld	r4, PACA_SPRG_VDSO(r13)
-	mtspr	SPRN_MMCR2, r3
-	mtspr	SPRN_SPRG3, r4
-	blr
-
-/*
- * Used by threads when the lock bit of core_idle_state is set.
- * Threads will spin in HMT_LOW until the lock bit is cleared.
- * r14 - pointer to core_idle_state
- * r15 - used to load contents of core_idle_state
- * r9  - used as a temporary variable
+ * No state will be lost regardless of wakeup mechanism (interrupt or NIA).
+ *
+ * An EC=0 type wakeup will return with a value of 0. SRESET wakeup (which can
+ * happen with xscom SRESET and possibly MCE) may clobber volatiles except LR,
+ * and must blr, to return to caller with r3 set according to caller's expected
+ * return code (for Book3S/64 that is SRR1).
  */
-
-core_idle_lock_held:
-	HMT_LOW
-3:	lwz	r15,0(r14)
-	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	bne	3b
-	HMT_MEDIUM
-	lwarx	r15,0,r14
-	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	bne-	core_idle_lock_held
+_GLOBAL(isa300_idle_stop_noloss)
+	mtspr 	SPRN_PSSCR,r3
+	PPC_STOP
+	li	r3,0
 	blr
 
-/* Reuse some unused pt_regs slots for AMR/IAMR/UAMOR/UAMOR */
-#define PNV_POWERSAVE_AMR	_TRAP
-#define PNV_POWERSAVE_IAMR	_DAR
-#define PNV_POWERSAVE_UAMOR	_DSISR
-#define PNV_POWERSAVE_AMOR	RESULT
-
 /*
- * Pass requested state in r3:
- *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
- *	   - Requested PSSCR value in POWER9
+ * Desired PSSCR in r3
+ *
+ * GPRs may be lost, so they are saved here. Wakeup is by interrupt only.
+ * The SRESET wakeup returns to this function's caller by calling
+ * idle_return_gpr_loss with r3 set to desired return value.
+ *
+ * A wakeup without GPR loss may alternatively be handled as in
+ * isa300_idle_stop_noloss and blr directly, as an optimisation.
  *
- * Address of idle handler to branch to in realmode in r4
+ * The caller is responsible for saving/restoring SPRs, MSR, timebase,
+ * etc.
  */
-pnv_powersave_common:
-	/* Use r3 to pass state nap/sleep/winkle */
-	/* NAP is a state loss, we create a regs frame on the
-	 * stack, fill it up with the state we care about and
-	 * stick a pointer to it in PACAR1. We really only
-	 * need to save PC, some CR bits and the NV GPRs,
-	 * but for now an interrupt frame will do.
-	 */
-	mtctr	r4
-
-	mflr	r0
-	std	r0,16(r1)
-	stdu	r1,-INT_FRAME_SIZE(r1)
-	std	r0,_LINK(r1)
-	std	r0,_NIP(r1)
-
-	/* We haven't lost state ... yet */
-	li	r0,0
-	stb	r0,PACA_NAPSTATELOST(r13)
-
-	/* Continue saving state */
-	SAVE_GPR(2, r1)
-	SAVE_NVGPRS(r1)
-
-BEGIN_FTR_SECTION
-	mfspr	r4, SPRN_AMR
-	mfspr	r5, SPRN_IAMR
-	mfspr	r6, SPRN_UAMOR
-	std	r4, PNV_POWERSAVE_AMR(r1)
-	std	r5, PNV_POWERSAVE_IAMR(r1)
-	std	r6, PNV_POWERSAVE_UAMOR(r1)
-BEGIN_FTR_SECTION_NESTED(42)
-	mfspr	r7, SPRN_AMOR
-	std	r7, PNV_POWERSAVE_AMOR(r1)
-END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-	mfcr	r5
-	std	r5,_CCR(r1)
+_GLOBAL(isa300_idle_stop_mayloss)
+	mtspr 	SPRN_PSSCR,r3
 	std	r1,PACAR1(r13)
-
-BEGIN_FTR_SECTION
-	/*
-	 * POWER9 does not require real mode to stop, and presently does not
-	 * set hwthread_state for KVM (threads don't share MMU context), so
-	 * we can remain in virtual mode for this.
-	 */
-	bctr
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	/*
-	 * POWER8
-	 * Go to real mode to do the nap, as required by the architecture.
-	 * Also, we need to be in real mode before setting hwthread_state,
-	 * because as soon as we do that, another thread can switch
-	 * the MMU context to the guest.
-	 */
-	LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
-	mtmsrd	r7,0
-	bctr
-
-/*
- * This is the sequence required to execute idle instructions, as
- * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
- */
-#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
-	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
-	std	r0,0(r1);					\
-	ptesync;						\
-	ld	r0,0(r1);					\
-236:	cmpd	cr0,r0,r0;					\
-	bne	236b;						\
-	IDLE_INST;
-
-
-	.globl pnv_enter_arch207_idle_mode
-pnv_enter_arch207_idle_mode:
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	/* Tell KVM we're entering idle */
-	li	r4,KVM_HWTHREAD_IN_IDLE
-	/******************************************************/
-	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
-	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
-	/* MUST occur in real mode, i.e. with the MMU off,    */
-	/* and the MMU must stay off until we clear this flag */
-	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
-	/* pnv_powersave_wakeup in this file.                 */
-	/* The reason is that another thread can switch the   */
-	/* MMU to a guest context whenever this flag is set   */
-	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
-	/* that would potentially cause this thread to start  */
-	/* executing instructions from guest memory in        */
-	/* hypervisor mode, leading to a host crash or data   */
-	/* corruption, or worse.                              */
-	/******************************************************/
-	stb	r4,HSTATE_HWTHREAD_STATE(r13)
-#endif
-	stb	r3,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	cr3,r3,PNV_THREAD_SLEEP
-	bge	cr3,2f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
-	/* No return */
-2:
-	/* Sleep or winkle */
-	lbz	r7,PACA_THREAD_MASK(r13)
-	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
-	li	r5,0
-	beq	cr3,3f
-	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT@h
-3:
-lwarx_loop1:
-	lwarx	r15,0,r14
-
-	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	bnel-	core_idle_lock_held
-
-	add	r15,r15,r5			/* Add if winkle */
-	andc	r15,r15,r7			/* Clear thread bit */
-
-	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
-
-/*
- * If cr0 = 0, then current thread is the last thread of the core entering
- * sleep. Last thread needs to execute the hardware bug workaround code if
- * required by the platform.
- * Make the workaround call unconditionally here. The below branch call is
- * patched out when the idle states are discovered if the platform does not
- * require it.
- */
-.global pnv_fastsleep_workaround_at_entry
-pnv_fastsleep_workaround_at_entry:
-	beq	fastsleep_workaround_at_entry
-
-	stwcx.	r15,0,r14
-	bne-	lwarx_loop1
-	isync
-
-common_enter: /* common code for all the threads entering sleep or winkle */
-	bgt	cr3,enter_winkle
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
-
-fastsleep_workaround_at_entry:
-	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	stwcx.	r15,0,r14
-	bne-	lwarx_loop1
-	isync
-
-	/* Fast sleep workaround */
-	li	r3,1
-	li	r4,1
-	bl	opal_config_cpu_idle_state
-
-	/* Unlock */
-	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	lwsync
-	stw	r15,0(r14)
-	b	common_enter
-
-enter_winkle:
-	bl	save_sprs_to_stack
-
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
-
-/*
- * r3 - PSSCR value corresponding to the requested stop state.
- */
-power_enter_stop:
-/*
- * Check if we are executing the lite variant with ESL=EC=0
- */
-	andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
-	clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
-	bne	 .Lhandle_esl_ec_set
+	mflr	r4
+	mfcr	r5
+	/* use stack red zone rather than a new frame for saving regs */
+	std	r2,-8*0(r1)
+	std	r14,-8*1(r1)
+	std	r15,-8*2(r1)
+	std	r16,-8*3(r1)
+	std	r17,-8*4(r1)
+	std	r18,-8*5(r1)
+	std	r19,-8*6(r1)
+	std	r20,-8*7(r1)
+	std	r21,-8*8(r1)
+	std	r22,-8*9(r1)
+	std	r23,-8*10(r1)
+	std	r24,-8*11(r1)
+	std	r25,-8*12(r1)
+	std	r26,-8*13(r1)
+	std	r27,-8*14(r1)
+	std	r28,-8*15(r1)
+	std	r29,-8*16(r1)
+	std	r30,-8*17(r1)
+	std	r31,-8*18(r1)
+	std	r4,-8*19(r1)
+	std	r5,-8*20(r1)
+	/* 168 bytes */
 	PPC_STOP
-	li	r3,0  /* Since we didn't lose state, return 0 */
-	std	r3, PACA_REQ_PSSCR(r13)
-
-	/*
-	 * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
-	 * it can determine if the wakeup reason is an HMI in
-	 * CHECK_HMI_INTERRUPT.
-	 *
-	 * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
-	 * reason, so there is no point setting r12 to SRR1.
-	 *
-	 * Further, we clear r12 here, so that we don't accidentally enter the
-	 * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
-	 */
-	li	r12, 0
-	b 	pnv_wakeup_noloss
-
-.Lhandle_esl_ec_set:
-BEGIN_FTR_SECTION
-	/*
-	 * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after
-	 * a state-loss idle. Saving and restoring MMCR0 over idle is a
-	 * workaround.
-	 */
-	mfspr	r4,SPRN_MMCR0
-	std	r4,_MMCR0(r1)
-END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
+	b	.	/* catch bugs */
 
 /*
- * Check if the requested state is a deep idle state.
- */
-	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
-	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
-	cmpd	r3,r4
-	bge	.Lhandle_deep_stop
-	PPC_STOP	/* Does not return (system reset interrupt) */
-
-.Lhandle_deep_stop:
-/*
- * Entering deep idle state.
- * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
- * stack and enter stop
- */
-	lbz     r7,PACA_THREAD_MASK(r13)
-	ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
-
-lwarx_loop_stop:
-	lwarx   r15,0,r14
-	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	bnel-	core_idle_lock_held
-	andc    r15,r15,r7                      /* Clear thread bit */
-
-	stwcx.  r15,0,r14
-	bne-    lwarx_loop_stop
-	isync
-
-	bl	save_sprs_to_stack
-
-	PPC_STOP	/* Does not return (system reset interrupt) */
-
-/*
- * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
- * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE).
- */
-_GLOBAL(power7_idle_insn)
-	/* Now check if user or arch enabled NAP mode */
-	LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode)
-	b	pnv_powersave_common
-
-#define CHECK_HMI_INTERRUPT						\
-BEGIN_FTR_SECTION_NESTED(66);						\
-	rlwinm	r0,r12,45-31,0xf;  /* extract wake reason field (P8) */	\
-FTR_SECTION_ELSE_NESTED(66);						\
-	rlwinm	r0,r12,45-31,0xe;  /* P7 wake reason field is 3 bits */	\
-ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
-	cmpwi	r0,0xa;			/* Hypervisor maintenance ? */	\
-	bne+	20f;							\
-	/* Invoke opal call to handle hmi */				\
-	ld	r2,PACATOC(r13);					\
-	ld	r1,PACAR1(r13);						\
-	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
-	li	r3,0;			/* NULL argument */		\
-	bl	hmi_exception_realmode;					\
-	nop;								\
-	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
-20:	nop;
-
-/*
- * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
- * r3 contains desired PSSCR register value.
+ * Desired return value in r3
  *
- * Offline (CPU unplug) case also must notify KVM that the CPU is
- * idle.
- */
-_GLOBAL(power9_offline_stop)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	/*
-	 * Tell KVM we're entering idle.
-	 * This does not have to be done in real mode because the P9 MMU
-	 * is independent per-thread. Some steppings share radix/hash mode
-	 * between threads, but in that case KVM has a barrier sync in real
-	 * mode before and after switching between radix and hash.
-	 */
-	li	r4,KVM_HWTHREAD_IN_IDLE
-	stb	r4,HSTATE_HWTHREAD_STATE(r13)
-#endif
-	/* fall through */
-
-_GLOBAL(power9_idle_stop)
-	std	r3, PACA_REQ_PSSCR(r13)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-BEGIN_FTR_SECTION
-	sync
-	lwz	r5, PACA_DONT_STOP(r13)
-	cmpwi	r5, 0
-	bne	1f
-END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
-#endif
-	mtspr 	SPRN_PSSCR,r3
-	LOAD_REG_ADDR(r4,power_enter_stop)
-	b	pnv_powersave_common
-	/* No return */
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-1:
-	/*
-	 * We get here when TM / thread reconfiguration bug workaround
-	 * code wants to get the CPU into SMT4 mode, and therefore
-	 * we are being asked not to stop.
-	 */
-	li	r3, 0
-	std	r3, PACA_REQ_PSSCR(r13)
-	blr		/* return 0 for wakeup cause / SRR1 value */
-#endif
-
-/*
- * Called from machine check handler for powersave wakeups.
- * Low level machine check processing has already been done. Now just
- * go through the wake up path to get everything in order.
+ * The idle wakeup SRESET interrupt can call this to return to the
+ * idle sleep function caller with r3 as the return code.
  *
- * r3 - The original SRR1 value.
- * Original SRR[01] have been clobbered.
- * MSR_RI is clear.
- */
-.global pnv_powersave_wakeup_mce
-pnv_powersave_wakeup_mce:
-	/* Set cr3 for pnv_powersave_wakeup */
-	rlwinm	r11,r3,47-31,30,31
-	cmpwi	cr3,r11,2
-
-	/*
-	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
-	 * reason into r12, which allows reuse of the system reset wakeup
-	 * code without being mistaken for another type of wakeup.
-	 */
-	oris	r12,r3,SRR1_WAKEMCE_RESVD@h
-
-	b	pnv_powersave_wakeup
-
-/*
- * Called from reset vector for powersave wakeups.
- * cr3 - set to gt if waking up with partial/complete hypervisor state loss
- * r12 - SRR1
+ * This must not be used if idle was entered via a _noloss function (use
+ * a simple blr instead).
  */
-.global pnv_powersave_wakeup
-pnv_powersave_wakeup:
-	ld	r2, PACATOC(r13)
-
-BEGIN_FTR_SECTION
-	bl	pnv_restore_hyp_resource_arch300
-FTR_SECTION_ELSE
-	bl	pnv_restore_hyp_resource_arch207
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
-
-	li	r0,PNV_THREAD_RUNNING
-	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
-
-	mr	r3,r12
-
+_GLOBAL(idle_return_gpr_loss)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	lbz	r0,HSTATE_HWTHREAD_STATE(r13)
 	cmpwi	r0,KVM_HWTHREAD_IN_KERNEL
@@ -547,449 +106,105 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 	beq	1f
 	b	kvm_start_guest
 1:
+	lbz	r0,PACA_NAPSTATELOST(r13)
+	cmpwi	r0,0
+	bne	2f
+	bltlr	cr1
+2:
 #endif
-
-	/* Return SRR1 from power7_nap() */
-	blt	cr3,pnv_wakeup_noloss
-	b	pnv_wakeup_loss
+	ld	r1,PACAR1(r13)
+	ld	r4,-8*19(r1)
+	ld	r5,-8*20(r1)
+	mtlr	r4
+	mtcr	r5
+	/*
+	 * KVM nap requires r2 to be saved, rather than just restoring it
+	 * from PACATOC. This could be avoided for that less common case
+	 * if KVM saved its r2.
+	 */
+	ld	r2,-8*0(r1)
+	ld	r14,-8*1(r1)
+	ld	r15,-8*2(r1)
+	ld	r16,-8*3(r1)
+	ld	r17,-8*4(r1)
+	ld	r18,-8*5(r1)
+	ld	r19,-8*6(r1)
+	ld	r20,-8*7(r1)
+	ld	r21,-8*8(r1)
+	ld	r22,-8*9(r1)
+	ld	r23,-8*10(r1)
+	ld	r24,-8*11(r1)
+	ld	r25,-8*12(r1)
+	ld	r26,-8*13(r1)
+	ld	r27,-8*14(r1)
+	ld	r28,-8*15(r1)
+	ld	r29,-8*16(r1)
+	ld	r30,-8*17(r1)
+	ld	r31,-8*18(r1)
+	blr
 
 /*
- * Check whether we have woken up with hypervisor state loss.
- * If yes, restore hypervisor state and return back to link.
+ * This is the sequence required to execute idle instructions, as
+ * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
  *
- * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ * The 0(r1) slot is used to save r2 in isa206, so use that here.
  */
-pnv_restore_hyp_resource_arch300:
-	/*
-	 * Workaround for POWER9, if we lost resources, the ERAT
-	 * might have been mixed up and needs flushing. We also need
-	 * to reload MMCR0 (see comment above). We also need to set
-	 * then clear bit 60 in MMCRA to ensure the PMU starts running.
-	 */
-	blt	cr3,1f
-BEGIN_FTR_SECTION
-	PPC_INVALIDATE_ERAT
-	ld	r1,PACAR1(r13)
-	ld	r4,_MMCR0(r1)
-	mtspr	SPRN_MMCR0,r4
-END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
-	mfspr	r4,SPRN_MMCRA
-	ori	r4,r4,(1 << (63-60))
-	mtspr	SPRN_MMCRA,r4
-	xori	r4,r4,(1 << (63-60))
-	mtspr	SPRN_MMCRA,r4
-1:
-	/*
-	 * POWER ISA 3. Use PSSCR to determine if we
-	 * are waking up from deep idle state
-	 */
-	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
-	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
-
-	/*
-	 * 0-3 bits correspond to Power-Saving Level Status
-	 * which indicates the idle state we are waking up from
-	 */
-	mfspr	r5, SPRN_PSSCR
-	rldicl  r5,r5,4,60
-	li	r0, 0		/* clear requested_psscr to say we're awake */
-	std	r0, PACA_REQ_PSSCR(r13)
-	cmpd	cr4,r5,r4
-	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
-
-	blr	/* Waking up without hypervisor state loss. */
-
-/* Same calling convention as arch300 */
-pnv_restore_hyp_resource_arch207:
-	/*
-	 * POWER ISA 2.07 or less.
-	 * Check if we slept with sleep or winkle.
-	 */
-	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	cr2,r4,PNV_THREAD_NAP
-	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
-
-	/*
-	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
-	 * up from nap. At this stage CR3 shouldn't contains 'gt' since that
-	 * indicates we are waking with hypervisor state loss from nap.
-	 */
-	bgt	cr3,.
-
-	blr	/* Waking up without hypervisor state loss */
+#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
+	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
+	std	r2,0(r1);					\
+	ptesync;						\
+	ld	r2,0(r1);					\
+236:	cmpd	cr0,r2,r2;					\
+	bne	236b;						\
+	IDLE_INST;						\
+	b	.	/* catch bugs */
 
 /*
- * Called if waking up from idle state which can cause either partial or
- * complete hyp state loss.
- * In POWER8, called if waking up from fastsleep or winkle
- * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
+ * Desired instruction type in r3
  *
- * r13 - PACA
- * cr3 - gt if waking up with partial/complete hypervisor state loss
+ * GPRs may be lost, so they are saved here. Wakeup is by interrupt only.
+ * The SRESET wakeup returns to this function's caller by calling
+ * idle_return_gpr_loss with r3 set to desired return value.
  *
- * If ISA300:
- * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ * A wakeup without GPR loss may alternatively be handled as in
+ * isa300_idle_stop_noloss and blr directly, as an optimisation.
  *
- * If ISA207:
- * r4 - PACA_THREAD_IDLE_STATE
- */
-pnv_wakeup_tb_loss:
-	ld	r1,PACAR1(r13)
-	/*
-	 * Before entering any idle state, the NVGPRs are saved in the stack.
-	 * If there was a state loss, or PACA_NAPSTATELOST was set, then the
-	 * NVGPRs are restored. If we are here, it is likely that state is lost,
-	 * but not guaranteed -- neither ISA207 nor ISA300 tests to reach
-	 * here are the same as the test to restore NVGPRS:
-	 * PACA_THREAD_IDLE_STATE test for ISA207, PSSCR test for ISA300,
-	 * and SRR1 test for restoring NVGPRs.
-	 *
-	 * We are about to clobber NVGPRs now, so set NAPSTATELOST to
-	 * guarantee they will always be restored. This might be tightened
-	 * with careful reading of specs (particularly for ISA300) but this
-	 * is already a slow wakeup path and it's simpler to be safe.
-	 */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-
-	/*
-	 *
-	 * Save SRR1 and LR in NVGPRs as they might be clobbered in
-	 * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
-	 * to determine the wakeup reason if we branch to kvm_start_guest. LR
-	 * is required to return back to reset vector after hypervisor state
-	 * restore is complete.
-	 */
-	mr	r19,r12
-	mr	r18,r4
-	mflr	r17
-BEGIN_FTR_SECTION
-	CHECK_HMI_INTERRUPT
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-
-	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
-	lbz	r7,PACA_THREAD_MASK(r13)
-
-	/*
-	 * Take the core lock to synchronize against other threads.
-	 *
-	 * Lock bit is set in one of the 2 cases-
-	 * a. In the sleep/winkle enter path, the last thread is executing
-	 * fastsleep workaround code.
-	 * b. In the wake up path, another thread is executing fastsleep
-	 * workaround undo code or resyncing timebase or restoring context
-	 * In either case loop until the lock bit is cleared.
-	 */
-1:
-	lwarx	r15,0,r14
-	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	bnel-	core_idle_lock_held
-	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	stwcx.	r15,0,r14
-	bne-	1b
-	isync
-
-	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
-	cmpwi	cr2,r9,0
-
-	/*
-	 * At this stage
-	 * cr2 - eq if first thread to wakeup in core
-	 * cr3-  gt if waking up with partial/complete hypervisor state loss
-	 * ISA300:
-	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
-	 */
-
-BEGIN_FTR_SECTION
-	/*
-	 * Were we in winkle?
-	 * If yes, check if all threads were in winkle, decrement our
-	 * winkle count, set all thread winkle bits if all were in winkle.
-	 * Check if our thread has a winkle bit set, and set cr4 accordingly
-	 * (to match ISA300, above). Pseudo-code for core idle state
-	 * transitions for ISA207 is as follows (everything happens atomically
-	 * due to store conditional and/or lock bit):
-	 *
-	 * nap_idle() { }
-	 * nap_wake() { }
-	 *
-	 * sleep_idle()
-	 * {
-	 *	core_idle_state &= ~thread_in_core
-	 * }
-	 *
-	 * sleep_wake()
-	 * {
-	 *     bool first_in_core, first_in_subcore;
-	 *
-	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
-	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
-	 *
-	 *     core_idle_state |= thread_in_core;
-	 * }
-	 *
-	 * winkle_idle()
-	 * {
-	 *	core_idle_state &= ~thread_in_core;
-	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
-	 * }
-	 *
-	 * winkle_wake()
-	 * {
-	 *     bool first_in_core, first_in_subcore, winkle_state_lost;
-	 *
-	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
-	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
-	 *
-	 *     core_idle_state |= thread_in_core;
-	 *
-	 *     if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
-	 *         core_idle_state |= THREAD_WINKLE_BITS;
-	 *     core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
-	 *
-	 *     winkle_state_lost = core_idle_state &
-	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
-	 *     core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
-	 * }
-	 *
-	 */
-	cmpwi	r18,PNV_THREAD_WINKLE
-	bne	2f
-	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
-	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
-	beq	2f
-	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
-2:
-	/* Shift thread bit to winkle mask, then test if this thread is set,
-	 * and remove it from the winkle bits */
-	slwi	r8,r7,8
-	and	r8,r8,r15
-	andc	r15,r15,r8
-	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
-
-	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
-	and	r4,r4,r15
-	cmpwi	r4,0	/* Check if first in subcore */
-
-	or	r15,r15,r7		/* Set thread bit */
-	beq	first_thread_in_subcore
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-
-	or	r15,r15,r7		/* Set thread bit */
-	beq	cr2,first_thread_in_core
-
-	/* Not first thread in core or subcore to wake up */
-	b	clear_lock
-
-first_thread_in_subcore:
-	/*
-	 * If waking up from sleep, subcore state is not lost. Hence
-	 * skip subcore state restore
-	 */
-	blt	cr4,subcore_state_restored
-
-	/* Restore per-subcore state */
-	ld      r4,_SDR1(r1)
-	mtspr   SPRN_SDR1,r4
-
-	ld      r4,_RPR(r1)
-	mtspr   SPRN_RPR,r4
-	ld	r4,_AMOR(r1)
-	mtspr	SPRN_AMOR,r4
-
-subcore_state_restored:
-	/*
-	 * Check if the thread is also the first thread in the core. If not,
-	 * skip to clear_lock.
-	 */
-	bne	cr2,clear_lock
-
-first_thread_in_core:
-
-	/*
-	 * First thread in the core waking up from any state which can cause
-	 * partial or complete hypervisor state loss. It needs to
-	 * call the fastsleep workaround code if the platform requires it.
-	 * Call it unconditionally here. The below branch instruction will
-	 * be patched out if the platform does not have fastsleep or does not
-	 * require the workaround. Patching will be performed during the
-	 * discovery of idle-states.
-	 */
-.global pnv_fastsleep_workaround_at_exit
-pnv_fastsleep_workaround_at_exit:
-	b	fastsleep_workaround_at_exit
-
-timebase_resync:
-	/*
-	 * Use cr3 which indicates that we are waking up with atleast partial
-	 * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
-	 */
-	ble	cr3,.Ltb_resynced
-	/* Time base re-sync */
-	bl	opal_resync_timebase;
-	/*
-	 * If waking up from sleep (POWER8), per core state
-	 * is not lost, skip to clear_lock.
-	 */
-.Ltb_resynced:
-	blt	cr4,clear_lock
-
-	/*
-	 * First thread in the core to wake up and its waking up with
-	 * complete hypervisor state loss. Restore per core hypervisor
-	 * state.
-	 */
-BEGIN_FTR_SECTION
-	ld	r4,_PTCR(r1)
-	mtspr	SPRN_PTCR,r4
-	ld	r4,_RPR(r1)
-	mtspr	SPRN_RPR,r4
-	ld	r4,_AMOR(r1)
-	mtspr	SPRN_AMOR,r4
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
-	ld	r4,_TSCR(r1)
-	mtspr	SPRN_TSCR,r4
-	ld	r4,_WORC(r1)
-	mtspr	SPRN_WORC,r4
-
-clear_lock:
-	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
-	lwsync
-	stw	r15,0(r14)
-
-common_exit:
-	/*
-	 * Common to all threads.
-	 *
-	 * If waking up from sleep, hypervisor state is not lost. Hence
-	 * skip hypervisor state restore.
-	 */
-	blt	cr4,hypervisor_state_restored
-
-	/* Waking up from winkle */
-
-BEGIN_MMU_FTR_SECTION
-	b	no_segments
-END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
-	/* Restore SLB  from PACA */
-	ld	r8,PACA_SLBSHADOWPTR(r13)
-
-	.rept	SLB_NUM_BOLTED
-	li	r3, SLBSHADOW_SAVEAREA
-	LDX_BE	r5, r8, r3
-	addi	r3, r3, 8
-	LDX_BE	r6, r8, r3
-	andis.	r7,r5,SLB_ESID_V@h
-	beq	1f
-	slbmte	r6,r5
-1:	addi	r8,r8,16
-	.endr
-no_segments:
-
-	/* Restore per thread state */
-
-	ld	r4,_SPURR(r1)
-	mtspr	SPRN_SPURR,r4
-	ld	r4,_PURR(r1)
-	mtspr	SPRN_PURR,r4
-	ld	r4,_DSCR(r1)
-	mtspr	SPRN_DSCR,r4
-	ld	r4,_WORT(r1)
-	mtspr	SPRN_WORT,r4
-
-	/* Call cur_cpu_spec->cpu_restore() */
-	LOAD_REG_ADDR(r4, cur_cpu_spec)
-	ld	r4,0(r4)
-	ld	r12,CPU_SPEC_RESTORE(r4)
-#ifdef PPC64_ELF_ABI_v1
-	ld	r12,0(r12)
-#endif
-	mtctr	r12
-	bctrl
-
-/*
- * On POWER9, we can come here on wakeup from a cpuidle stop state.
- * Hence restore the additional SPRs to the saved value.
+ * The caller is responsible for saving/restoring SPRs, MSR, timebase,
+ * etc.
  *
- * On POWER8, we come here only on winkle. Since winkle is used
- * only in the case of CPU-Hotplug, we don't need to restore
- * the additional SPRs.
+ * This must be called in real-mode (MSR_IDLE).
  */
-BEGIN_FTR_SECTION
-	bl 	power9_restore_additional_sprs
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-hypervisor_state_restored:
-
-	mr	r12,r19
-	mtlr	r17
-	blr		/* return to pnv_powersave_wakeup */
-
-fastsleep_workaround_at_exit:
-	li	r3,1
-	li	r4,0
-	bl	opal_config_cpu_idle_state
-	b	timebase_resync
-
-/*
- * R3 here contains the value that will be returned to the caller
- * of power7_nap.
- * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
- */
-.global pnv_wakeup_loss
-pnv_wakeup_loss:
-	ld	r1,PACAR1(r13)
-BEGIN_FTR_SECTION
-	CHECK_HMI_INTERRUPT
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-	REST_NVGPRS(r1)
-	REST_GPR(2, r1)
-
-BEGIN_FTR_SECTION
-	/* These regs were saved in pnv_powersave_common() */
-	ld	r4, PNV_POWERSAVE_AMR(r1)
-	ld	r5, PNV_POWERSAVE_IAMR(r1)
-	ld	r6, PNV_POWERSAVE_UAMOR(r1)
-	mtspr	SPRN_AMR, r4
-	mtspr	SPRN_IAMR, r5
-	mtspr	SPRN_UAMOR, r6
-BEGIN_FTR_SECTION_NESTED(42)
-	ld	r7, PNV_POWERSAVE_AMOR(r1)
-	mtspr	SPRN_AMOR, r7
-END_FTR_SECTION_NESTED_IFSET(CPU_FTR_HVMODE, 42)
-	/*
-	 * We don't need an isync here after restoring IAMR because the upcoming
-	 * mtmsrd is execution synchronizing.
-	 */
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-	ld	r4,PACAKMSR(r13)
-	ld	r5,_LINK(r1)
-	ld	r6,_CCR(r1)
-	addi	r1,r1,INT_FRAME_SIZE
-	mtlr	r5
-	mtcr	r6
-	mtmsrd	r4
-	blr
+_GLOBAL(isa206_idle_insn_mayloss)
+	std	r1,PACAR1(r13)
+	mflr	r4
+	mfcr	r5
+	/* use stack red zone rather than a new frame for saving regs */
+	std	r2,-8*0(r1)
+	std	r14,-8*1(r1)
+	std	r15,-8*2(r1)
+	std	r16,-8*3(r1)
+	std	r17,-8*4(r1)
+	std	r18,-8*5(r1)
+	std	r19,-8*6(r1)
+	std	r20,-8*7(r1)
+	std	r21,-8*8(r1)
+	std	r22,-8*9(r1)
+	std	r23,-8*10(r1)
+	std	r24,-8*11(r1)
+	std	r25,-8*12(r1)
+	std	r26,-8*13(r1)
+	std	r27,-8*14(r1)
+	std	r28,-8*15(r1)
+	std	r29,-8*16(r1)
+	std	r30,-8*17(r1)
+	std	r31,-8*18(r1)
+	std	r4,-8*19(r1)
+	std	r5,-8*20(r1)
+	cmpwi	r3,PNV_THREAD_NAP
+	bne	1f
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
+1:	cmpwi	r3,PNV_THREAD_SLEEP
+	bne	2f
+	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
+2:	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
 
-/*
- * R3 here contains the value that will be returned to the caller
- * of power7_nap.
- * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
- */
-pnv_wakeup_noloss:
-	lbz	r0,PACA_NAPSTATELOST(r13)
-	cmpwi	r0,0
-	bne	pnv_wakeup_loss
-	ld	r1,PACAR1(r13)
-BEGIN_FTR_SECTION
-	CHECK_HMI_INTERRUPT
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
-	ld	r4,PACAKMSR(r13)
-	ld	r5,_NIP(r1)
-	ld	r6,_CCR(r1)
-	addi	r1,r1,INT_FRAME_SIZE
-	mtlr	r5
-	mtcr	r6
-	mtmsrd	r4
-	blr
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 1729bf409562..70dc10aa0ccf 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -401,8 +401,8 @@ void __init check_for_initrd(void)
 
 #ifdef CONFIG_SMP
 
-int threads_per_core, threads_per_subcore, threads_shift;
-cpumask_t threads_core_mask;
+int threads_per_core, threads_per_subcore, threads_shift __read_mostly;
+cpumask_t threads_core_mask __read_mostly;
 EXPORT_SYMBOL_GPL(threads_per_core);
 EXPORT_SYMBOL_GPL(threads_per_subcore);
 EXPORT_SYMBOL_GPL(threads_shift);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 3a5e719ef032..58d0f1ba845d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -448,8 +448,10 @@ kvm_no_guest:
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
 	mtspr	SPRN_LPCR, r4
 	li	r3, 0
-	mfspr	r12,SPRN_SRR1
-	b	pnv_wakeup_loss
+	/* set up cr3 and r3 for return */
+	cmpdi	cr3, r3, 0
+	mfspr	r3,SPRN_SRR1
+	b	idle_return_gpr_loss
 
 53:	HMT_LOW
 	ld	r5, HSTATE_KVM_VCORE(r13)
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index e52f9b06dd9c..182112fd845a 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -16,6 +16,7 @@
 #include <linux/device.h>
 #include <linux/cpu.h>
 
+#include <asm/asm-prototypes.h>
 #include <asm/firmware.h>
 #include <asm/machdep.h>
 #include <asm/opal.h>
@@ -48,10 +49,10 @@ static u64 pnv_default_stop_mask;
 static bool default_stop_found;
 
 /*
- * First deep stop state. Used to figure out when to save/restore
- * hypervisor context.
+ * First stop state levels when SPR and TB loss can occur.
  */
-u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
+static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 
 /*
  * psscr value and mask of the deepest stop idle state.
@@ -62,6 +63,8 @@ static u64 pnv_deepest_stop_psscr_mask;
 static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
 
+static unsigned long power7_offline_type;
+
 static int pnv_save_sprs_for_deep_states(void)
 {
 	int cpu;
@@ -72,12 +75,12 @@ static int pnv_save_sprs_for_deep_states(void)
 	 * all cpus at boot. Get these reg values of current cpu and use the
 	 * same across all cpus.
 	 */
-	uint64_t lpcr_val = mfspr(SPRN_LPCR);
-	uint64_t hid0_val = mfspr(SPRN_HID0);
-	uint64_t hid1_val = mfspr(SPRN_HID1);
-	uint64_t hid4_val = mfspr(SPRN_HID4);
-	uint64_t hid5_val = mfspr(SPRN_HID5);
-	uint64_t hmeer_val = mfspr(SPRN_HMEER);
+	uint64_t lpcr_val	= mfspr(SPRN_LPCR);
+	uint64_t hid0_val	= mfspr(SPRN_HID0);
+	uint64_t hid1_val	= mfspr(SPRN_HID1);
+	uint64_t hid4_val	= mfspr(SPRN_HID4);
+	uint64_t hid5_val	= mfspr(SPRN_HID5);
+	uint64_t hmeer_val	= mfspr(SPRN_HMEER);
 	uint64_t msr_val = MSR_IDLE;
 	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
 
@@ -137,89 +140,6 @@ static int pnv_save_sprs_for_deep_states(void)
 	return 0;
 }
 
-static void pnv_alloc_idle_core_states(void)
-{
-	int i, j;
-	int nr_cores = cpu_nr_cores();
-	u32 *core_idle_state;
-
-	/*
-	 * core_idle_state - The lower 8 bits track the idle state of
-	 * each thread of the core.
-	 *
-	 * The most significant bit is the lock bit.
-	 *
-	 * Initially all the bits corresponding to threads_per_core
-	 * are set. They are cleared when the thread enters deep idle
-	 * state like sleep and winkle/stop.
-	 *
-	 * Initially the lock bit is cleared.  The lock bit has 2
-	 * purposes:
-	 * 	a. While the first thread in the core waking up from
-	 * 	   idle is restoring core state, it prevents other
-	 * 	   threads in the core from switching to process
-	 * 	   context.
-	 * 	b. While the last thread in the core is saving the
-	 *	   core state, it prevents a different thread from
-	 *	   waking up.
-	 */
-	for (i = 0; i < nr_cores; i++) {
-		int first_cpu = i * threads_per_core;
-		int node = cpu_to_node(first_cpu);
-		size_t paca_ptr_array_size;
-
-		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
-		*core_idle_state = (1 << threads_per_core) - 1;
-		paca_ptr_array_size = (threads_per_core *
-				       sizeof(struct paca_struct *));
-
-		for (j = 0; j < threads_per_core; j++) {
-			int cpu = first_cpu + j;
-
-			paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
-			paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
-			paca_ptrs[cpu]->thread_mask = 1 << j;
-		}
-	}
-
-	update_subcore_sibling_mask();
-
-	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
-		int rc = pnv_save_sprs_for_deep_states();
-
-		if (likely(!rc))
-			return;
-
-		/*
-		 * The stop-api is unable to restore hypervisor
-		 * resources on wakeup from platform idle states which
-		 * lose full context. So disable such states.
-		 */
-		supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
-		pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
-		pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
-
-		if (cpu_has_feature(CPU_FTR_ARCH_300) &&
-		    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
-			/*
-			 * Use the default stop state for CPU-Hotplug
-			 * if available.
-			 */
-			if (default_stop_found) {
-				pnv_deepest_stop_psscr_val =
-					pnv_default_stop_val;
-				pnv_deepest_stop_psscr_mask =
-					pnv_default_stop_mask;
-				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
-					pnv_deepest_stop_psscr_val);
-			} else { /* Fallback to snooze loop for CPU-Hotplug */
-				deepest_stop_found = false;
-				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
-			}
-		}
-	}
-}
-
 u32 pnv_get_supported_cpuidle_states(void)
 {
 	return supported_cpuidle_states;
@@ -238,6 +158,9 @@ static void pnv_fastsleep_workaround_apply(void *info)
 		*err = 1;
 }
 
+static bool power7_fastsleep_workaround_entry = true;
+static bool power7_fastsleep_workaround_exit = true;
+
 /*
  * Used to store fastsleep workaround state
  * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
@@ -269,21 +192,15 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
 	 * fastsleep_workaround_applyonce = 1 implies
 	 * fastsleep workaround needs to be left in 'applied' state on all
 	 * the cores. Do this by-
-	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
-	 * 2. Sending ipi to all the cores which have at least one online thread
-	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
-	 * path
+	 * 1. Disable the 'undo' workaround in fastsleep exit path
+	 * 2. Send IPIs to all the cores which have at least one online thread
+	 * 3. Disable the 'apply' workaround in fastsleep entry path
+	 *
 	 * There is no need to send ipi to cores which have all threads
 	 * offlined, as last thread of the core entering fastsleep or deeper
 	 * state would have applied workaround.
 	 */
-	err = patch_instruction(
-		(unsigned int *)pnv_fastsleep_workaround_at_exit,
-		PPC_INST_NOP);
-	if (err) {
-		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
-		goto fail;
-	}
+	power7_fastsleep_workaround_exit = false;
 
 	get_online_cpus();
 	primary_thread_mask = cpu_online_cores_map();
@@ -296,13 +213,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
 		goto fail;
 	}
 
-	err = patch_instruction(
-		(unsigned int *)pnv_fastsleep_workaround_at_entry,
-		PPC_INST_NOP);
-	if (err) {
-		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
-		goto fail;
-	}
+	power7_fastsleep_workaround_entry = false;
 
 	fastsleep_workaround_applyonce = 1;
 
@@ -315,27 +226,336 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
 			show_fastsleep_workaround_applyonce,
 			store_fastsleep_workaround_applyonce);
 
-static unsigned long __power7_idle_type(unsigned long type)
+static inline void atomic_start_thread_idle(void)
 {
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	clear_bit(thread_nr, state);
+}
+
+static inline void atomic_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	int thread_nr = cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	set_bit(thread_nr, state);
+}
+
+static inline void atomic_lock_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state)))
+		barrier();
+}
+
+static inline void atomic_unlock_and_stop_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	u64 s = READ_ONCE(*state);
+	u64 new, tmp;
+
+	BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT));
+	BUG_ON(s & thread);
+
+again:
+	new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT;
+	tmp = cmpxchg(state, s, new);
+	if (unlikely(tmp != s)) {
+		s = tmp;
+		goto again;
+	}
+}
+
+static inline void atomic_unlock_thread_idle(void)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+
+	BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state));
+	clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state);
+}
+
+/* P7 and P8 */
+struct p7_sprs {
+	/* per core */
+	u64 tscr;
+	u64 worc;
+
+	/* per subcore */
+	u64 sdr1;
+	u64 rpr;
+
+	/* per thread */
+	u64 lpcr;
+	u64 hfscr;
+	u64 fscr;
+	u64 purr;
+	u64 spurr;
+	u64 dscr;
+	u64 wort;
+
+	/* per thread SPRs that get lost in shallow states */
+	u64 amr;
+	u64 iamr;
+	u64 amor;
+	u64 uamor;
+};
+
+static unsigned long power7_idle_insn(unsigned long type)
+{
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
 	unsigned long srr1;
+	bool full_winkle;
+	struct p7_sprs sprs = {}; /* avoid false use-uninitialised */
+	bool sprs_saved = false;
+	int rc;
 
-	if (!prep_irq_for_idle_irqsoff())
-		return 0;
+	if (unlikely(type != PNV_THREAD_NAP)) {
+		atomic_lock_thread_idle();
+
+		BUG_ON(!(*state & thread));
+		*state &= ~thread;
+
+		if (power7_fastsleep_workaround_entry) {
+			if ((*state & core_thread_mask) == 0) {
+				rc = opal_config_cpu_idle_state(
+						OPAL_CONFIG_IDLE_FASTSLEEP,
+						OPAL_CONFIG_IDLE_APPLY);
+				BUG_ON(rc);
+			}
+		}
+
+		if (type == PNV_THREAD_WINKLE) {
+			sprs.tscr	= mfspr(SPRN_TSCR);
+			sprs.worc	= mfspr(SPRN_WORC);
+
+			sprs.sdr1	= mfspr(SPRN_SDR1);
+			sprs.rpr	= mfspr(SPRN_RPR);
+
+			sprs.lpcr	= mfspr(SPRN_LPCR);
+			if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+				sprs.hfscr	= mfspr(SPRN_HFSCR);
+				sprs.fscr	= mfspr(SPRN_FSCR);
+			}
+			sprs.purr	= mfspr(SPRN_PURR);
+			sprs.spurr	= mfspr(SPRN_SPURR);
+			sprs.dscr	= mfspr(SPRN_DSCR);
+			sprs.wort	= mfspr(SPRN_WORT);
+
+			sprs_saved = true;
+
+			/*
+			 * Increment winkle counter and set all winkle bits if
+			 * all threads are winkling. This allows wakeup side to
+			 * distinguish between fast sleep and winkle state
+			 * loss. Fast sleep still has to resync the timebase so
+			 * this may not be a really big win.
+			 */
+			*state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+			if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS)
+					>> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT
+					== threads_per_core)
+				*state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS;
+			WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+		}
+
+		atomic_unlock_thread_idle();
+	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		sprs.amr	= mfspr(SPRN_AMR);
+		sprs.iamr	= mfspr(SPRN_IAMR);
+		sprs.amor	= mfspr(SPRN_AMOR);
+		sprs.uamor	= mfspr(SPRN_UAMOR);
+	}
+
+	local_paca->thread_idle_state = type;
+	srr1 = isa206_idle_insn_mayloss(type);		/* go idle */
+	local_paca->thread_idle_state = PNV_THREAD_RUNNING;
+
+	WARN_ON_ONCE(!srr1);
+	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+			/*
+			 * We don't need an isync after the mtsprs here because
+			 * the upcoming mtmsrd is execution synchronizing.
+			 */
+			mtspr(SPRN_AMR,		sprs.amr);
+			mtspr(SPRN_IAMR,	sprs.iamr);
+			mtspr(SPRN_AMOR,	sprs.amor);
+			mtspr(SPRN_UAMOR,	sprs.uamor);
+		}
+	}
+
+	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+		hmi_exception_realmode(NULL);
+
+	if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) {
+		if (unlikely(type != PNV_THREAD_NAP)) {
+			atomic_lock_thread_idle();
+			if (type == PNV_THREAD_WINKLE) {
+				WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+				*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+				*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+			}
+			atomic_unlock_and_stop_thread_idle();
+		}
+		return srr1;
+	}
+
+	/* HV state loss */
+	BUG_ON(type == PNV_THREAD_NAP);
+
+	atomic_lock_thread_idle();
+
+	full_winkle = false;
+	if (type == PNV_THREAD_WINKLE) {
+		WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+		*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+		if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) {
+			*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+			full_winkle = true;
+			BUG_ON(!sprs_saved);
+		}
+	}
+
+	WARN_ON(*state & thread);
+
+	if ((*state & core_thread_mask) != 0)
+		goto core_woken;
+
+	/* Per-core SPRs */
+	if (full_winkle) {
+		mtspr(SPRN_TSCR,	sprs.tscr);
+		mtspr(SPRN_WORC,	sprs.worc);
+	}
+
+	if (power7_fastsleep_workaround_exit) {
+		rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+						OPAL_CONFIG_IDLE_UNDO);
+		BUG_ON(rc);
+	}
+
+	/* TB */
+	if (opal_resync_timebase() != OPAL_SUCCESS)
+		BUG();
+
+core_woken:
+	if (!full_winkle)
+		goto subcore_woken;
+
+	if ((*state & local_paca->subcore_sibling_mask) != 0)
+		goto subcore_woken;
+
+	/* Per-subcore SPRs */
+	mtspr(SPRN_SDR1,	sprs.sdr1);
+	mtspr(SPRN_RPR,		sprs.rpr);
+
+subcore_woken:
+	/*
+	 * isync after restoring shared SPRs and before unlocking. Unlock
+	 * only contains hwsync which does not necessarily do the right
+	 * thing for SPRs.
+	 */
+	isync();
+	atomic_unlock_and_stop_thread_idle();
+
+	/* Fast sleep does not lose SPRs */
+	if (!full_winkle)
+		return srr1;
+
+	/* Per-thread SPRs */
+	mtspr(SPRN_LPCR,	sprs.lpcr);
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		mtspr(SPRN_HFSCR,	sprs.hfscr);
+		mtspr(SPRN_FSCR,	sprs.fscr);
+	}
+	mtspr(SPRN_PURR,	sprs.purr);
+	mtspr(SPRN_SPURR,	sprs.spurr);
+	mtspr(SPRN_DSCR,	sprs.dscr);
+	mtspr(SPRN_WORT,	sprs.wort);
+
+	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
+
+	/*
+	 * The SLB has to be restored here, but it sometimes still
+	 * contains entries, so the __ variant must be used to prevent
+	 * multi hits.
+	 */
+	__slb_restore_bolted_realmode();
+
+	return srr1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long power7_offline(void)
+{
+	unsigned long srr1;
+
+	mtmsr(MSR_IDLE);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	/* Tell KVM we're entering idle. */
+	/******************************************************/
+	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
+	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
+	/* MUST occur in real mode, i.e. with the MMU off,    */
+	/* and the MMU must stay off until we clear this flag */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
+	/* The reason is that another thread can switch the   */
+	/* MMU to a guest context whenever this flag is set   */
+	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
+	/* that would potentially cause this thread to start  */
+	/* executing instructions from guest memory in        */
+	/* hypervisor mode, leading to a host crash or data   */
+	/* corruption, or worse.                              */
+	/******************************************************/
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
+#endif
 
 	__ppc64_runlatch_off();
-	srr1 = power7_idle_insn(type);
+	srr1 = power7_idle_insn(power7_offline_type);
 	__ppc64_runlatch_on();
 
-	fini_irq_for_idle_irqsoff();
+	mtmsr(MSR_KERNEL);
 
 	return srr1;
 }
+#endif
 
 void power7_idle_type(unsigned long type)
 {
 	unsigned long srr1;
 
-	srr1 = __power7_idle_type(type);
+	if (!prep_irq_for_idle_irqsoff())
+		return;
+
+	mtmsr(MSR_IDLE);
+	__ppc64_runlatch_off();
+	srr1 = power7_idle_insn(type);
+	__ppc64_runlatch_on();
+	mtmsr(MSR_KERNEL);
+
+	fini_irq_for_idle_irqsoff();
 	irq_set_pending_from_srr1(srr1);
 }
 
@@ -347,33 +567,287 @@ void power7_idle(void)
 	power7_idle_type(PNV_THREAD_NAP);
 }
 
-static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
-				      unsigned long stop_psscr_mask)
+struct p9_sprs {
+	/* per core */
+	u64 ptcr;
+	u64 rpr;
+	u64 tscr;
+	u64 ldbar;
+
+	/* per thread */
+	u64 lpcr;
+	u64 hfscr;
+	u64 fscr;
+	u64 pid;
+	u64 purr;
+	u64 spurr;
+	u64 dscr;
+	u64 wort;
+
+	u64 mmcra;
+	u32 mmcr0;
+	u32 mmcr1;
+	u64 mmcr2;
+
+	/* per thread SPRs that get lost in shallow states */
+	u64 amr;
+	u64 iamr;
+	u64 amor;
+	u64 uamor;
+};
+
+static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
 {
-	unsigned long psscr;
+	int cpu = raw_smp_processor_id();
+	int first = cpu_first_thread_sibling(cpu);
+	unsigned long *state = &paca_ptrs[first]->idle_state;
+	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
 	unsigned long srr1;
+	unsigned long pls;
+	unsigned long mmcr0 = 0;
+	struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
+	bool sprs_saved = false;
 
-	if (!prep_irq_for_idle_irqsoff())
-		return 0;
+	if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+		/* EC=ESL=0 case */
+
+		BUG_ON(!mmu_on);
+
+		/*
+		 * Wake synchronously. SRESET via xscom may still cause
+		 * a 0x100 powersave wakeup with SRR1 reason!
+		 */
+		srr1 = isa300_idle_stop_noloss(psscr);		/* go idle */
+		if (likely(!srr1))
+			return 0;
+
+		/*
+		 * Registers not saved, can't recover!
+		 * This would be a hardware bug
+		 */
+		BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+		goto out;
+	}
+
+	/* EC=ESL=1 case */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) {
+		local_paca->requested_psscr = psscr;
+		/* order setting requested_psscr vs testing dont_stop */
+		smp_mb();
+		if (atomic_read(&local_paca->dont_stop)) {
+			local_paca->requested_psscr = 0;
+			return 0;
+		}
+	}
+#endif
+
+	if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+		 /*
+		  * POWER9 DD2 can incorrectly set PMAO when waking up
+		  * after a state-loss idle. Saving and restoring MMCR0
+		  * over idle is a workaround.
+		  */
+		mmcr0		= mfspr(SPRN_MMCR0);
+	}
+	if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
+		sprs.lpcr	= mfspr(SPRN_LPCR);
+		sprs.hfscr	= mfspr(SPRN_HFSCR);
+		sprs.fscr	= mfspr(SPRN_FSCR);
+		sprs.pid	= mfspr(SPRN_PID);
+		sprs.purr	= mfspr(SPRN_PURR);
+		sprs.spurr	= mfspr(SPRN_SPURR);
+		sprs.dscr	= mfspr(SPRN_DSCR);
+		sprs.wort	= mfspr(SPRN_WORT);
+
+		sprs.mmcra	= mfspr(SPRN_MMCRA);
+		sprs.mmcr0	= mfspr(SPRN_MMCR0);
+		sprs.mmcr1	= mfspr(SPRN_MMCR1);
+		sprs.mmcr2	= mfspr(SPRN_MMCR2);
+
+		sprs.ptcr	= mfspr(SPRN_PTCR);
+		sprs.rpr	= mfspr(SPRN_RPR);
+		sprs.tscr	= mfspr(SPRN_TSCR);
+		sprs.ldbar	= mfspr(SPRN_LDBAR);
+
+		sprs_saved = true;
+
+		atomic_start_thread_idle();
+	}
+
+	sprs.amr	= mfspr(SPRN_AMR);
+	sprs.iamr	= mfspr(SPRN_IAMR);
+	sprs.amor	= mfspr(SPRN_AMOR);
+	sprs.uamor	= mfspr(SPRN_UAMOR);
+
+	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	local_paca->requested_psscr = 0;
+#endif
 
 	psscr = mfspr(SPRN_PSSCR);
-	psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
 
+	WARN_ON_ONCE(!srr1);
+	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+	if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+		unsigned long mmcra;
+
+		/*
+		 * We don't need an isync after the mtsprs here because the
+		 * upcoming mtmsrd is execution synchronizing.
+		 */
+		mtspr(SPRN_AMR,		sprs.amr);
+		mtspr(SPRN_IAMR,	sprs.iamr);
+		mtspr(SPRN_AMOR,	sprs.amor);
+		mtspr(SPRN_UAMOR,	sprs.uamor);
+
+		/*
+		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
+		 * might have been corrupted and needs flushing. We also need
+		 * to reload MMCR0 (see mmcr0 comment above).
+		 */
+		if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+			asm volatile(PPC_INVALIDATE_ERAT);
+			mtspr(SPRN_MMCR0, mmcr0);
+		}
+
+		/*
+		 * DD2.2 and earlier need to set then clear bit 60 in MMCRA
+		 * to ensure the PMU starts running.
+		 */
+		mmcra = mfspr(SPRN_MMCRA);
+		mmcra |= PPC_BIT(60);
+		mtspr(SPRN_MMCRA, mmcra);
+		mmcra &= ~PPC_BIT(60);
+		mtspr(SPRN_MMCRA, mmcra);
+	}
+
+	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+		hmi_exception_realmode(NULL);
+
+	/*
+	 * On POWER9, SRR1 bits do not match exactly as expected.
+	 * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+	 * just always test PSSCR for SPR/TB state loss.
+	 */
+	pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+	if (likely(pls < pnv_first_spr_loss_level)) {
+		if (sprs_saved)
+			atomic_stop_thread_idle();
+		goto out;
+	}
+
+	/* HV state loss */
+	BUG_ON(!sprs_saved);
+
+	atomic_lock_thread_idle();
+
+	if ((*state & core_thread_mask) != 0)
+		goto core_woken;
+
+	/* Per-core SPRs */
+	mtspr(SPRN_PTCR,	sprs.ptcr);
+	mtspr(SPRN_RPR,		sprs.rpr);
+	mtspr(SPRN_TSCR,	sprs.tscr);
+	mtspr(SPRN_LDBAR,	sprs.ldbar);
+
+	if (pls >= pnv_first_tb_loss_level) {
+		/* TB loss */
+		if (opal_resync_timebase() != OPAL_SUCCESS)
+			BUG();
+	}
+
+	/*
+	 * isync after restoring shared SPRs and before unlocking. Unlock
+	 * only contains hwsync which does not necessarily do the right
+	 * thing for SPRs.
+	 */
+	isync();
+
+core_woken:
+	atomic_unlock_and_stop_thread_idle();
+
+	/* Per-thread SPRs */
+	mtspr(SPRN_LPCR,	sprs.lpcr);
+	mtspr(SPRN_HFSCR,	sprs.hfscr);
+	mtspr(SPRN_FSCR,	sprs.fscr);
+	mtspr(SPRN_PID,		sprs.pid);
+	mtspr(SPRN_PURR,	sprs.purr);
+	mtspr(SPRN_SPURR,	sprs.spurr);
+	mtspr(SPRN_DSCR,	sprs.dscr);
+	mtspr(SPRN_WORT,	sprs.wort);
+
+	mtspr(SPRN_MMCRA,	sprs.mmcra);
+	mtspr(SPRN_MMCR0,	sprs.mmcr0);
+	mtspr(SPRN_MMCR1,	sprs.mmcr1);
+	mtspr(SPRN_MMCR2,	sprs.mmcr2);
+
+	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
+
+	if (!radix_enabled())
+		__slb_restore_bolted_realmode();
+
+out:
+	if (mmu_on)
+		mtmsr(MSR_KERNEL);
+
+	return srr1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long power9_offline_stop(unsigned long psscr)
+{
+	unsigned long srr1;
+
+#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	__ppc64_runlatch_off();
-	srr1 = power9_idle_stop(psscr);
+	srr1 = power9_idle_stop(psscr, true);
 	__ppc64_runlatch_on();
+#else
+	/*
+	 * Tell KVM we're entering idle.
+	 * This does not have to be done in real mode because the P9 MMU
+	 * is independent per-thread. Some steppings share radix/hash mode
+	 * between threads, but in that case KVM has a barrier sync in real
+	 * mode before and after switching between radix and hash.
+	 *
+	 * kvm_start_guest must still be called in real mode though, hence
+	 * the false argument.
+	 */
+	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
 
-	fini_irq_for_idle_irqsoff();
+	__ppc64_runlatch_off();
+	srr1 = power9_idle_stop(psscr, false);
+	__ppc64_runlatch_on();
+
+	mtmsr(MSR_KERNEL);
+#endif
 
 	return srr1;
 }
+#endif
 
 void power9_idle_type(unsigned long stop_psscr_val,
 				      unsigned long stop_psscr_mask)
 {
+	unsigned long psscr;
 	unsigned long srr1;
 
-	srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask);
+	if (!prep_irq_for_idle_irqsoff())
+		return;
+
+	psscr = mfspr(SPRN_PSSCR);
+	psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+	__ppc64_runlatch_off();
+	srr1 = power9_idle_stop(psscr, true);
+	__ppc64_runlatch_on();
+
+	fini_irq_for_idle_irqsoff();
+
 	irq_set_pending_from_srr1(srr1);
 }
 
@@ -409,7 +883,7 @@ void pnv_power9_force_smt4_catch(void)
 			atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
 	}
 	/* order setting dont_stop vs testing requested_psscr */
-	mb();
+	smp_mb();
 	for (thr = 0; thr < threads_per_core; ++thr) {
 		if (!paca_ptrs[cpu0+thr]->requested_psscr)
 			++awake_threads;
@@ -481,7 +955,6 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
 unsigned long pnv_cpu_offline(unsigned int cpu)
 {
 	unsigned long srr1;
-	u32 idle_states = pnv_get_supported_cpuidle_states();
 
 	__ppc64_runlatch_off();
 
@@ -492,15 +965,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 		psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
 						pnv_deepest_stop_psscr_val;
 		srr1 = power9_offline_stop(psscr);
-
-	} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
-		   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
-		srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
-	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
-		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-		srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
-	} else if (idle_states & OPAL_PM_NAP_ENABLED) {
-		srr1 = power7_idle_insn(PNV_THREAD_NAP);
+	} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
+		srr1 = power7_offline();
 	} else {
 		/* This is the fallback method. We emulate snooze */
 		while (!generic_check_cpu_restart(cpu)) {
@@ -596,33 +1062,44 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
  * @dt_idle_states: Number of idle state entries
  * Returns 0 on success
  */
-static int __init pnv_power9_idle_init(void)
+static void __init pnv_power9_idle_init(void)
 {
 	u64 max_residency_ns = 0;
 	int i;
 
 	/*
-	 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
-	 * and the pnv_default_stop_{val,mask}.
-	 *
-	 * pnv_first_deep_stop_state should be set to the first stop
-	 * level to cause hypervisor state loss.
-	 *
 	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
 	 * the deepest stop state.
 	 *
 	 * pnv_default_stop_{val,mask} should be set to values corresponding to
-	 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
+	 * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
 	 */
-	pnv_first_deep_stop_state = MAX_STOP_STATE;
+	pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+	pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
 	for (i = 0; i < nr_pnv_idle_states; i++) {
 		int err;
 		struct pnv_idle_states_t *state = &pnv_idle_states[i];
 		u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
 
+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (pnv_first_tb_loss_level > psscr_rl))
+			pnv_first_tb_loss_level = psscr_rl;
+
 		if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
-		    pnv_first_deep_stop_state > psscr_rl)
-			pnv_first_deep_stop_state = psscr_rl;
+		     (pnv_first_spr_loss_level > psscr_rl))
+			pnv_first_spr_loss_level = psscr_rl;
+
+		/*
+		 * The idle code does not deal with TB loss occurring
+		 * in a shallower state than SPR loss, so force it to
+		 * behave like SPRs are lost if TB is lost. POWER9 would
+		 * never encounter this, but a POWER8 core would if it
+		 * implemented the stop instruction. So this is for forward
+		 * compatibility.
+		 */
+		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+		     (pnv_first_spr_loss_level > psscr_rl))
+			pnv_first_spr_loss_level = psscr_rl;
 
 		err = validate_psscr_val_mask(&state->psscr_val,
 					      &state->psscr_mask,
@@ -647,6 +1124,7 @@ static int __init pnv_power9_idle_init(void)
 			pnv_default_stop_val = state->psscr_val;
 			pnv_default_stop_mask = state->psscr_mask;
 			default_stop_found = true;
+			WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
 		}
 	}
 
@@ -666,10 +1144,40 @@ static int __init pnv_power9_idle_init(void)
 			pnv_deepest_stop_psscr_mask);
 	}
 
-	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
-		pnv_first_deep_stop_state);
+	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%lld\n",
+		pnv_first_spr_loss_level);
 
-	return 0;
+	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%lld\n",
+		pnv_first_tb_loss_level);
+}
+
+static void __init pnv_disable_deep_states(void)
+{
+	/*
+	 * The stop-api is unable to restore hypervisor
+	 * resources on wakeup from platform idle states which
+	 * lose full context. So disable such states.
+	 */
+	supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+	pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+	pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+		/*
+		 * Use the default stop state for CPU-Hotplug
+		 * if available.
+		 */
+		if (default_stop_found) {
+			pnv_deepest_stop_psscr_val = pnv_default_stop_val;
+			pnv_deepest_stop_psscr_mask = pnv_default_stop_mask;
+			pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+				pnv_deepest_stop_psscr_val);
+		} else { /* Fallback to snooze loop for CPU-Hotplug */
+			deepest_stop_found = false;
+			pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+		}
+	}
 }
 
 /*
@@ -684,10 +1192,8 @@ static void __init pnv_probe_idle_states(void)
 		return;
 	}
 
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-		if (pnv_power9_idle_init())
-			return;
-	}
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pnv_power9_idle_init();
 
 	for (i = 0; i < nr_pnv_idle_states; i++)
 		supported_cpuidle_states |= pnv_idle_states[i].flags;
@@ -807,11 +1313,33 @@ static int pnv_parse_cpuidle_dt(void)
 
 static int __init pnv_init_idle_states(void)
 {
+	int cpu;
 	int rc = 0;
-	supported_cpuidle_states = 0;
+
+	/* Set up PACA fields */
+	for_each_present_cpu(cpu) {
+		struct paca_struct *p = paca_ptrs[cpu];
+
+		p->idle_state = 0;
+		if (cpu == cpu_first_thread_sibling(cpu))
+			p->idle_state = (1 << threads_per_core) - 1;
+
+		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+			/* P7/P8 nap */
+			p->thread_idle_state = PNV_THREAD_RUNNING;
+		} else {
+			/* P9 stop */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+			p->requested_psscr = 0;
+			atomic_set(&p->dont_stop, 0);
+#endif
+		}
+	}
 
 	/* In case we error out nr_pnv_idle_states will be zero */
 	nr_pnv_idle_states = 0;
+	supported_cpuidle_states = 0;
+
 	if (cpuidle_disable != IDLE_NO_OVERRIDE)
 		goto out;
 	rc = pnv_parse_cpuidle_dt();
@@ -819,27 +1347,40 @@ static int __init pnv_init_idle_states(void)
 		return rc;
 	pnv_probe_idle_states();
 
-	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_entry,
-			PPC_INST_NOP);
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_exit,
-			PPC_INST_NOP);
-	} else {
-		/*
-		 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
-		 * workaround is needed to use fastsleep. Provide sysfs
-		 * control to choose how this workaround has to be applied.
-		 */
-		device_create_file(cpu_subsys.dev_root,
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+			power7_fastsleep_workaround_entry = false;
+			power7_fastsleep_workaround_exit = false;
+		} else {
+			/*
+			 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+			 * workaround is needed to use fastsleep. Provide sysfs
+			 * control to choose how this workaround has to be
+			 * applied.
+			 */
+			device_create_file(cpu_subsys.dev_root,
 				&dev_attr_fastsleep_workaround_applyonce);
-	}
+		}
+
+		update_subcore_sibling_mask();
+
+		if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) {
+			ppc_md.power_save = power7_idle;
+			power7_offline_type = PNV_THREAD_NAP;
+		}
 
-	pnv_alloc_idle_core_states();
+		if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) &&
+			   (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT))
+			power7_offline_type = PNV_THREAD_WINKLE;
+		else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) ||
+			   (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1))
+			power7_offline_type = PNV_THREAD_SLEEP;
+	}
 
-	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
-		ppc_md.power_save = power7_idle;
+	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+		if (pnv_save_sprs_for_deep_states())
+			pnv_disable_deep_states();
+	}
 
 out:
 	return 0;
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 45563004feda..1d7a9fd30dd1 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -183,7 +183,7 @@ static void unsplit_core(void)
 	cpu = smp_processor_id();
 	if (cpu_thread_in_core(cpu) != 0) {
 		while (mfspr(SPRN_HID0) & mask)
-			power7_idle_insn(PNV_THREAD_NAP);
+			power7_idle_type(PNV_THREAD_NAP);
 
 		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
 		return;
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index a0f44f992360..77197110e900 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2431,7 +2431,9 @@ static void dump_one_paca(int cpu)
 	DUMP(p, irq_happened, "%#-*x");
 	DUMP(p, io_sync, "%#-*x");
 	DUMP(p, irq_work_pending, "%#-*x");
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	DUMP(p, nap_state_lost, "%#-*x");
+#endif
 	DUMP(p, sprg_vdso, "%#-*llx");
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -2439,19 +2441,16 @@ static void dump_one_paca(int cpu)
 #endif
 
 #ifdef CONFIG_PPC_POWERNV
-	DUMP(p, core_idle_state_ptr, "%-*px");
-	DUMP(p, thread_idle_state, "%#-*x");
-	DUMP(p, thread_mask, "%#-*x");
-	DUMP(p, subcore_sibling_mask, "%#-*x");
-	DUMP(p, requested_psscr, "%#-*llx");
-	DUMP(p, stop_sprs.pid, "%#-*llx");
-	DUMP(p, stop_sprs.ldbar, "%#-*llx");
-	DUMP(p, stop_sprs.fscr, "%#-*llx");
-	DUMP(p, stop_sprs.hfscr, "%#-*llx");
-	DUMP(p, stop_sprs.mmcr1, "%#-*llx");
-	DUMP(p, stop_sprs.mmcr2, "%#-*llx");
-	DUMP(p, stop_sprs.mmcra, "%#-*llx");
-	DUMP(p, dont_stop.counter, "%#-*x");
+	DUMP(p, idle_state, "%#-*lx");
+	if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
+		DUMP(p, thread_idle_state, "%#-*x");
+		DUMP(p, subcore_sibling_mask, "%#-*x");
+	} else {
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		DUMP(p, requested_psscr, "%#-*llx");
+		DUMP(p, dont_stop.counter, "%#-*x");
+#endif
+	}
 #endif
 
 	DUMP(p, accounting.utime, "%#-*lx");
-- 
2.20.1


^ permalink raw reply related

* [PATCH v10 0/2] powerpc/64s: reimplement book3s idle code in C
From: Nicholas Piggin @ 2019-04-28 11:45 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Gautham R . Shenoy, kvm-ppc, Nicholas Piggin

The KVM code is in better shape now and survives the various tests I
came up with, so it should be ready for more review.

I won't post it again with the KVM part split out unless significant
changes are required there. As explained in the comments for patch 1,
the split results in some intermediate KVM issues, but the split is
supposed to make KVM review easier.

Gautham, I left your review tag: there have been quite a lot of KVM
changes since you reviewed it, but the non-KVM patch is not much
different. Let me know if I should drop it.

Thanks,
Nick

Nicholas Piggin (2):
  powerpc/64s: reimplement book3s idle code in C
  powerpc/64s: KVM update for reimplement book3s idle code in C

 arch/powerpc/include/asm/cpuidle.h       |   19 +-
 arch/powerpc/include/asm/paca.h          |   40 +-
 arch/powerpc/include/asm/processor.h     |    9 +-
 arch/powerpc/include/asm/reg.h           |    8 +-
 arch/powerpc/kernel/asm-offsets.c        |   18 -
 arch/powerpc/kernel/exceptions-64s.S     |   23 +-
 arch/powerpc/kernel/idle_book3s.S        | 1099 +++-------------------
 arch/powerpc/kernel/setup-common.c       |    4 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  118 ++-
 arch/powerpc/platforms/powernv/idle.c    |  902 ++++++++++++++----
 arch/powerpc/platforms/powernv/subcore.c |    2 +-
 arch/powerpc/xmon/xmon.c                 |   24 +-
 12 files changed, 1009 insertions(+), 1257 deletions(-)

-- 
2.20.1

^ permalink raw reply

* Re: [PATCHv2] kernel/crash: make parse_crashkernel()'s return value more indicant
From: Dave Young @ 2019-04-28  8:37 UTC (permalink / raw)
  To: Pingfan Liu
  Cc: Rich Felker, linux-ia64, Julien Thierry, Yangtao Li,
	Palmer Dabbelt, Heiko Carstens, Stefan Agner, linux-mips,
	Paul Mackerras, H. Peter Anvin, Thomas Gleixner, Logan Gunthorpe,
	linux-s390, Florian Fainelli, Yoshinori Sato, linux-sh, x86,
	Russell King, Ingo Molnar, Hari Bathini, Catalin Marinas,
	James Hogan, Fenghua Yu, Tony Luck, Will Deacon, Johannes Weiner,
	Borislav Petkov, David Hildenbrand, linux-arm-kernel, Jens Axboe,
	Matthias Brugger, Baoquan He, Ard Biesheuvel, Robin Murphy, kexec,
	LKML, Ralf Baechle, Thomas Bogendoerfer, Paul Burton,
	Greg Kroah-Hartman, Martin Schwidefsky, Andrew Morton,
	linuxppc-dev, Greg Hackmann
In-Reply-To: <CAFgQCTstd667wP6g+maxYekz4u3iBR2R=FHUiS1V=XxTs6MKUw@mail.gmail.com>

On 04/25/19 at 04:20pm, Pingfan Liu wrote:
> On Wed, Apr 24, 2019 at 4:31 PM Matthias Brugger <mbrugger@suse.com> wrote:
> >
> >
> [...]
> > > @@ -139,6 +141,8 @@ static int __init parse_crashkernel_simple(char *cmdline,
> > >               pr_warn("crashkernel: unrecognized char: %c\n", *cur);
> > >               return -EINVAL;
> > >       }
> > > +     if (*crash_size == 0)
> > > +             return -EINVAL;
> >
> > This covers the case where I pass an argument like "crashkernel=0M" ?
> > Can't we fix that by using kstrtoull() in memparse and check if the return value
> > is < 0? In that case we could return without updating the retptr and we will be
> > fine.
> >
> It seems that kstrtoull() treats 0M as an invalid parameter, while
> simple_strtoull() does not.
> 
> If changed as you suggest, then all the callers of memparse()
> will treat 0M as an invalid parameter. This affects many components
> besides kexec.  Not sure whether this can be done or not.

simple_strtoull is obsolete; moving to kstrtoull is the right way.

$ git grep memparse|wc
    158     950   10479

Apart from some documentation/tools etc., there are still a lot of callers
which directly use the return value as the ull number without error
checking.

So it would be good to mark memparse as obsolete as well in
lib/cmdline.c, and introduce a new function, e.g. kmemparse(), that uses
kstrtoull, returns a real error code, and saves the size in an
argument like &size.  Then update X86 crashkernel code to use it.

Thanks
Dave

^ permalink raw reply

* [GIT PULL] Please pull powerpc/linux.git powerpc-5.1-6 tag
From: Michael Ellerman @ 2019-04-28  6:55 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: aik, linuxppc-dev, linux-kernel

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hi Linus,

Please pull some more powerpc fixes for 5.1.

I was 50/50 on whether these were worthy of sending at rc6, but decided I would
send them as they're in obscure areas of the code and they do fix user-visible
bugs.

cheers


The following changes since commit cf7cf6977f531acd5dfe55250d0ee8cbbb6f1ae8:

  powerpc/mm: Define MAX_PHYSMEM_BITS for all 64-bit configs (2019-04-10 14:45:57 +1000)

are available in the git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git tags/powerpc-5.1-6

for you to fetch changes up to 7a3a4d763837d3aa654cd1059030950410c04d77:

  powerpc/mm_iommu: Allow pinning large regions (2019-04-17 21:36:51 +1000)

- ------------------------------------------------------------------
powerpc fixes for 5.1 #6

A one-liner to make our Radix MMU support depend on HUGETLB_PAGE. We use some of
the hugetlb inlines (eg. pud_huge()) when operating on the linear mapping and if
they're compiled into empty wrappers we can corrupt memory.

Then two fixes to our VFIO IOMMU code. The first is not a regression but fixes
the locking to avoid a user-triggerable deadlock.

The second does fix a regression since rc1, and depends on the first fix. It
makes it possible to run guests with large amounts of memory again (~256GB).

Thanks to:
  Alexey Kardashevskiy.

- ------------------------------------------------------------------
Alexey Kardashevskiy (2):
      powerpc/mm_iommu: Fix potential deadlock
      powerpc/mm_iommu: Allow pinning large regions

Michael Ellerman (1):
      powerpc/mm/radix: Make Radix require HUGETLB_PAGE


 arch/powerpc/configs/skiroot_defconfig |  1 +
 arch/powerpc/mm/mmu_context_iommu.c    | 97 ++++++++++++++++++++--------------
 arch/powerpc/platforms/Kconfig.cputype |  2 +-
 3 files changed, 60 insertions(+), 40 deletions(-)
-----BEGIN PGP SIGNATURE-----

iQIcBAEBAgAGBQJcxU1hAAoJEFHr6jzI4aWAafIP/2Bg4WFdvVfmfMFSxZPP/5Rg
haS/4x6A6JJCVnxn0czaIUzY01s2Bgex4xAexoP1GtFTnNqXpWaxoPLveQQp0n4s
lrRC2e5R2O5d9CHVC+qzkUk16Cd6RjrXLzz1YhJo2kQrERJ/CcPQPF2SNkg08kyz
PtqrQVz2KMQsKiV/wPILwmOGLrxcJfH+01Z4/hbmJzveb9l4/SRN4aVZC12Wv7jQ
HDszTeDTNpFpTysA+cqC95t0ZQRK+Hb0c23zsDqVVFZxAwf083ZcU9IXwVdhoSsp
yXaoN/EFdTeVt8A4OL0/ER4wwyx1ShahvR1LuPDZ1x5/rzV0msExsNfRUOqeGUww
K9KnYEi5wWes/o+aLgf4adG96swmTvVChOSe7OUpod1hPO3UZ8vMpeAt0XB527TY
ptAcq7d6kpimd84A7qWM4lmC2QT9K5TqE9NoVpeCDgHbDqZZlWG2oMiXQXk/Thu2
OvpQwrbvX+vYgPgwuQbdyGnlNoxiTVT4oorsMo4YNIUi8OJB3Gu8ROL59tBjY/L+
jZFTNvj3fv/XYGy9mBBJhTpCG4SDz0D593iomrui+oAYczUkuNvlUv6lf9DCbXlU
mVbuqMbFin1JUtPngXROQbKkaQ8pf0QgGBS3y7ipPrV/3vAyJ66SqIcrm8XsXyzl
kAP6787TNrq3fosnfgBb
=EDog
-----END PGP SIGNATURE-----

^ permalink raw reply

* Re: [PATCH stable v4.4 00/52] powerpc spectre backports for 4.4
From: Michael Ellerman @ 2019-04-28  6:20 UTC (permalink / raw)
  To: Diana Madalina Craciun, stable@vger.kernel.org,
	gregkh@linuxfoundation.org
  Cc: linuxppc-dev@ozlabs.org, msuchanek@suse.de, npiggin@gmail.com
In-Reply-To: <VI1PR0401MB2463F6397FDC281DFDFF61FBFF220@VI1PR0401MB2463.eurprd04.prod.outlook.com>

Diana Madalina Craciun <diana.craciun@nxp.com> writes:
> Hi Michael,
>
> There are some missing NXP Spectre v2 patches. I can send them
> separately if the series will be accepted. I have merged them, but I did
> not test them, I was sick today and incapable of doing that.

No worries, there's no rush :)

Sorry I missed them, I thought I had a list that included everything.
Which commits did I miss?

I guess post them as a reply to this thread? That way whether the series
is merged by Greg or not, there's a record here of what the backports
look like.

cheers

> On 4/21/2019 5:21 PM, Michael Ellerman wrote:
>> -----BEGIN PGP SIGNED MESSAGE-----
>> Hash: SHA1
>>
>> Hi Greg/Sasha,
>>
>> Please queue up these powerpc patches for 4.4 if you have no objections.
>>
>> cheers
>>
>>
>> Christophe Leroy (1):
>>   powerpc/fsl: Fix the flush of branch predictor.
>>
>> Diana Craciun (10):
>>   powerpc/64: Disable the speculation barrier from the command line
>>   powerpc/64: Make stf barrier PPC_BOOK3S_64 specific.
>>   powerpc/64: Make meltdown reporting Book3S 64 specific
>>   powerpc/fsl: Add barrier_nospec implementation for NXP PowerPC Book3E
>>   powerpc/fsl: Add infrastructure to fixup branch predictor flush
>>   powerpc/fsl: Add macro to flush the branch predictor
>>   powerpc/fsl: Fix spectre_v2 mitigations reporting
>>   powerpc/fsl: Add nospectre_v2 command line argument
>>   powerpc/fsl: Flush the branch predictor at each kernel entry (64bit)
>>   powerpc/fsl: Update Spectre v2 reporting
>>
>> Mauricio Faria de Oliveira (4):
>>   powerpc/rfi-flush: Differentiate enabled and patched flush types
>>   powerpc/pseries: Fix clearing of security feature flags
>>   powerpc: Move default security feature flags
>>   powerpc/pseries: Restore default security feature flags on setup
>>
>> Michael Ellerman (29):
>>   powerpc/xmon: Add RFI flush related fields to paca dump
>>   powerpc/pseries: Support firmware disable of RFI flush
>>   powerpc/powernv: Support firmware disable of RFI flush
>>   powerpc/rfi-flush: Move the logic to avoid a redo into the debugfs
>>     code
>>   powerpc/rfi-flush: Make it possible to call setup_rfi_flush() again
>>   powerpc/rfi-flush: Always enable fallback flush on pseries
>>   powerpc/pseries: Add new H_GET_CPU_CHARACTERISTICS flags
>>   powerpc/rfi-flush: Call setup_rfi_flush() after LPM migration
>>   powerpc: Add security feature flags for Spectre/Meltdown
>>   powerpc/pseries: Set or clear security feature flags
>>   powerpc/powernv: Set or clear security feature flags
>>   powerpc/64s: Move cpu_show_meltdown()
>>   powerpc/64s: Enhance the information in cpu_show_meltdown()
>>   powerpc/powernv: Use the security flags in pnv_setup_rfi_flush()
>>   powerpc/pseries: Use the security flags in pseries_setup_rfi_flush()
>>   powerpc/64s: Wire up cpu_show_spectre_v1()
>>   powerpc/64s: Wire up cpu_show_spectre_v2()
>>   powerpc/64s: Fix section mismatch warnings from setup_rfi_flush()
>>   powerpc/64: Use barrier_nospec in syscall entry
>>   powerpc: Use barrier_nospec in copy_from_user()
>>   powerpc64s: Show ori31 availability in spectre_v1 sysfs file not v2
>>   powerpc/64: Add CONFIG_PPC_BARRIER_NOSPEC
>>   powerpc/64: Call setup_barrier_nospec() from setup_arch()
>>   powerpc/asm: Add a patch_site macro & helpers for patching
>>     instructions
>>   powerpc/64s: Add new security feature flags for count cache flush
>>   powerpc/64s: Add support for software count cache flush
>>   powerpc/pseries: Query hypervisor for count cache flush settings
>>   powerpc/powernv: Query firmware for count cache flush settings
>>   powerpc/security: Fix spectre_v2 reporting
>>
>> Michael Neuling (1):
>>   powerpc: Avoid code patching freed init sections
>>
>> Michal Suchanek (5):
>>   powerpc/64s: Add barrier_nospec
>>   powerpc/64s: Add support for ori barrier_nospec patching
>>   powerpc/64s: Patch barrier_nospec in modules
>>   powerpc/64s: Enable barrier_nospec based on firmware settings
>>   powerpc/64s: Enhance the information in cpu_show_spectre_v1()
>>
>> Nicholas Piggin (2):
>>   powerpc/64s: Improve RFI L1-D cache flush fallback
>>   powerpc/64s: Add support for a store forwarding barrier at kernel
>>     entry/exit
>>
>>  arch/powerpc/Kconfig                         |   7 +-
>>  arch/powerpc/include/asm/asm-prototypes.h    |  21 +
>>  arch/powerpc/include/asm/barrier.h           |  21 +
>>  arch/powerpc/include/asm/code-patching-asm.h |  18 +
>>  arch/powerpc/include/asm/code-patching.h     |   2 +
>>  arch/powerpc/include/asm/exception-64s.h     |  35 ++
>>  arch/powerpc/include/asm/feature-fixups.h    |  40 ++
>>  arch/powerpc/include/asm/hvcall.h            |   5 +
>>  arch/powerpc/include/asm/paca.h              |   3 +-
>>  arch/powerpc/include/asm/ppc-opcode.h        |   1 +
>>  arch/powerpc/include/asm/ppc_asm.h           |  11 +
>>  arch/powerpc/include/asm/security_features.h |  92 ++++
>>  arch/powerpc/include/asm/setup.h             |  23 +-
>>  arch/powerpc/include/asm/uaccess.h           |  18 +-
>>  arch/powerpc/kernel/Makefile                 |   1 +
>>  arch/powerpc/kernel/asm-offsets.c            |   3 +-
>>  arch/powerpc/kernel/entry_64.S               |  69 +++
>>  arch/powerpc/kernel/exceptions-64e.S         |  27 +-
>>  arch/powerpc/kernel/exceptions-64s.S         |  98 +++--
>>  arch/powerpc/kernel/module.c                 |  10 +-
>>  arch/powerpc/kernel/security.c               | 433 +++++++++++++++++++
>>  arch/powerpc/kernel/setup_32.c               |   2 +
>>  arch/powerpc/kernel/setup_64.c               |  50 +--
>>  arch/powerpc/kernel/vmlinux.lds.S            |  33 +-
>>  arch/powerpc/lib/code-patching.c             |  29 ++
>>  arch/powerpc/lib/feature-fixups.c            | 218 +++++++++-
>>  arch/powerpc/mm/mem.c                        |   2 +
>>  arch/powerpc/mm/tlb_low_64e.S                |   7 +
>>  arch/powerpc/platforms/powernv/setup.c       |  99 +++--
>>  arch/powerpc/platforms/pseries/mobility.c    |   3 +
>>  arch/powerpc/platforms/pseries/pseries.h     |   2 +
>>  arch/powerpc/platforms/pseries/setup.c       |  88 +++-
>>  arch/powerpc/xmon/xmon.c                     |   2 +
>>  33 files changed, 1345 insertions(+), 128 deletions(-)
>>  create mode 100644 arch/powerpc/include/asm/asm-prototypes.h
>>  create mode 100644 arch/powerpc/include/asm/code-patching-asm.h
>>  create mode 100644 arch/powerpc/include/asm/security_features.h
>>  create mode 100644 arch/powerpc/kernel/security.c
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 58a1fa979655..01b6c00a7060 100644
>> - --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -136,7 +136,7 @@ config PPC
>>  	select GENERIC_SMP_IDLE_THREAD
>>  	select GENERIC_CMOS_UPDATE
>>  	select GENERIC_TIME_VSYSCALL_OLD
>> - -	select GENERIC_CPU_VULNERABILITIES	if PPC_BOOK3S_64
>> +	select GENERIC_CPU_VULNERABILITIES	if PPC_BARRIER_NOSPEC
>>  	select GENERIC_CLOCKEVENTS
>>  	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
>>  	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>> @@ -162,6 +162,11 @@ config PPC
>>  	select ARCH_HAS_DMA_SET_COHERENT_MASK
>>  	select HAVE_ARCH_SECCOMP_FILTER
>>  
>> +config PPC_BARRIER_NOSPEC
>> +    bool
>> +    default y
>> +    depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E
>> +
>>  config GENERIC_CSUM
>>  	def_bool CPU_LITTLE_ENDIAN
>>  
>> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
>> new file mode 100644
>> index 000000000000..8944c55591cf
>> - --- /dev/null
>> +++ b/arch/powerpc/include/asm/asm-prototypes.h
>> @@ -0,0 +1,21 @@
>> +#ifndef _ASM_POWERPC_ASM_PROTOTYPES_H
>> +#define _ASM_POWERPC_ASM_PROTOTYPES_H
>> +/*
>> + * This file is for prototypes of C functions that are only called
>> + * from asm, and any associated variables.
>> + *
>> + * Copyright 2016, Daniel Axtens, IBM Corporation.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; either version 2
>> + * of the License, or (at your option) any later version.
>> + */
>> +
>> +/* Patch sites */
>> +extern s32 patch__call_flush_count_cache;
>> +extern s32 patch__flush_count_cache_return;
>> +
>> +extern long flush_count_cache;
>> +
>> +#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
>> diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
>> index b9e16855a037..e7cb72cdb2ba 100644
>> - --- a/arch/powerpc/include/asm/barrier.h
>> +++ b/arch/powerpc/include/asm/barrier.h
>> @@ -92,4 +92,25 @@ do {									\
>>  #define smp_mb__after_atomic()      smp_mb()
>>  #define smp_mb__before_spinlock()   smp_mb()
>>  
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +#define NOSPEC_BARRIER_SLOT   nop
>> +#elif defined(CONFIG_PPC_FSL_BOOK3E)
>> +#define NOSPEC_BARRIER_SLOT   nop; nop
>> +#endif
>> +
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +/*
>> + * Prevent execution of subsequent instructions until preceding branches have
>> + * been fully resolved and are no longer executing speculatively.
>> + */
>> +#define barrier_nospec_asm NOSPEC_BARRIER_FIXUP_SECTION; NOSPEC_BARRIER_SLOT
>> +
>> +// This also acts as a compiler barrier due to the memory clobber.
>> +#define barrier_nospec() asm (stringify_in_c(barrier_nospec_asm) ::: "memory")
>> +
>> +#else /* !CONFIG_PPC_BARRIER_NOSPEC */
>> +#define barrier_nospec_asm
>> +#define barrier_nospec()
>> +#endif /* CONFIG_PPC_BARRIER_NOSPEC */
>> +
>>  #endif /* _ASM_POWERPC_BARRIER_H */
>> diff --git a/arch/powerpc/include/asm/code-patching-asm.h b/arch/powerpc/include/asm/code-patching-asm.h
>> new file mode 100644
>> index 000000000000..ed7b1448493a
>> - --- /dev/null
>> +++ b/arch/powerpc/include/asm/code-patching-asm.h
>> @@ -0,0 +1,18 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +/*
>> + * Copyright 2018, Michael Ellerman, IBM Corporation.
>> + */
>> +#ifndef _ASM_POWERPC_CODE_PATCHING_ASM_H
>> +#define _ASM_POWERPC_CODE_PATCHING_ASM_H
>> +
>> +/* Define a "site" that can be patched */
>> +.macro patch_site label name
>> +	.pushsection ".rodata"
>> +	.balign 4
>> +	.global \name
>> +\name:
>> +	.4byte	\label - .
>> +	.popsection
>> +.endm
>> +
>> +#endif /* _ASM_POWERPC_CODE_PATCHING_ASM_H */
>> diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
>> index 840a5509b3f1..a734b4b34d26 100644
>> - --- a/arch/powerpc/include/asm/code-patching.h
>> +++ b/arch/powerpc/include/asm/code-patching.h
>> @@ -28,6 +28,8 @@ unsigned int create_cond_branch(const unsigned int *addr,
>>  				unsigned long target, int flags);
>>  int patch_branch(unsigned int *addr, unsigned long target, int flags);
>>  int patch_instruction(unsigned int *addr, unsigned int instr);
>> +int patch_instruction_site(s32 *addr, unsigned int instr);
>> +int patch_branch_site(s32 *site, unsigned long target, int flags);
>>  
>>  int instr_is_relative_branch(unsigned int instr);
>>  int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
>> diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
>> index 9bddbec441b8..3ed536bec462 100644
>> - --- a/arch/powerpc/include/asm/exception-64s.h
>> +++ b/arch/powerpc/include/asm/exception-64s.h
>> @@ -50,6 +50,27 @@
>>  #define EX_PPR		88	/* SMT thread status register (priority) */
>>  #define EX_CTR		96
>>  
>> +#define STF_ENTRY_BARRIER_SLOT						\
>> +	STF_ENTRY_BARRIER_FIXUP_SECTION;				\
>> +	nop;								\
>> +	nop;								\
>> +	nop
>> +
>> +#define STF_EXIT_BARRIER_SLOT						\
>> +	STF_EXIT_BARRIER_FIXUP_SECTION;					\
>> +	nop;								\
>> +	nop;								\
>> +	nop;								\
>> +	nop;								\
>> +	nop;								\
>> +	nop
>> +
>> +/*
>> + * r10 must be free to use, r13 must be paca
>> + */
>> +#define INTERRUPT_TO_KERNEL						\
>> +	STF_ENTRY_BARRIER_SLOT
>> +
>>  /*
>>   * Macros for annotating the expected destination of (h)rfid
>>   *
>> @@ -66,16 +87,19 @@
>>  	rfid
>>  
>>  #define RFI_TO_USER							\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	rfid;								\
>>  	b	rfi_flush_fallback
>>  
>>  #define RFI_TO_USER_OR_KERNEL						\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	rfid;								\
>>  	b	rfi_flush_fallback
>>  
>>  #define RFI_TO_GUEST							\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	rfid;								\
>>  	b	rfi_flush_fallback
>> @@ -84,21 +108,25 @@
>>  	hrfid
>>  
>>  #define HRFI_TO_USER							\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	hrfid;								\
>>  	b	hrfi_flush_fallback
>>  
>>  #define HRFI_TO_USER_OR_KERNEL						\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	hrfid;								\
>>  	b	hrfi_flush_fallback
>>  
>>  #define HRFI_TO_GUEST							\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	hrfid;								\
>>  	b	hrfi_flush_fallback
>>  
>>  #define HRFI_TO_UNKNOWN							\
>> +	STF_EXIT_BARRIER_SLOT;						\
>>  	RFI_FLUSH_SLOT;							\
>>  	hrfid;								\
>>  	b	hrfi_flush_fallback
>> @@ -226,6 +254,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
>>  #define __EXCEPTION_PROLOG_1(area, extra, vec)				\
>>  	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
>>  	OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR);		\
>> +	INTERRUPT_TO_KERNEL;						\
>>  	SAVE_CTR(r10, area);						\
>>  	mfcr	r9;							\
>>  	extra(vec);							\
>> @@ -512,6 +541,12 @@ label##_relon_hv:						\
>>  #define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
>>  	__MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
>>  
>> +#define MASKABLE_EXCEPTION_OOL(vec, label)				\
>> +	.globl label##_ool;						\
>> +label##_ool:								\
>> +	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_PR, vec);		\
>> +	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_STD);
>> +
>>  #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)			\
>>  	. = loc;							\
>>  	.globl label##_pSeries;						\
>> diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
>> index 7068bafbb2d6..145a37ab2d3e 100644
>> - --- a/arch/powerpc/include/asm/feature-fixups.h
>> +++ b/arch/powerpc/include/asm/feature-fixups.h
>> @@ -184,6 +184,22 @@ label##3:					       	\
>>  	FTR_ENTRY_OFFSET label##1b-label##3b;		\
>>  	.popsection;
>>  
>> +#define STF_ENTRY_BARRIER_FIXUP_SECTION			\
>> +953:							\
>> +	.pushsection __stf_entry_barrier_fixup,"a";	\
>> +	.align 2;					\
>> +954:							\
>> +	FTR_ENTRY_OFFSET 953b-954b;			\
>> +	.popsection;
>> +
>> +#define STF_EXIT_BARRIER_FIXUP_SECTION			\
>> +955:							\
>> +	.pushsection __stf_exit_barrier_fixup,"a";	\
>> +	.align 2;					\
>> +956:							\
>> +	FTR_ENTRY_OFFSET 955b-956b;			\
>> +	.popsection;
>> +
>>  #define RFI_FLUSH_FIXUP_SECTION				\
>>  951:							\
>>  	.pushsection __rfi_flush_fixup,"a";		\
>> @@ -192,10 +208,34 @@ label##3:					       	\
>>  	FTR_ENTRY_OFFSET 951b-952b;			\
>>  	.popsection;
>>  
>> +#define NOSPEC_BARRIER_FIXUP_SECTION			\
>> +953:							\
>> +	.pushsection __barrier_nospec_fixup,"a";	\
>> +	.align 2;					\
>> +954:							\
>> +	FTR_ENTRY_OFFSET 953b-954b;			\
>> +	.popsection;
>> +
>> +#define START_BTB_FLUSH_SECTION			\
>> +955:							\
>> +
>> +#define END_BTB_FLUSH_SECTION			\
>> +956:							\
>> +	.pushsection __btb_flush_fixup,"a";	\
>> +	.align 2;							\
>> +957:						\
>> +	FTR_ENTRY_OFFSET 955b-957b;			\
>> +	FTR_ENTRY_OFFSET 956b-957b;			\
>> +	.popsection;
>>  
>>  #ifndef __ASSEMBLY__
>>  
>> +extern long stf_barrier_fallback;
>> +extern long __start___stf_entry_barrier_fixup, __stop___stf_entry_barrier_fixup;
>> +extern long __start___stf_exit_barrier_fixup, __stop___stf_exit_barrier_fixup;
>>  extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
>> +extern long __start___barrier_nospec_fixup, __stop___barrier_nospec_fixup;
>> +extern long __start__btb_flush_fixup, __stop__btb_flush_fixup;
>>  
>>  #endif
>>  
>> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
>> index 449bbb87c257..b57db9d09db9 100644
>> - --- a/arch/powerpc/include/asm/hvcall.h
>> +++ b/arch/powerpc/include/asm/hvcall.h
>> @@ -292,10 +292,15 @@
>>  #define H_CPU_CHAR_L1D_FLUSH_ORI30	(1ull << 61) // IBM bit 2
>>  #define H_CPU_CHAR_L1D_FLUSH_TRIG2	(1ull << 60) // IBM bit 3
>>  #define H_CPU_CHAR_L1D_THREAD_PRIV	(1ull << 59) // IBM bit 4
>> +#define H_CPU_CHAR_BRANCH_HINTS_HONORED	(1ull << 58) // IBM bit 5
>> +#define H_CPU_CHAR_THREAD_RECONFIG_CTRL	(1ull << 57) // IBM bit 6
>> +#define H_CPU_CHAR_COUNT_CACHE_DISABLED	(1ull << 56) // IBM bit 7
>> +#define H_CPU_CHAR_BCCTR_FLUSH_ASSIST	(1ull << 54) // IBM bit 9
>>  
>>  #define H_CPU_BEHAV_FAVOUR_SECURITY	(1ull << 63) // IBM bit 0
>>  #define H_CPU_BEHAV_L1D_FLUSH_PR	(1ull << 62) // IBM bit 1
>>  #define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ull << 61) // IBM bit 2
>> +#define H_CPU_BEHAV_FLUSH_COUNT_CACHE	(1ull << 58) // IBM bit 5
>>  
>>  #ifndef __ASSEMBLY__
>>  #include <linux/types.h>
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index 45e2aefece16..08e5df3395fa 100644
>> - --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -199,8 +199,7 @@ struct paca_struct {
>>  	 */
>>  	u64 exrfi[13] __aligned(0x80);
>>  	void *rfi_flush_fallback_area;
>> - -	u64 l1d_flush_congruence;
>> - -	u64 l1d_flush_sets;
>> +	u64 l1d_flush_size;
>>  #endif
>>  };
>>  
>> diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
>> index 7ab04fc59e24..faf1bb045dee 100644
>> - --- a/arch/powerpc/include/asm/ppc-opcode.h
>> +++ b/arch/powerpc/include/asm/ppc-opcode.h
>> @@ -147,6 +147,7 @@
>>  #define PPC_INST_LWSYNC			0x7c2004ac
>>  #define PPC_INST_SYNC			0x7c0004ac
>>  #define PPC_INST_SYNC_MASK		0xfc0007fe
>> +#define PPC_INST_ISYNC			0x4c00012c
>>  #define PPC_INST_LXVD2X			0x7c000698
>>  #define PPC_INST_MCRXR			0x7c000400
>>  #define PPC_INST_MCRXR_MASK		0xfc0007fe
>> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
>> index 160bb2311bbb..d219816b3e19 100644
>> - --- a/arch/powerpc/include/asm/ppc_asm.h
>> +++ b/arch/powerpc/include/asm/ppc_asm.h
>> @@ -821,4 +821,15 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,945)
>>  	.long 0x2400004c  /* rfid				*/
>>  #endif /* !CONFIG_PPC_BOOK3E */
>>  #endif /*  __ASSEMBLY__ */
>> +
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +#define BTB_FLUSH(reg)			\
>> +	lis reg,BUCSR_INIT@h;		\
>> +	ori reg,reg,BUCSR_INIT@l;	\
>> +	mtspr SPRN_BUCSR,reg;		\
>> +	isync;
>> +#else
>> +#define BTB_FLUSH(reg)
>> +#endif /* CONFIG_PPC_FSL_BOOK3E */
>> +
>>  #endif /* _ASM_POWERPC_PPC_ASM_H */
>> diff --git a/arch/powerpc/include/asm/security_features.h b/arch/powerpc/include/asm/security_features.h
>> new file mode 100644
>> index 000000000000..759597bf0fd8
>> - --- /dev/null
>> +++ b/arch/powerpc/include/asm/security_features.h
>> @@ -0,0 +1,92 @@
>> +/* SPDX-License-Identifier: GPL-2.0+ */
>> +/*
>> + * Security related feature bit definitions.
>> + *
>> + * Copyright 2018, Michael Ellerman, IBM Corporation.
>> + */
>> +
>> +#ifndef _ASM_POWERPC_SECURITY_FEATURES_H
>> +#define _ASM_POWERPC_SECURITY_FEATURES_H
>> +
>> +
>> +extern unsigned long powerpc_security_features;
>> +extern bool rfi_flush;
>> +
>> +/* These are bit flags */
>> +enum stf_barrier_type {
>> +	STF_BARRIER_NONE	= 0x1,
>> +	STF_BARRIER_FALLBACK	= 0x2,
>> +	STF_BARRIER_EIEIO	= 0x4,
>> +	STF_BARRIER_SYNC_ORI	= 0x8,
>> +};
>> +
>> +void setup_stf_barrier(void);
>> +void do_stf_barrier_fixups(enum stf_barrier_type types);
>> +void setup_count_cache_flush(void);
>> +
>> +static inline void security_ftr_set(unsigned long feature)
>> +{
>> +	powerpc_security_features |= feature;
>> +}
>> +
>> +static inline void security_ftr_clear(unsigned long feature)
>> +{
>> +	powerpc_security_features &= ~feature;
>> +}
>> +
>> +static inline bool security_ftr_enabled(unsigned long feature)
>> +{
>> +	return !!(powerpc_security_features & feature);
>> +}
>> +
>> +
>> +// Features indicating support for Spectre/Meltdown mitigations
>> +
>> +// The L1-D cache can be flushed with ori r30,r30,0
>> +#define SEC_FTR_L1D_FLUSH_ORI30		0x0000000000000001ull
>> +
>> +// The L1-D cache can be flushed with mtspr 882,r0 (aka SPRN_TRIG2)
>> +#define SEC_FTR_L1D_FLUSH_TRIG2		0x0000000000000002ull
>> +
>> +// ori r31,r31,0 acts as a speculation barrier
>> +#define SEC_FTR_SPEC_BAR_ORI31		0x0000000000000004ull
>> +
>> +// Speculation past bctr is disabled
>> +#define SEC_FTR_BCCTRL_SERIALISED	0x0000000000000008ull
>> +
>> +// Entries in L1-D are private to a SMT thread
>> +#define SEC_FTR_L1D_THREAD_PRIV		0x0000000000000010ull
>> +
>> +// Indirect branch prediction cache disabled
>> +#define SEC_FTR_COUNT_CACHE_DISABLED	0x0000000000000020ull
>> +
>> +// bcctr 2,0,0 triggers a hardware assisted count cache flush
>> +#define SEC_FTR_BCCTR_FLUSH_ASSIST	0x0000000000000800ull
>> +
>> +
>> +// Features indicating need for Spectre/Meltdown mitigations
>> +
>> +// The L1-D cache should be flushed on MSR[HV] 1->0 transition (hypervisor to guest)
>> +#define SEC_FTR_L1D_FLUSH_HV		0x0000000000000040ull
>> +
>> +// The L1-D cache should be flushed on MSR[PR] 0->1 transition (kernel to userspace)
>> +#define SEC_FTR_L1D_FLUSH_PR		0x0000000000000080ull
>> +
>> +// A speculation barrier should be used for bounds checks (Spectre variant 1)
>> +#define SEC_FTR_BNDS_CHK_SPEC_BAR	0x0000000000000100ull
>> +
>> +// Firmware configuration indicates user favours security over performance
>> +#define SEC_FTR_FAVOUR_SECURITY		0x0000000000000200ull
>> +
>> +// Software required to flush count cache on context switch
>> +#define SEC_FTR_FLUSH_COUNT_CACHE	0x0000000000000400ull
>> +
>> +
>> +// Features enabled by default
>> +#define SEC_FTR_DEFAULT \
>> +	(SEC_FTR_L1D_FLUSH_HV | \
>> +	 SEC_FTR_L1D_FLUSH_PR | \
>> +	 SEC_FTR_BNDS_CHK_SPEC_BAR | \
>> +	 SEC_FTR_FAVOUR_SECURITY)
>> +
>> +#endif /* _ASM_POWERPC_SECURITY_FEATURES_H */
>> diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
>> index 7916b56f2e60..d299479c770b 100644
>> - --- a/arch/powerpc/include/asm/setup.h
>> +++ b/arch/powerpc/include/asm/setup.h
>> @@ -8,6 +8,7 @@ extern void ppc_printk_progress(char *s, unsigned short hex);
>>  
>>  extern unsigned int rtas_data;
>>  extern unsigned long long memory_limit;
>> +extern bool init_mem_is_free;
>>  extern unsigned long klimit;
>>  extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
>>  
>> @@ -36,8 +37,28 @@ enum l1d_flush_type {
>>  	L1D_FLUSH_MTTRIG	= 0x8,
>>  };
>>  
>> - -void __init setup_rfi_flush(enum l1d_flush_type, bool enable);
>> +void setup_rfi_flush(enum l1d_flush_type, bool enable);
>>  void do_rfi_flush_fixups(enum l1d_flush_type types);
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +void setup_barrier_nospec(void);
>> +#else
>> +static inline void setup_barrier_nospec(void) { };
>> +#endif
>> +void do_barrier_nospec_fixups(bool enable);
>> +extern bool barrier_nospec_enabled;
>> +
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +void do_barrier_nospec_fixups_range(bool enable, void *start, void *end);
>> +#else
>> +static inline void do_barrier_nospec_fixups_range(bool enable, void *start, void *end) { };
>> +#endif
>> +
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +void setup_spectre_v2(void);
>> +#else
>> +static inline void setup_spectre_v2(void) {};
>> +#endif
>> +void do_btb_flush_fixups(void);
>>  
>>  #endif /* !__ASSEMBLY__ */
>>  
>> diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
>> index 05f1389228d2..e51ce5a0e221 100644
>> - --- a/arch/powerpc/include/asm/uaccess.h
>> +++ b/arch/powerpc/include/asm/uaccess.h
>> @@ -269,6 +269,7 @@ do {								\
>>  	__chk_user_ptr(ptr);					\
>>  	if (!is_kernel_addr((unsigned long)__gu_addr))		\
>>  		might_fault();					\
>> +	barrier_nospec();					\
>>  	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
>>  	(x) = (__typeof__(*(ptr)))__gu_val;			\
>>  	__gu_err;						\
>> @@ -283,6 +284,7 @@ do {								\
>>  	__chk_user_ptr(ptr);					\
>>  	if (!is_kernel_addr((unsigned long)__gu_addr))		\
>>  		might_fault();					\
>> +	barrier_nospec();					\
>>  	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
>>  	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
>>  	__gu_err;						\
>> @@ -295,8 +297,10 @@ do {								\
>>  	unsigned long  __gu_val = 0;					\
>>  	__typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
>>  	might_fault();							\
>> - -	if (access_ok(VERIFY_READ, __gu_addr, (size)))			\
>> +	if (access_ok(VERIFY_READ, __gu_addr, (size))) {		\
>> +		barrier_nospec();					\
>>  		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
>> +	}								\
>>  	(x) = (__force __typeof__(*(ptr)))__gu_val;				\
>>  	__gu_err;							\
>>  })
>> @@ -307,6 +311,7 @@ do {								\
>>  	unsigned long __gu_val;					\
>>  	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
>>  	__chk_user_ptr(ptr);					\
>> +	barrier_nospec();					\
>>  	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
>>  	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
>>  	__gu_err;						\
>> @@ -323,8 +328,10 @@ extern unsigned long __copy_tofrom_user(void __user *to,
>>  static inline unsigned long copy_from_user(void *to,
>>  		const void __user *from, unsigned long n)
>>  {
>> - -	if (likely(access_ok(VERIFY_READ, from, n)))
>> +	if (likely(access_ok(VERIFY_READ, from, n))) {
>> +		barrier_nospec();
>>  		return __copy_tofrom_user((__force void __user *)to, from, n);
>> +	}
>>  	memset(to, 0, n);
>>  	return n;
>>  }
>> @@ -359,21 +366,27 @@ static inline unsigned long __copy_from_user_inatomic(void *to,
>>  
>>  		switch (n) {
>>  		case 1:
>> +			barrier_nospec();
>>  			__get_user_size(*(u8 *)to, from, 1, ret);
>>  			break;
>>  		case 2:
>> +			barrier_nospec();
>>  			__get_user_size(*(u16 *)to, from, 2, ret);
>>  			break;
>>  		case 4:
>> +			barrier_nospec();
>>  			__get_user_size(*(u32 *)to, from, 4, ret);
>>  			break;
>>  		case 8:
>> +			barrier_nospec();
>>  			__get_user_size(*(u64 *)to, from, 8, ret);
>>  			break;
>>  		}
>>  		if (ret == 0)
>>  			return 0;
>>  	}
>> +
>> +	barrier_nospec();
>>  	return __copy_tofrom_user((__force void __user *)to, from, n);
>>  }
>>  
>> @@ -400,6 +413,7 @@ static inline unsigned long __copy_to_user_inatomic(void __user *to,
>>  		if (ret == 0)
>>  			return 0;
>>  	}
>> +
>>  	return __copy_tofrom_user(to, (__force const void __user *)from, n);
>>  }
>>  
>> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
>> index ba336930d448..22ed3c32fca8 100644
>> - --- a/arch/powerpc/kernel/Makefile
>> +++ b/arch/powerpc/kernel/Makefile
>> @@ -44,6 +44,7 @@ obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
>>  obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
>>  obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
>>  obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
>> +obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o
>>  obj-$(CONFIG_PPC64)		+= vdso64/
>>  obj-$(CONFIG_ALTIVEC)		+= vecemu.o
>>  obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
>> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
>> index d92705e3a0c1..de3c29c51503 100644
>> - --- a/arch/powerpc/kernel/asm-offsets.c
>> +++ b/arch/powerpc/kernel/asm-offsets.c
>> @@ -245,8 +245,7 @@ int main(void)
>>  	DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce));
>>  	DEFINE(PACA_RFI_FLUSH_FALLBACK_AREA, offsetof(struct paca_struct, rfi_flush_fallback_area));
>>  	DEFINE(PACA_EXRFI, offsetof(struct paca_struct, exrfi));
>> - -	DEFINE(PACA_L1D_FLUSH_CONGRUENCE, offsetof(struct paca_struct, l1d_flush_congruence));
>> - -	DEFINE(PACA_L1D_FLUSH_SETS, offsetof(struct paca_struct, l1d_flush_sets));
>> +	DEFINE(PACA_L1D_FLUSH_SIZE, offsetof(struct paca_struct, l1d_flush_size));
>>  #endif
>>  	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
>>  	DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
>> diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
>> index 59be96917369..6d36a4fb4acf 100644
>> - --- a/arch/powerpc/kernel/entry_64.S
>> +++ b/arch/powerpc/kernel/entry_64.S
>> @@ -25,6 +25,7 @@
>>  #include <asm/page.h>
>>  #include <asm/mmu.h>
>>  #include <asm/thread_info.h>
>> +#include <asm/code-patching-asm.h>
>>  #include <asm/ppc_asm.h>
>>  #include <asm/asm-offsets.h>
>>  #include <asm/cputable.h>
>> @@ -36,6 +37,7 @@
>>  #include <asm/hw_irq.h>
>>  #include <asm/context_tracking.h>
>>  #include <asm/tm.h>
>> +#include <asm/barrier.h>
>>  #ifdef CONFIG_PPC_BOOK3S
>>  #include <asm/exception-64s.h>
>>  #else
>> @@ -75,6 +77,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
>>  	std	r0,GPR0(r1)
>>  	std	r10,GPR1(r1)
>>  	beq	2f			/* if from kernel mode */
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +START_BTB_FLUSH_SECTION
>> +	BTB_FLUSH(r10)
>> +END_BTB_FLUSH_SECTION
>> +#endif
>>  	ACCOUNT_CPU_USER_ENTRY(r10, r11)
>>  2:	std	r2,GPR2(r1)
>>  	std	r3,GPR3(r1)
>> @@ -177,6 +184,15 @@ system_call:			/* label this so stack traces look sane */
>>  	clrldi	r8,r8,32
>>  15:
>>  	slwi	r0,r0,4
>> +
>> +	barrier_nospec_asm
>> +	/*
>> +	 * Prevent the load of the handler below (based on the user-passed
>> +	 * system call number) being speculatively executed until the test
>> +	 * against NR_syscalls and branch to .Lsyscall_enosys above has
>> +	 * committed.
>> +	 */
>> +
>>  	ldx	r12,r11,r0	/* Fetch system call handler [ptr] */
>>  	mtctr   r12
>>  	bctrl			/* Call handler */
>> @@ -440,6 +456,57 @@ _GLOBAL(ret_from_kernel_thread)
>>  	li	r3,0
>>  	b	.Lsyscall_exit
>>  
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +
>> +#define FLUSH_COUNT_CACHE	\
>> +1:	nop;			\
>> +	patch_site 1b, patch__call_flush_count_cache
>> +
>> +
>> +#define BCCTR_FLUSH	.long 0x4c400420
>> +
>> +.macro nops number
>> +	.rept \number
>> +	nop
>> +	.endr
>> +.endm
>> +
>> +.balign 32
>> +.global flush_count_cache
>> +flush_count_cache:
>> +	/* Save LR into r9 */
>> +	mflr	r9
>> +
>> +	.rept 64
>> +	bl	.+4
>> +	.endr
>> +	b	1f
>> +	nops	6
>> +
>> +	.balign 32
>> +	/* Restore LR */
>> +1:	mtlr	r9
>> +	li	r9,0x7fff
>> +	mtctr	r9
>> +
>> +	BCCTR_FLUSH
>> +
>> +2:	nop
>> +	patch_site 2b patch__flush_count_cache_return
>> +
>> +	nops	3
>> +
>> +	.rept 278
>> +	.balign 32
>> +	BCCTR_FLUSH
>> +	nops	7
>> +	.endr
>> +
>> +	blr
>> +#else
>> +#define FLUSH_COUNT_CACHE
>> +#endif /* CONFIG_PPC_BOOK3S_64 */
>> +
>>  /*
>>   * This routine switches between two different tasks.  The process
>>   * state of one is saved on its kernel stack.  Then the state
>> @@ -503,6 +570,8 @@ BEGIN_FTR_SECTION
>>  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>>  #endif
>>  
>> +	FLUSH_COUNT_CACHE
>> +
>>  #ifdef CONFIG_SMP
>>  	/* We need a sync somewhere here to make sure that if the
>>  	 * previous task gets rescheduled on another CPU, it sees all
>> diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
>> index 5cc93f0b52ca..48ec841ea1bf 100644
>> - --- a/arch/powerpc/kernel/exceptions-64e.S
>> +++ b/arch/powerpc/kernel/exceptions-64e.S
>> @@ -295,7 +295,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
>>  	andi.	r10,r11,MSR_PR;		/* save stack pointer */	    \
>>  	beq	1f;			/* branch around if supervisor */   \
>>  	ld	r1,PACAKSAVE(r13);	/* get kernel stack coming from usr */\
>> - -1:	cmpdi	cr1,r1,0;		/* check if SP makes sense */	    \
>> +1:	type##_BTB_FLUSH		\
>> +	cmpdi	cr1,r1,0;		/* check if SP makes sense */	    \
>>  	bge-	cr1,exc_##n##_bad_stack;/* bad stack (TODO: out of line) */ \
>>  	mfspr	r10,SPRN_##type##_SRR0;	/* read SRR0 before touching stack */
>>  
>> @@ -327,6 +328,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
>>  #define SPRN_MC_SRR0	SPRN_MCSRR0
>>  #define SPRN_MC_SRR1	SPRN_MCSRR1
>>  
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +#define GEN_BTB_FLUSH			\
>> +	START_BTB_FLUSH_SECTION		\
>> +		beq 1f;			\
>> +		BTB_FLUSH(r10)			\
>> +		1:		\
>> +	END_BTB_FLUSH_SECTION
>> +
>> +#define CRIT_BTB_FLUSH			\
>> +	START_BTB_FLUSH_SECTION		\
>> +		BTB_FLUSH(r10)		\
>> +	END_BTB_FLUSH_SECTION
>> +
>> +#define DBG_BTB_FLUSH CRIT_BTB_FLUSH
>> +#define MC_BTB_FLUSH CRIT_BTB_FLUSH
>> +#define GDBELL_BTB_FLUSH GEN_BTB_FLUSH
>> +#else
>> +#define GEN_BTB_FLUSH
>> +#define CRIT_BTB_FLUSH
>> +#define DBG_BTB_FLUSH
>> +#define MC_BTB_FLUSH
>> +#define GDBELL_BTB_FLUSH
>> +#endif
>> +
>>  #define NORMAL_EXCEPTION_PROLOG(n, intnum, addition)			    \
>>  	EXCEPTION_PROLOG(n, intnum, GEN, addition##_GEN(n))
>>  
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
>> index 938a30fef031..10e7cec9553d 100644
>> - --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -36,6 +36,7 @@ BEGIN_FTR_SECTION						\
>>  END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
>>  	mr	r9,r13 ;					\
>>  	GET_PACA(r13) ;						\
>> +	INTERRUPT_TO_KERNEL ;					\
>>  	mfspr	r11,SPRN_SRR0 ;					\
>>  0:
>>  
>> @@ -292,7 +293,9 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
>>  	. = 0x900
>>  	.globl decrementer_pSeries
>>  decrementer_pSeries:
>> - -	_MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR)
>> +	SET_SCRATCH0(r13)
>> +	EXCEPTION_PROLOG_0(PACA_EXGEN)
>> +	b	decrementer_ool
>>  
>>  	STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
>>  
>> @@ -319,6 +322,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
>>  	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
>>  	HMT_MEDIUM;
>>  	std	r10,PACA_EXGEN+EX_R10(r13)
>> +	INTERRUPT_TO_KERNEL
>>  	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
>>  	mfcr	r9
>>  	KVMTEST(0xc00)
>> @@ -607,6 +611,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
>>  
>>  	.align	7
>>  	/* moved from 0xe00 */
>> +	MASKABLE_EXCEPTION_OOL(0x900, decrementer)
>>  	STD_EXCEPTION_HV_OOL(0xe02, h_data_storage)
>>  	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
>>  	STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage)
>> @@ -1564,6 +1569,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>>  	blr
>>  #endif
>>  
>> +	.balign 16
>> +	.globl stf_barrier_fallback
>> +stf_barrier_fallback:
>> +	std	r9,PACA_EXRFI+EX_R9(r13)
>> +	std	r10,PACA_EXRFI+EX_R10(r13)
>> +	sync
>> +	ld	r9,PACA_EXRFI+EX_R9(r13)
>> +	ld	r10,PACA_EXRFI+EX_R10(r13)
>> +	ori	31,31,0
>> +	.rept 14
>> +	b	1f
>> +1:
>> +	.endr
>> +	blr
>> +
>>  	.globl rfi_flush_fallback
>>  rfi_flush_fallback:
>>  	SET_SCRATCH0(r13);
>> @@ -1571,39 +1591,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>>  	std	r9,PACA_EXRFI+EX_R9(r13)
>>  	std	r10,PACA_EXRFI+EX_R10(r13)
>>  	std	r11,PACA_EXRFI+EX_R11(r13)
>> - -	std	r12,PACA_EXRFI+EX_R12(r13)
>> - -	std	r8,PACA_EXRFI+EX_R13(r13)
>>  	mfctr	r9
>>  	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
>> - -	ld	r11,PACA_L1D_FLUSH_SETS(r13)
>> - -	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
>> - -	/*
>> - -	 * The load adresses are at staggered offsets within cachelines,
>> - -	 * which suits some pipelines better (on others it should not
>> - -	 * hurt).
>> - -	 */
>> - -	addi	r12,r12,8
>> +	ld	r11,PACA_L1D_FLUSH_SIZE(r13)
>> +	srdi	r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
>>  	mtctr	r11
>>  	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
>>  
>>  	/* order ld/st prior to dcbt stop all streams with flushing */
>>  	sync
>> - -1:	li	r8,0
>> - -	.rept	8 /* 8-way set associative */
>> - -	ldx	r11,r10,r8
>> - -	add	r8,r8,r12
>> - -	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
>> - -	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
>> - -	.endr
>> - -	addi	r10,r10,128 /* 128 byte cache line */
>> +
>> +	/*
>> +	 * The load addresses are at staggered offsets within cachelines,
>> +	 * which suits some pipelines better (on others it should not
>> +	 * hurt).
>> +	 */
>> +1:
>> +	ld	r11,(0x80 + 8)*0(r10)
>> +	ld	r11,(0x80 + 8)*1(r10)
>> +	ld	r11,(0x80 + 8)*2(r10)
>> +	ld	r11,(0x80 + 8)*3(r10)
>> +	ld	r11,(0x80 + 8)*4(r10)
>> +	ld	r11,(0x80 + 8)*5(r10)
>> +	ld	r11,(0x80 + 8)*6(r10)
>> +	ld	r11,(0x80 + 8)*7(r10)
>> +	addi	r10,r10,0x80*8
>>  	bdnz	1b
>>  
>>  	mtctr	r9
>>  	ld	r9,PACA_EXRFI+EX_R9(r13)
>>  	ld	r10,PACA_EXRFI+EX_R10(r13)
>>  	ld	r11,PACA_EXRFI+EX_R11(r13)
>> - -	ld	r12,PACA_EXRFI+EX_R12(r13)
>> - -	ld	r8,PACA_EXRFI+EX_R13(r13)
>>  	GET_SCRATCH0(r13);
>>  	rfid
>>  
>> @@ -1614,39 +1632,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
>>  	std	r9,PACA_EXRFI+EX_R9(r13)
>>  	std	r10,PACA_EXRFI+EX_R10(r13)
>>  	std	r11,PACA_EXRFI+EX_R11(r13)
>> - -	std	r12,PACA_EXRFI+EX_R12(r13)
>> - -	std	r8,PACA_EXRFI+EX_R13(r13)
>>  	mfctr	r9
>>  	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
>> - -	ld	r11,PACA_L1D_FLUSH_SETS(r13)
>> - -	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
>> - -	/*
>> - -	 * The load adresses are at staggered offsets within cachelines,
>> - -	 * which suits some pipelines better (on others it should not
>> - -	 * hurt).
>> - -	 */
>> - -	addi	r12,r12,8
>> +	ld	r11,PACA_L1D_FLUSH_SIZE(r13)
>> +	srdi	r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */
>>  	mtctr	r11
>>  	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
>>  
>>  	/* order ld/st prior to dcbt stop all streams with flushing */
>>  	sync
>> - -1:	li	r8,0
>> - -	.rept	8 /* 8-way set associative */
>> - -	ldx	r11,r10,r8
>> - -	add	r8,r8,r12
>> - -	xor	r11,r11,r11	// Ensure r11 is 0 even if fallback area is not
>> - -	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
>> - -	.endr
>> - -	addi	r10,r10,128 /* 128 byte cache line */
>> +
>> +	/*
>> +	 * The load addresses are at staggered offsets within cachelines,
>> +	 * which suits some pipelines better (on others it should not
>> +	 * hurt).
>> +	 */
>> +1:
>> +	ld	r11,(0x80 + 8)*0(r10)
>> +	ld	r11,(0x80 + 8)*1(r10)
>> +	ld	r11,(0x80 + 8)*2(r10)
>> +	ld	r11,(0x80 + 8)*3(r10)
>> +	ld	r11,(0x80 + 8)*4(r10)
>> +	ld	r11,(0x80 + 8)*5(r10)
>> +	ld	r11,(0x80 + 8)*6(r10)
>> +	ld	r11,(0x80 + 8)*7(r10)
>> +	addi	r10,r10,0x80*8
>>  	bdnz	1b
>>  
>>  	mtctr	r9
>>  	ld	r9,PACA_EXRFI+EX_R9(r13)
>>  	ld	r10,PACA_EXRFI+EX_R10(r13)
>>  	ld	r11,PACA_EXRFI+EX_R11(r13)
>> - -	ld	r12,PACA_EXRFI+EX_R12(r13)
>> - -	ld	r8,PACA_EXRFI+EX_R13(r13)
>>  	GET_SCRATCH0(r13);
>>  	hrfid
>>  
>> diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
>> index 9547381b631a..ff009be97a42 100644
>> - --- a/arch/powerpc/kernel/module.c
>> +++ b/arch/powerpc/kernel/module.c
>> @@ -67,7 +67,15 @@ int module_finalize(const Elf_Ehdr *hdr,
>>  		do_feature_fixups(powerpc_firmware_features,
>>  				  (void *)sect->sh_addr,
>>  				  (void *)sect->sh_addr + sect->sh_size);
>> - -#endif
>> +#endif /* CONFIG_PPC64 */
>> +
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +	sect = find_section(hdr, sechdrs, "__spec_barrier_fixup");
>> +	if (sect != NULL)
>> +		do_barrier_nospec_fixups_range(barrier_nospec_enabled,
>> +				  (void *)sect->sh_addr,
>> +				  (void *)sect->sh_addr + sect->sh_size);
>> +#endif /* CONFIG_PPC_BARRIER_NOSPEC */
>>  
>>  	sect = find_section(hdr, sechdrs, "__lwsync_fixup");
>>  	if (sect != NULL)
>> diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
>> new file mode 100644
>> index 000000000000..58f0602a92b9
>> - --- /dev/null
>> +++ b/arch/powerpc/kernel/security.c
>> @@ -0,0 +1,433 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +//
>> +// Security related flags and so on.
>> +//
>> +// Copyright 2018, Michael Ellerman, IBM Corporation.
>> +
>> +#include <linux/kernel.h>
>> +#include <linux/debugfs.h>
>> +#include <linux/device.h>
>> +#include <linux/seq_buf.h>
>> +
>> +#include <asm/debug.h>
>> +#include <asm/asm-prototypes.h>
>> +#include <asm/code-patching.h>
>> +#include <asm/security_features.h>
>> +#include <asm/setup.h>
>> +
>> +
>> +unsigned long powerpc_security_features __read_mostly = SEC_FTR_DEFAULT;
>> +
>> +enum count_cache_flush_type {
>> +	COUNT_CACHE_FLUSH_NONE	= 0x1,
>> +	COUNT_CACHE_FLUSH_SW	= 0x2,
>> +	COUNT_CACHE_FLUSH_HW	= 0x4,
>> +};
>> +static enum count_cache_flush_type count_cache_flush_type = COUNT_CACHE_FLUSH_NONE;
>> +
>> +bool barrier_nospec_enabled;
>> +static bool no_nospec;
>> +static bool btb_flush_enabled;
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +static bool no_spectrev2;
>> +#endif
>> +
>> +static void enable_barrier_nospec(bool enable)
>> +{
>> +	barrier_nospec_enabled = enable;
>> +	do_barrier_nospec_fixups(enable);
>> +}
>> +
>> +void setup_barrier_nospec(void)
>> +{
>> +	bool enable;
>> +
>> +	/*
>> +	 * It would make sense to check SEC_FTR_SPEC_BAR_ORI31 below as well.
>> +	 * But there's a good reason not to. The two flags we check below are
>> +	 * both enabled by default in the kernel, so if the hcall is not
>> +	 * functional they will be enabled.
>> +	 * On a system where the host firmware has been updated (so the ori
>> +	 * functions as a barrier), but on which the hypervisor (KVM/Qemu) has
>> +	 * not been updated, we would like to enable the barrier. Dropping the
>> +	 * check for SEC_FTR_SPEC_BAR_ORI31 achieves that. The only downside is
>> +	 * we potentially enable the barrier on systems where the host firmware
>> +	 * is not updated, but that's harmless as it's a no-op.
>> +	 */
>> +	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
>> +		 security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR);
>> +
>> +	if (!no_nospec)
>> +		enable_barrier_nospec(enable);
>> +}
>> +
>> +static int __init handle_nospectre_v1(char *p)
>> +{
>> +	no_nospec = true;
>> +
>> +	return 0;
>> +}
>> +early_param("nospectre_v1", handle_nospectre_v1);
>> +
>> +#ifdef CONFIG_DEBUG_FS
>> +static int barrier_nospec_set(void *data, u64 val)
>> +{
>> +	switch (val) {
>> +	case 0:
>> +	case 1:
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (!!val == !!barrier_nospec_enabled)
>> +		return 0;
>> +
>> +	enable_barrier_nospec(!!val);
>> +
>> +	return 0;
>> +}
>> +
>> +static int barrier_nospec_get(void *data, u64 *val)
>> +{
>> +	*val = barrier_nospec_enabled ? 1 : 0;
>> +	return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(fops_barrier_nospec,
>> +			barrier_nospec_get, barrier_nospec_set, "%llu\n");
>> +
>> +static __init int barrier_nospec_debugfs_init(void)
>> +{
>> +	debugfs_create_file("barrier_nospec", 0600, powerpc_debugfs_root, NULL,
>> +			    &fops_barrier_nospec);
>> +	return 0;
>> +}
>> +device_initcall(barrier_nospec_debugfs_init);
>> +#endif /* CONFIG_DEBUG_FS */
>> +
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +static int __init handle_nospectre_v2(char *p)
>> +{
>> +	no_spectrev2 = true;
>> +
>> +	return 0;
>> +}
>> +early_param("nospectre_v2", handle_nospectre_v2);
>> +void setup_spectre_v2(void)
>> +{
>> +	if (no_spectrev2)
>> +		do_btb_flush_fixups();
>> +	else
>> +		btb_flush_enabled = true;
>> +}
>> +#endif /* CONFIG_PPC_FSL_BOOK3E */
>> +
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> +	bool thread_priv;
>> +
>> +	thread_priv = security_ftr_enabled(SEC_FTR_L1D_THREAD_PRIV);
>> +
>> +	if (rfi_flush || thread_priv) {
>> +		struct seq_buf s;
>> +		seq_buf_init(&s, buf, PAGE_SIZE - 1);
>> +
>> +		seq_buf_printf(&s, "Mitigation: ");
>> +
>> +		if (rfi_flush)
>> +			seq_buf_printf(&s, "RFI Flush");
>> +
>> +		if (rfi_flush && thread_priv)
>> +			seq_buf_printf(&s, ", ");
>> +
>> +		if (thread_priv)
>> +			seq_buf_printf(&s, "L1D private per thread");
>> +
>> +		seq_buf_printf(&s, "\n");
>> +
>> +		return s.len;
>> +	}
>> +
>> +	if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) &&
>> +	    !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR))
>> +		return sprintf(buf, "Not affected\n");
>> +
>> +	return sprintf(buf, "Vulnerable\n");
>> +}
>> +#endif
>> +
>> +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> +	struct seq_buf s;
>> +
>> +	seq_buf_init(&s, buf, PAGE_SIZE - 1);
>> +
>> +	if (security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR)) {
>> +		if (barrier_nospec_enabled)
>> +			seq_buf_printf(&s, "Mitigation: __user pointer sanitization");
>> +		else
>> +			seq_buf_printf(&s, "Vulnerable");
>> +
>> +		if (security_ftr_enabled(SEC_FTR_SPEC_BAR_ORI31))
>> +			seq_buf_printf(&s, ", ori31 speculation barrier enabled");
>> +
>> +		seq_buf_printf(&s, "\n");
>> +	} else
>> +		seq_buf_printf(&s, "Not affected\n");
>> +
>> +	return s.len;
>> +}
>> +
>> +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> +	struct seq_buf s;
>> +	bool bcs, ccd;
>> +
>> +	seq_buf_init(&s, buf, PAGE_SIZE - 1);
>> +
>> +	bcs = security_ftr_enabled(SEC_FTR_BCCTRL_SERIALISED);
>> +	ccd = security_ftr_enabled(SEC_FTR_COUNT_CACHE_DISABLED);
>> +
>> +	if (bcs || ccd) {
>> +		seq_buf_printf(&s, "Mitigation: ");
>> +
>> +		if (bcs)
>> +			seq_buf_printf(&s, "Indirect branch serialisation (kernel only)");
>> +
>> +		if (bcs && ccd)
>> +			seq_buf_printf(&s, ", ");
>> +
>> +		if (ccd)
>> +			seq_buf_printf(&s, "Indirect branch cache disabled");
>> +	} else if (count_cache_flush_type != COUNT_CACHE_FLUSH_NONE) {
>> +		seq_buf_printf(&s, "Mitigation: Software count cache flush");
>> +
>> +		if (count_cache_flush_type == COUNT_CACHE_FLUSH_HW)
>> +			seq_buf_printf(&s, " (hardware accelerated)");
>> +	} else if (btb_flush_enabled) {
>> +		seq_buf_printf(&s, "Mitigation: Branch predictor state flush");
>> +	} else {
>> +		seq_buf_printf(&s, "Vulnerable");
>> +	}
>> +
>> +	seq_buf_printf(&s, "\n");
>> +
>> +	return s.len;
>> +}
>> +
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +/*
>> + * Store-forwarding barrier support.
>> + */
>> +
>> +static enum stf_barrier_type stf_enabled_flush_types;
>> +static bool no_stf_barrier;
>> +bool stf_barrier;
>> +
>> +static int __init handle_no_stf_barrier(char *p)
>> +{
>> +	pr_info("stf-barrier: disabled on command line.");
>> +	no_stf_barrier = true;
>> +	return 0;
>> +}
>> +
>> +early_param("no_stf_barrier", handle_no_stf_barrier);
>> +
>> +/* This is the generic flag used by other architectures */
>> +static int __init handle_ssbd(char *p)
>> +{
>> +	if (!p || strncmp(p, "auto", 5) == 0 || strncmp(p, "on", 2) == 0 ) {
>> +		/* Until firmware tells us, we have the barrier with auto */
>> +		return 0;
>> +	} else if (strncmp(p, "off", 3) == 0) {
>> +		handle_no_stf_barrier(NULL);
>> +		return 0;
>> +	} else
>> +		return 1;
>> +
>> +	return 0;
>> +}
>> +early_param("spec_store_bypass_disable", handle_ssbd);
>> +
>> +/* This is the generic flag used by other architectures */
>> +static int __init handle_no_ssbd(char *p)
>> +{
>> +	handle_no_stf_barrier(NULL);
>> +	return 0;
>> +}
>> +early_param("nospec_store_bypass_disable", handle_no_ssbd);
>> +
>> +static void stf_barrier_enable(bool enable)
>> +{
>> +	if (enable)
>> +		do_stf_barrier_fixups(stf_enabled_flush_types);
>> +	else
>> +		do_stf_barrier_fixups(STF_BARRIER_NONE);
>> +
>> +	stf_barrier = enable;
>> +}
>> +
>> +void setup_stf_barrier(void)
>> +{
>> +	enum stf_barrier_type type;
>> +	bool enable, hv;
>> +
>> +	hv = cpu_has_feature(CPU_FTR_HVMODE);
>> +
>> +	/* Default to fallback in case fw-features are not available */
>> +	if (cpu_has_feature(CPU_FTR_ARCH_207S))
>> +		type = STF_BARRIER_SYNC_ORI;
>> +	else if (cpu_has_feature(CPU_FTR_ARCH_206))
>> +		type = STF_BARRIER_FALLBACK;
>> +	else
>> +		type = STF_BARRIER_NONE;
>> +
>> +	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
>> +		(security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) ||
>> +		 (security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) && hv));
>> +
>> +	if (type == STF_BARRIER_FALLBACK) {
>> +		pr_info("stf-barrier: fallback barrier available\n");
>> +	} else if (type == STF_BARRIER_SYNC_ORI) {
>> +		pr_info("stf-barrier: hwsync barrier available\n");
>> +	} else if (type == STF_BARRIER_EIEIO) {
>> +		pr_info("stf-barrier: eieio barrier available\n");
>> +	}
>> +
>> +	stf_enabled_flush_types = type;
>> +
>> +	if (!no_stf_barrier)
>> +		stf_barrier_enable(enable);
>> +}
>> +
>> +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf)
>> +{
>> +	if (stf_barrier && stf_enabled_flush_types != STF_BARRIER_NONE) {
>> +		const char *type;
>> +		switch (stf_enabled_flush_types) {
>> +		case STF_BARRIER_EIEIO:
>> +			type = "eieio";
>> +			break;
>> +		case STF_BARRIER_SYNC_ORI:
>> +			type = "hwsync";
>> +			break;
>> +		case STF_BARRIER_FALLBACK:
>> +			type = "fallback";
>> +			break;
>> +		default:
>> +			type = "unknown";
>> +		}
>> +		return sprintf(buf, "Mitigation: Kernel entry/exit barrier (%s)\n", type);
>> +	}
>> +
>> +	if (!security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV) &&
>> +	    !security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR))
>> +		return sprintf(buf, "Not affected\n");
>> +
>> +	return sprintf(buf, "Vulnerable\n");
>> +}
>> +
>> +#ifdef CONFIG_DEBUG_FS
>> +static int stf_barrier_set(void *data, u64 val)
>> +{
>> +	bool enable;
>> +
>> +	if (val == 1)
>> +		enable = true;
>> +	else if (val == 0)
>> +		enable = false;
>> +	else
>> +		return -EINVAL;
>> +
>> +	/* Only do anything if we're changing state */
>> +	if (enable != stf_barrier)
>> +		stf_barrier_enable(enable);
>> +
>> +	return 0;
>> +}
>> +
>> +static int stf_barrier_get(void *data, u64 *val)
>> +{
>> +	*val = stf_barrier ? 1 : 0;
>> +	return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, "%llu\n");
>> +
>> +static __init int stf_barrier_debugfs_init(void)
>> +{
>> +	debugfs_create_file("stf_barrier", 0600, powerpc_debugfs_root, NULL, &fops_stf_barrier);
>> +	return 0;
>> +}
>> +device_initcall(stf_barrier_debugfs_init);
>> +#endif /* CONFIG_DEBUG_FS */
>> +
>> +static void toggle_count_cache_flush(bool enable)
>> +{
>> +	if (!enable || !security_ftr_enabled(SEC_FTR_FLUSH_COUNT_CACHE)) {
>> +		patch_instruction_site(&patch__call_flush_count_cache, PPC_INST_NOP);
>> +		count_cache_flush_type = COUNT_CACHE_FLUSH_NONE;
>> +		pr_info("count-cache-flush: software flush disabled.\n");
>> +		return;
>> +	}
>> +
>> +	patch_branch_site(&patch__call_flush_count_cache,
>> +			  (u64)&flush_count_cache, BRANCH_SET_LINK);
>> +
>> +	if (!security_ftr_enabled(SEC_FTR_BCCTR_FLUSH_ASSIST)) {
>> +		count_cache_flush_type = COUNT_CACHE_FLUSH_SW;
>> +		pr_info("count-cache-flush: full software flush sequence enabled.\n");
>> +		return;
>> +	}
>> +
>> +	patch_instruction_site(&patch__flush_count_cache_return, PPC_INST_BLR);
>> +	count_cache_flush_type = COUNT_CACHE_FLUSH_HW;
>> +	pr_info("count-cache-flush: hardware assisted flush sequence enabled\n");
>> +}
>> +
>> +void setup_count_cache_flush(void)
>> +{
>> +	toggle_count_cache_flush(true);
>> +}
>> +
>> +#ifdef CONFIG_DEBUG_FS
>> +static int count_cache_flush_set(void *data, u64 val)
>> +{
>> +	bool enable;
>> +
>> +	if (val == 1)
>> +		enable = true;
>> +	else if (val == 0)
>> +		enable = false;
>> +	else
>> +		return -EINVAL;
>> +
>> +	toggle_count_cache_flush(enable);
>> +
>> +	return 0;
>> +}
>> +
>> +static int count_cache_flush_get(void *data, u64 *val)
>> +{
>> +	if (count_cache_flush_type == COUNT_CACHE_FLUSH_NONE)
>> +		*val = 0;
>> +	else
>> +		*val = 1;
>> +
>> +	return 0;
>> +}
>> +
>> +DEFINE_SIMPLE_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get,
>> +			count_cache_flush_set, "%llu\n");
>> +
>> +static __init int count_cache_flush_debugfs_init(void)
>> +{
>> +	debugfs_create_file("count_cache_flush", 0600, powerpc_debugfs_root,
>> +			    NULL, &fops_count_cache_flush);
>> +	return 0;
>> +}
>> +device_initcall(count_cache_flush_debugfs_init);
>> +#endif /* CONFIG_DEBUG_FS */
>> +#endif /* CONFIG_PPC_BOOK3S_64 */
>> diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
>> index ad8c9db61237..5a9f035bcd6b 100644
>> - --- a/arch/powerpc/kernel/setup_32.c
>> +++ b/arch/powerpc/kernel/setup_32.c
>> @@ -322,6 +322,8 @@ void __init setup_arch(char **cmdline_p)
>>  		ppc_md.setup_arch();
>>  	if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab);
>>  
>> +	setup_barrier_nospec();
>> +
>>  	paging_init();
>>  
>>  	/* Initialize the MMU context management stuff */
>> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
>> index 9eb469bed22b..6bb731ababc6 100644
>> - --- a/arch/powerpc/kernel/setup_64.c
>> +++ b/arch/powerpc/kernel/setup_64.c
>> @@ -736,6 +736,8 @@ void __init setup_arch(char **cmdline_p)
>>  	if (ppc_md.setup_arch)
>>  		ppc_md.setup_arch();
>>  
>> +	setup_barrier_nospec();
>> +
>>  	paging_init();
>>  
>>  	/* Initialize the MMU context management stuff */
>> @@ -873,9 +875,6 @@ static void do_nothing(void *unused)
>>  
>>  void rfi_flush_enable(bool enable)
>>  {
>> - -	if (rfi_flush == enable)
>> - -		return;
>> - -
>>  	if (enable) {
>>  		do_rfi_flush_fixups(enabled_flush_types);
>>  		on_each_cpu(do_nothing, NULL, 1);
>> @@ -885,11 +884,15 @@ void rfi_flush_enable(bool enable)
>>  	rfi_flush = enable;
>>  }
>>  
>> - -static void init_fallback_flush(void)
>> +static void __ref init_fallback_flush(void)
>>  {
>>  	u64 l1d_size, limit;
>>  	int cpu;
>>  
>> +	/* Only allocate the fallback flush area once (at boot time). */
>> +	if (l1d_flush_fallback_area)
>> +		return;
>> +
>>  	l1d_size = ppc64_caches.dsize;
>>  	limit = min(safe_stack_limit(), ppc64_rma_size);
>>  
>> @@ -902,34 +905,23 @@ static void init_fallback_flush(void)
>>  	memset(l1d_flush_fallback_area, 0, l1d_size * 2);
>>  
>>  	for_each_possible_cpu(cpu) {
>> - -		/*
>> - -		 * The fallback flush is currently coded for 8-way
>> - -		 * associativity. Different associativity is possible, but it
>> - -		 * will be treated as 8-way and may not evict the lines as
>> - -		 * effectively.
>> - -		 *
>> - -		 * 128 byte lines are mandatory.
>> - -		 */
>> - -		u64 c = l1d_size / 8;
>> - -
>>  		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
>> - -		paca[cpu].l1d_flush_congruence = c;
>> - -		paca[cpu].l1d_flush_sets = c / 128;
>> +		paca[cpu].l1d_flush_size = l1d_size;
>>  	}
>>  }
>>  
>> - -void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
>> +void setup_rfi_flush(enum l1d_flush_type types, bool enable)
>>  {
>>  	if (types & L1D_FLUSH_FALLBACK) {
>> - -		pr_info("rfi-flush: Using fallback displacement flush\n");
>> +		pr_info("rfi-flush: fallback displacement flush available\n");
>>  		init_fallback_flush();
>>  	}
>>  
>>  	if (types & L1D_FLUSH_ORI)
>> - -		pr_info("rfi-flush: Using ori type flush\n");
>> +		pr_info("rfi-flush: ori type flush available\n");
>>  
>>  	if (types & L1D_FLUSH_MTTRIG)
>> - -		pr_info("rfi-flush: Using mttrig type flush\n");
>> +		pr_info("rfi-flush: mttrig type flush available\n");
>>  
>>  	enabled_flush_types = types;
>>  
>> @@ -940,13 +932,19 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
>>  #ifdef CONFIG_DEBUG_FS
>>  static int rfi_flush_set(void *data, u64 val)
>>  {
>> +	bool enable;
>> +
>>  	if (val == 1)
>> - -		rfi_flush_enable(true);
>> +		enable = true;
>>  	else if (val == 0)
>> - -		rfi_flush_enable(false);
>> +		enable = false;
>>  	else
>>  		return -EINVAL;
>>  
>> +	/* Only do anything if we're changing state */
>> +	if (enable != rfi_flush)
>> +		rfi_flush_enable(enable);
>> +
>>  	return 0;
>>  }
>>  
>> @@ -965,12 +963,4 @@ static __init int rfi_flush_debugfs_init(void)
>>  }
>>  device_initcall(rfi_flush_debugfs_init);
>>  #endif
>> - -
>> - -ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
>> - -{
>> - -	if (rfi_flush)
>> - -		return sprintf(buf, "Mitigation: RFI Flush\n");
>> - -
>> - -	return sprintf(buf, "Vulnerable\n");
>> - -}
>>  #endif /* CONFIG_PPC_BOOK3S_64 */
>> diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
>> index 072a23a17350..876ac9d52afc 100644
>> - --- a/arch/powerpc/kernel/vmlinux.lds.S
>> +++ b/arch/powerpc/kernel/vmlinux.lds.S
>> @@ -73,14 +73,45 @@ SECTIONS
>>  	RODATA
>>  
>>  #ifdef CONFIG_PPC64
>> +	. = ALIGN(8);
>> +	__stf_entry_barrier_fixup : AT(ADDR(__stf_entry_barrier_fixup) - LOAD_OFFSET) {
>> +		__start___stf_entry_barrier_fixup = .;
>> +		*(__stf_entry_barrier_fixup)
>> +		__stop___stf_entry_barrier_fixup = .;
>> +	}
>> +
>> +	. = ALIGN(8);
>> +	__stf_exit_barrier_fixup : AT(ADDR(__stf_exit_barrier_fixup) - LOAD_OFFSET) {
>> +		__start___stf_exit_barrier_fixup = .;
>> +		*(__stf_exit_barrier_fixup)
>> +		__stop___stf_exit_barrier_fixup = .;
>> +	}
>> +
>>  	. = ALIGN(8);
>>  	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
>>  		__start___rfi_flush_fixup = .;
>>  		*(__rfi_flush_fixup)
>>  		__stop___rfi_flush_fixup = .;
>>  	}
>> - -#endif
>> +#endif /* CONFIG_PPC64 */
>>  
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +	. = ALIGN(8);
>> +	__spec_barrier_fixup : AT(ADDR(__spec_barrier_fixup) - LOAD_OFFSET) {
>> +		__start___barrier_nospec_fixup = .;
>> +		*(__barrier_nospec_fixup)
>> +		__stop___barrier_nospec_fixup = .;
>> +	}
>> +#endif /* CONFIG_PPC_BARRIER_NOSPEC */
>> +
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +	. = ALIGN(8);
>> +	__spec_btb_flush_fixup : AT(ADDR(__spec_btb_flush_fixup) - LOAD_OFFSET) {
>> +		__start__btb_flush_fixup = .;
>> +		*(__btb_flush_fixup)
>> +		__stop__btb_flush_fixup = .;
>> +	}
>> +#endif
>>  	EXCEPTION_TABLE(0)
>>  
>>  	NOTES :kernel :notes
>> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
>> index d5edbeb8eb82..570c06a00db6 100644
>> - --- a/arch/powerpc/lib/code-patching.c
>> +++ b/arch/powerpc/lib/code-patching.c
>> @@ -14,12 +14,25 @@
>>  #include <asm/page.h>
>>  #include <asm/code-patching.h>
>>  #include <asm/uaccess.h>
>> +#include <asm/setup.h>
>> +#include <asm/sections.h>
>>  
>>  
>> +static inline bool is_init(unsigned int *addr)
>> +{
>> +	return addr >= (unsigned int *)__init_begin && addr < (unsigned int *)__init_end;
>> +}
>> +
>>  int patch_instruction(unsigned int *addr, unsigned int instr)
>>  {
>>  	int err;
>>  
>> +	/* Make sure we aren't patching a freed init section */
>> +	if (init_mem_is_free && is_init(addr)) {
>> +		pr_debug("Skipping init section patching addr: 0x%px\n", addr);
>> +		return 0;
>> +	}
>> +
>>  	__put_user_size(instr, addr, 4, err);
>>  	if (err)
>>  		return err;
>> @@ -32,6 +45,22 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags)
>>  	return patch_instruction(addr, create_branch(addr, target, flags));
>>  }
>>  
>> +int patch_branch_site(s32 *site, unsigned long target, int flags)
>> +{
>> +	unsigned int *addr;
>> +
>> +	addr = (unsigned int *)((unsigned long)site + *site);
>> +	return patch_instruction(addr, create_branch(addr, target, flags));
>> +}
>> +
>> +int patch_instruction_site(s32 *site, unsigned int instr)
>> +{
>> +	unsigned int *addr;
>> +
>> +	addr = (unsigned int *)((unsigned long)site + *site);
>> +	return patch_instruction(addr, instr);
>> +}
>> +
>>  unsigned int create_branch(const unsigned int *addr,
>>  			   unsigned long target, int flags)
>>  {
>> diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
>> index 3af014684872..7bdfc19a491d 100644
>> - --- a/arch/powerpc/lib/feature-fixups.c
>> +++ b/arch/powerpc/lib/feature-fixups.c
>> @@ -21,7 +21,7 @@
>>  #include <asm/page.h>
>>  #include <asm/sections.h>
>>  #include <asm/setup.h>
>> - -
>> +#include <asm/security_features.h>
>>  
>>  struct fixup_entry {
>>  	unsigned long	mask;
>> @@ -115,6 +115,120 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
>>  }
>>  
>>  #ifdef CONFIG_PPC_BOOK3S_64
>> +void do_stf_entry_barrier_fixups(enum stf_barrier_type types)
>> +{
>> +	unsigned int instrs[3], *dest;
>> +	long *start, *end;
>> +	int i;
>> +
>> +	start = PTRRELOC(&__start___stf_entry_barrier_fixup),
>> +	end = PTRRELOC(&__stop___stf_entry_barrier_fixup);
>> +
>> +	instrs[0] = 0x60000000; /* nop */
>> +	instrs[1] = 0x60000000; /* nop */
>> +	instrs[2] = 0x60000000; /* nop */
>> +
>> +	i = 0;
>> +	if (types & STF_BARRIER_FALLBACK) {
>> +		instrs[i++] = 0x7d4802a6; /* mflr r10		*/
>> +		instrs[i++] = 0x60000000; /* branch patched below */
>> +		instrs[i++] = 0x7d4803a6; /* mtlr r10		*/
>> +	} else if (types & STF_BARRIER_EIEIO) {
>> +		instrs[i++] = 0x7e0006ac; /* eieio + bit 6 hint */
>> +	} else if (types & STF_BARRIER_SYNC_ORI) {
>> +		instrs[i++] = 0x7c0004ac; /* hwsync		*/
>> +		instrs[i++] = 0xe94d0000; /* ld r10,0(r13)	*/
>> +		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
>> +	}
>> +
>> +	for (i = 0; start < end; start++, i++) {
>> +		dest = (void *)start + *start;
>> +
>> +		pr_devel("patching dest %lx\n", (unsigned long)dest);
>> +
>> +		patch_instruction(dest, instrs[0]);
>> +
>> +		if (types & STF_BARRIER_FALLBACK)
>> +			patch_branch(dest + 1, (unsigned long)&stf_barrier_fallback,
>> +				     BRANCH_SET_LINK);
>> +		else
>> +			patch_instruction(dest + 1, instrs[1]);
>> +
>> +		patch_instruction(dest + 2, instrs[2]);
>> +	}
>> +
>> +	printk(KERN_DEBUG "stf-barrier: patched %d entry locations (%s barrier)\n", i,
>> +		(types == STF_BARRIER_NONE)                  ? "no" :
>> +		(types == STF_BARRIER_FALLBACK)              ? "fallback" :
>> +		(types == STF_BARRIER_EIEIO)                 ? "eieio" :
>> +		(types == (STF_BARRIER_SYNC_ORI))            ? "hwsync"
>> +		                                           : "unknown");
>> +}
>> +
>> +void do_stf_exit_barrier_fixups(enum stf_barrier_type types)
>> +{
>> +	unsigned int instrs[6], *dest;
>> +	long *start, *end;
>> +	int i;
>> +
>> +	start = PTRRELOC(&__start___stf_exit_barrier_fixup),
>> +	end = PTRRELOC(&__stop___stf_exit_barrier_fixup);
>> +
>> +	instrs[0] = 0x60000000; /* nop */
>> +	instrs[1] = 0x60000000; /* nop */
>> +	instrs[2] = 0x60000000; /* nop */
>> +	instrs[3] = 0x60000000; /* nop */
>> +	instrs[4] = 0x60000000; /* nop */
>> +	instrs[5] = 0x60000000; /* nop */
>> +
>> +	i = 0;
>> +	if (types & STF_BARRIER_FALLBACK || types & STF_BARRIER_SYNC_ORI) {
>> +		if (cpu_has_feature(CPU_FTR_HVMODE)) {
>> +			instrs[i++] = 0x7db14ba6; /* mtspr 0x131, r13 (HSPRG1) */
>> +			instrs[i++] = 0x7db04aa6; /* mfspr r13, 0x130 (HSPRG0) */
>> +		} else {
>> +			instrs[i++] = 0x7db243a6; /* mtsprg 2,r13	*/
>> +			instrs[i++] = 0x7db142a6; /* mfsprg r13,1    */
>> +	        }
>> +		instrs[i++] = 0x7c0004ac; /* hwsync		*/
>> +		instrs[i++] = 0xe9ad0000; /* ld r13,0(r13)	*/
>> +		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
>> +		if (cpu_has_feature(CPU_FTR_HVMODE)) {
>> +			instrs[i++] = 0x7db14aa6; /* mfspr r13, 0x131 (HSPRG1) */
>> +		} else {
>> +			instrs[i++] = 0x7db242a6; /* mfsprg r13,2 */
>> +		}
>> +	} else if (types & STF_BARRIER_EIEIO) {
>> +		instrs[i++] = 0x7e0006ac; /* eieio + bit 6 hint */
>> +	}
>> +
>> +	for (i = 0; start < end; start++, i++) {
>> +		dest = (void *)start + *start;
>> +
>> +		pr_devel("patching dest %lx\n", (unsigned long)dest);
>> +
>> +		patch_instruction(dest, instrs[0]);
>> +		patch_instruction(dest + 1, instrs[1]);
>> +		patch_instruction(dest + 2, instrs[2]);
>> +		patch_instruction(dest + 3, instrs[3]);
>> +		patch_instruction(dest + 4, instrs[4]);
>> +		patch_instruction(dest + 5, instrs[5]);
>> +	}
>> +	printk(KERN_DEBUG "stf-barrier: patched %d exit locations (%s barrier)\n", i,
>> +		(types == STF_BARRIER_NONE)                  ? "no" :
>> +		(types == STF_BARRIER_FALLBACK)              ? "fallback" :
>> +		(types == STF_BARRIER_EIEIO)                 ? "eieio" :
>> +		(types == (STF_BARRIER_SYNC_ORI))            ? "hwsync"
>> +		                                           : "unknown");
>> +}
>> +
>> +
>> +void do_stf_barrier_fixups(enum stf_barrier_type types)
>> +{
>> +	do_stf_entry_barrier_fixups(types);
>> +	do_stf_exit_barrier_fixups(types);
>> +}
>> +
>>  void do_rfi_flush_fixups(enum l1d_flush_type types)
>>  {
>>  	unsigned int instrs[3], *dest;
>> @@ -151,10 +265,110 @@ void do_rfi_flush_fixups(enum l1d_flush_type types)
>>  		patch_instruction(dest + 2, instrs[2]);
>>  	}
>>  
>> - -	printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i);
>> +	printk(KERN_DEBUG "rfi-flush: patched %d locations (%s flush)\n", i,
>> +		(types == L1D_FLUSH_NONE)       ? "no" :
>> +		(types == L1D_FLUSH_FALLBACK)   ? "fallback displacement" :
>> +		(types &  L1D_FLUSH_ORI)        ? (types & L1D_FLUSH_MTTRIG)
>> +							? "ori+mttrig type"
>> +							: "ori type" :
>> +		(types &  L1D_FLUSH_MTTRIG)     ? "mttrig type"
>> +						: "unknown");
>> +}
>> +
>> +void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
>> +{
>> +	unsigned int instr, *dest;
>> +	long *start, *end;
>> +	int i;
>> +
>> +	start = fixup_start;
>> +	end = fixup_end;
>> +
>> +	instr = 0x60000000; /* nop */
>> +
>> +	if (enable) {
>> +		pr_info("barrier-nospec: using ORI speculation barrier\n");
>> +		instr = 0x63ff0000; /* ori 31,31,0 speculation barrier */
>> +	}
>> +
>> +	for (i = 0; start < end; start++, i++) {
>> +		dest = (void *)start + *start;
>> +
>> +		pr_devel("patching dest %lx\n", (unsigned long)dest);
>> +		patch_instruction(dest, instr);
>> +	}
>> +
>> +	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
>>  }
>> +
>>  #endif /* CONFIG_PPC_BOOK3S_64 */
>>  
>> +#ifdef CONFIG_PPC_BARRIER_NOSPEC
>> +void do_barrier_nospec_fixups(bool enable)
>> +{
>> +	void *start, *end;
>> +
>> +	start = PTRRELOC(&__start___barrier_nospec_fixup),
>> +	end = PTRRELOC(&__stop___barrier_nospec_fixup);
>> +
>> +	do_barrier_nospec_fixups_range(enable, start, end);
>> +}
>> +#endif /* CONFIG_PPC_BARRIER_NOSPEC */
>> +
>> +#ifdef CONFIG_PPC_FSL_BOOK3E
>> +void do_barrier_nospec_fixups_range(bool enable, void *fixup_start, void *fixup_end)
>> +{
>> +	unsigned int instr[2], *dest;
>> +	long *start, *end;
>> +	int i;
>> +
>> +	start = fixup_start;
>> +	end = fixup_end;
>> +
>> +	instr[0] = PPC_INST_NOP;
>> +	instr[1] = PPC_INST_NOP;
>> +
>> +	if (enable) {
>> +		pr_info("barrier-nospec: using isync; sync as speculation barrier\n");
>> +		instr[0] = PPC_INST_ISYNC;
>> +		instr[1] = PPC_INST_SYNC;
>> +	}
>> +
>> +	for (i = 0; start < end; start++, i++) {
>> +		dest = (void *)start + *start;
>> +
>> +		pr_devel("patching dest %lx\n", (unsigned long)dest);
>> +		patch_instruction(dest, instr[0]);
>> +		patch_instruction(dest + 1, instr[1]);
>> +	}
>> +
>> +	printk(KERN_DEBUG "barrier-nospec: patched %d locations\n", i);
>> +}
>> +
>> +static void patch_btb_flush_section(long *curr)
>> +{
>> +	unsigned int *start, *end;
>> +
>> +	start = (void *)curr + *curr;
>> +	end = (void *)curr + *(curr + 1);
>> +	for (; start < end; start++) {
>> +		pr_devel("patching dest %lx\n", (unsigned long)start);
>> +		patch_instruction(start, PPC_INST_NOP);
>> +	}
>> +}
>> +
>> +void do_btb_flush_fixups(void)
>> +{
>> +	long *start, *end;
>> +
>> +	start = PTRRELOC(&__start__btb_flush_fixup);
>> +	end = PTRRELOC(&__stop__btb_flush_fixup);
>> +
>> +	for (; start < end; start += 2)
>> +		patch_btb_flush_section(start);
>> +}
>> +#endif /* CONFIG_PPC_FSL_BOOK3E */
>> +
>>  void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
>>  {
>>  	long *start, *end;
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 22d94c3e6fc4..1efe5ca5c3bc 100644
>> - --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -62,6 +62,7 @@
>>  #endif
>>  
>>  unsigned long long memory_limit;
>> +bool init_mem_is_free;
>>  
>>  #ifdef CONFIG_HIGHMEM
>>  pte_t *kmap_pte;
>> @@ -381,6 +382,7 @@ void __init mem_init(void)
>>  void free_initmem(void)
>>  {
>>  	ppc_md.progress = ppc_printk_progress;
>> +	init_mem_is_free = true;
>>  	free_initmem_default(POISON_FREE_INITMEM);
>>  }
>>  
>> diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
>> index 29d6987c37ba..5486d56da289 100644
>> - --- a/arch/powerpc/mm/tlb_low_64e.S
>> +++ b/arch/powerpc/mm/tlb_low_64e.S
>> @@ -69,6 +69,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
>>  	std	r15,EX_TLB_R15(r12)
>>  	std	r10,EX_TLB_CR(r12)
>>  #ifdef CONFIG_PPC_FSL_BOOK3E
>> +START_BTB_FLUSH_SECTION
>> +	mfspr r11, SPRN_SRR1
>> +	andi. r10,r11,MSR_PR
>> +	beq 1f
>> +	BTB_FLUSH(r10)
>> +1:
>> +END_BTB_FLUSH_SECTION
>>  	std	r7,EX_TLB_R7(r12)
>>  #endif
>>  	TLB_MISS_PROLOG_STATS
>> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
>> index c57afc619b20..e14b52c7ebd8 100644
>> - --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -37,53 +37,99 @@
>>  #include <asm/smp.h>
>>  #include <asm/tm.h>
>>  #include <asm/setup.h>
>> +#include <asm/security_features.h>
>>  
>>  #include "powernv.h"
>>  
>> +
>> +static bool fw_feature_is(const char *state, const char *name,
>> +			  struct device_node *fw_features)
>> +{
>> +	struct device_node *np;
>> +	bool rc = false;
>> +
>> +	np = of_get_child_by_name(fw_features, name);
>> +	if (np) {
>> +		rc = of_property_read_bool(np, state);
>> +		of_node_put(np);
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static void init_fw_feat_flags(struct device_node *np)
>> +{
>> +	if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np))
>> +		security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
>> +
>> +	if (fw_feature_is("enabled", "fw-bcctrl-serialized", np))
>> +		security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
>> +
>> +	if (fw_feature_is("enabled", "inst-l1d-flush-ori30,30,0", np))
>> +		security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
>> +
>> +	if (fw_feature_is("enabled", "inst-l1d-flush-trig2", np))
>> +		security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
>> +
>> +	if (fw_feature_is("enabled", "fw-l1d-thread-split", np))
>> +		security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
>> +
>> +	if (fw_feature_is("enabled", "fw-count-cache-disabled", np))
>> +		security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
>> +
>> +	if (fw_feature_is("enabled", "fw-count-cache-flush-bcctr2,0,0", np))
>> +		security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
>> +
>> +	if (fw_feature_is("enabled", "needs-count-cache-flush-on-context-switch", np))
>> +		security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
>> +
>> +	/*
>> +	 * The features below are enabled by default, so we instead look to see
>> +	 * if firmware has *disabled* them, and clear them if so.
>> +	 */
>> +	if (fw_feature_is("disabled", "speculation-policy-favor-security", np))
>> +		security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
>> +
>> +	if (fw_feature_is("disabled", "needs-l1d-flush-msr-pr-0-to-1", np))
>> +		security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
>> +
>> +	if (fw_feature_is("disabled", "needs-l1d-flush-msr-hv-1-to-0", np))
>> +		security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
>> +
>> +	if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np))
>> +		security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
>> +}
>> +
>>  static void pnv_setup_rfi_flush(void)
>>  {
>>  	struct device_node *np, *fw_features;
>>  	enum l1d_flush_type type;
>> - -	int enable;
>> +	bool enable;
>>  
>>  	/* Default to fallback in case fw-features are not available */
>>  	type = L1D_FLUSH_FALLBACK;
>> - -	enable = 1;
>>  
>>  	np = of_find_node_by_name(NULL, "ibm,opal");
>>  	fw_features = of_get_child_by_name(np, "fw-features");
>>  	of_node_put(np);
>>  
>>  	if (fw_features) {
>> - -		np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2");
>> - -		if (np && of_property_read_bool(np, "enabled"))
>> - -			type = L1D_FLUSH_MTTRIG;
>> +		init_fw_feat_flags(fw_features);
>> +		of_node_put(fw_features);
>>  
>> - -		of_node_put(np);
>> +		if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
>> +			type = L1D_FLUSH_MTTRIG;
>>  
>> - -		np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0");
>> - -		if (np && of_property_read_bool(np, "enabled"))
>> +		if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
>>  			type = L1D_FLUSH_ORI;
>> - -
>> - -		of_node_put(np);
>> - -
>> - -		/* Enable unless firmware says NOT to */
>> - -		enable = 2;
>> - -		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0");
>> - -		if (np && of_property_read_bool(np, "disabled"))
>> - -			enable--;
>> - -
>> - -		of_node_put(np);
>> - -
>> - -		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1");
>> - -		if (np && of_property_read_bool(np, "disabled"))
>> - -			enable--;
>> - -
>> - -		of_node_put(np);
>> - -		of_node_put(fw_features);
>>  	}
>>  
>> - -	setup_rfi_flush(type, enable > 0);
>> +	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
>> +		 (security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR)   || \
>> +		  security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV));
>> +
>> +	setup_rfi_flush(type, enable);
>> +	setup_count_cache_flush();
>>  }
>>  
>>  static void __init pnv_setup_arch(void)
>> @@ -91,6 +137,7 @@ static void __init pnv_setup_arch(void)
>>  	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
>>  
>>  	pnv_setup_rfi_flush();
>> +	setup_stf_barrier();
>>  
>>  	/* Initialize SMP */
>>  	pnv_smp_init();
>> diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
>> index 8dd0c8edefd6..c773396d0969 100644
>> - --- a/arch/powerpc/platforms/pseries/mobility.c
>> +++ b/arch/powerpc/platforms/pseries/mobility.c
>> @@ -314,6 +314,9 @@ void post_mobility_fixup(void)
>>  		printk(KERN_ERR "Post-mobility device tree update "
>>  			"failed: %d\n", rc);
>>  
>> +	/* Possibly switch to a new RFI flush type */
>> +	pseries_setup_rfi_flush();
>> +
>>  	return;
>>  }
>>  
>> diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
>> index 8411c27293e4..e7d80797384d 100644
>> - --- a/arch/powerpc/platforms/pseries/pseries.h
>> +++ b/arch/powerpc/platforms/pseries/pseries.h
>> @@ -81,4 +81,6 @@ extern struct pci_controller_ops pseries_pci_controller_ops;
>>  
>>  unsigned long pseries_memory_block_size(void);
>>  
>> +void pseries_setup_rfi_flush(void);
>> +
>>  #endif /* _PSERIES_PSERIES_H */
>> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
>> index dd2545fc9947..9cc976ff7fec 100644
>> - --- a/arch/powerpc/platforms/pseries/setup.c
>> +++ b/arch/powerpc/platforms/pseries/setup.c
>> @@ -67,6 +67,7 @@
>>  #include <asm/eeh.h>
>>  #include <asm/reg.h>
>>  #include <asm/plpar_wrappers.h>
>> +#include <asm/security_features.h>
>>  
>>  #include "pseries.h"
>>  
>> @@ -499,37 +500,87 @@ static void __init find_and_init_phbs(void)
>>  	of_pci_check_probe_only();
>>  }
>>  
>> - -static void pseries_setup_rfi_flush(void)
>> +static void init_cpu_char_feature_flags(struct h_cpu_char_result *result)
>> +{
>> +	/*
>> +	 * The features below are disabled by default, so we instead look to see
>> +	 * if firmware has *enabled* them, and set them if so.
>> +	 */
>> +	if (result->character & H_CPU_CHAR_SPEC_BAR_ORI31)
>> +		security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
>> +
>> +	if (result->character & H_CPU_CHAR_BCCTRL_SERIALISED)
>> +		security_ftr_set(SEC_FTR_BCCTRL_SERIALISED);
>> +
>> +	if (result->character & H_CPU_CHAR_L1D_FLUSH_ORI30)
>> +		security_ftr_set(SEC_FTR_L1D_FLUSH_ORI30);
>> +
>> +	if (result->character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
>> +		security_ftr_set(SEC_FTR_L1D_FLUSH_TRIG2);
>> +
>> +	if (result->character & H_CPU_CHAR_L1D_THREAD_PRIV)
>> +		security_ftr_set(SEC_FTR_L1D_THREAD_PRIV);
>> +
>> +	if (result->character & H_CPU_CHAR_COUNT_CACHE_DISABLED)
>> +		security_ftr_set(SEC_FTR_COUNT_CACHE_DISABLED);
>> +
>> +	if (result->character & H_CPU_CHAR_BCCTR_FLUSH_ASSIST)
>> +		security_ftr_set(SEC_FTR_BCCTR_FLUSH_ASSIST);
>> +
>> +	if (result->behaviour & H_CPU_BEHAV_FLUSH_COUNT_CACHE)
>> +		security_ftr_set(SEC_FTR_FLUSH_COUNT_CACHE);
>> +
>> +	/*
>> +	 * The features below are enabled by default, so we instead look to see
>> +	 * if firmware has *disabled* them, and clear them if so.
>> +	 */
>> +	if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY))
>> +		security_ftr_clear(SEC_FTR_FAVOUR_SECURITY);
>> +
>> +	if (!(result->behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
>> +		security_ftr_clear(SEC_FTR_L1D_FLUSH_PR);
>> +
>> +	if (!(result->behaviour & H_CPU_BEHAV_BNDS_CHK_SPEC_BAR))
>> +		security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
>> +}
>> +
>> +void pseries_setup_rfi_flush(void)
>>  {
>>  	struct h_cpu_char_result result;
>>  	enum l1d_flush_type types;
>>  	bool enable;
>>  	long rc;
>>  
>> - -	/* Enable by default */
>> - -	enable = true;
>> +	/*
>> +	 * Set features to the defaults assumed by init_cpu_char_feature_flags()
>> +	 * so it can set/clear again any features that might have changed after
>> +	 * migration, and in case the hypercall fails and it is not even called.
>> +	 */
>> +	powerpc_security_features = SEC_FTR_DEFAULT;
>>  
>>  	rc = plpar_get_cpu_characteristics(&result);
>> - -	if (rc == H_SUCCESS) {
>> - -		types = L1D_FLUSH_NONE;
>> +	if (rc == H_SUCCESS)
>> +		init_cpu_char_feature_flags(&result);
>>  
>> - -		if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
>> - -			types |= L1D_FLUSH_MTTRIG;
>> - -		if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30)
>> - -			types |= L1D_FLUSH_ORI;
>> +	/*
>> +	 * We're the guest so this doesn't apply to us, clear it to simplify
>> +	 * handling of it elsewhere.
>> +	 */
>> +	security_ftr_clear(SEC_FTR_L1D_FLUSH_HV);
>>  
>> - -		/* Use fallback if nothing set in hcall */
>> - -		if (types == L1D_FLUSH_NONE)
>> - -			types = L1D_FLUSH_FALLBACK;
>> +	types = L1D_FLUSH_FALLBACK;
>>  
>> - -		if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
>> - -			enable = false;
>> - -	} else {
>> - -		/* Default to fallback if case hcall is not available */
>> - -		types = L1D_FLUSH_FALLBACK;
>> - -	}
>> +	if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_TRIG2))
>> +		types |= L1D_FLUSH_MTTRIG;
>> +
>> +	if (security_ftr_enabled(SEC_FTR_L1D_FLUSH_ORI30))
>> +		types |= L1D_FLUSH_ORI;
>> +
>> +	enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
>> +		 security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR);
>>  
>>  	setup_rfi_flush(types, enable);
>> +	setup_count_cache_flush();
>>  }
>>  
>>  static void __init pSeries_setup_arch(void)
>> @@ -549,6 +600,7 @@ static void __init pSeries_setup_arch(void)
>>  	fwnmi_init();
>>  
>>  	pseries_setup_rfi_flush();
>> +	setup_stf_barrier();
>>  
>>  	/* By default, only probe PCI (can be overridden by rtas_pci) */
>>  	pci_add_flags(PCI_PROBE_ONLY);
>> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
>> index 786bf01691c9..83619ebede93 100644
>> - --- a/arch/powerpc/xmon/xmon.c
>> +++ b/arch/powerpc/xmon/xmon.c
>> @@ -2144,6 +2144,8 @@ static void dump_one_paca(int cpu)
>>  	DUMP(p, slb_cache_ptr, "x");
>>  	for (i = 0; i < SLB_CACHE_ENTRIES; i++)
>>  		printf(" slb_cache[%d]:        = 0x%016lx\n", i, p->slb_cache[i]);
>> +
>> +	DUMP(p, rfi_flush_fallback_area, "px");
>>  #endif
>>  	DUMP(p, dscr_default, "llx");
>>  #ifdef CONFIG_PPC_BOOK3E
>> - -- 
>> 2.20.1
>>
>> -----BEGIN PGP SIGNATURE-----
>>
>> iQIcBAEBAgAGBQJcvHWhAAoJEFHr6jzI4aWA6nsP/0YskmAfLovcUmERQ7+bIjq6
>> IcS1T466dvy6MlqeBXU4x8pVgInWeHKEC9XJdkM1lOeib/SLW7Hbz4kgJeOGwFGY
>> lOTaexrxvsBqPm7f6GC0zbl9obEIIIIUs+TielFQANBgqm+q8Wio+XXPP9bpKeKY
>> agSpQ3nwL/PYixznbNmN/lP9py5p89LQ0IBcR7dDBGGWJtD/AXeZ9hslsZxPbPtI
>> nZJ0vdnjuoB2z+hCxfKWlYfLwH0VfoTpqP5x3ALCkvbBr67e8bf6EK8+trnvhyQ8
>> iLY4bp1pm2epAI0/3NfyEiDMsGjVJ6IFlkyhDkHJgJNu0BGcGOSX2GpyU3juviAK
>> c95FtBft/i8AwigOMCivg2mN5edYjsSiPoEItwT5KWqgByJsdr5i5mYVx8cUjMOz
>> iAxLZCdg+UHZYuCBCAO2ZI1G9bVXI1Pa3btMspiCOOOsYGjXGf0oFfKQ+7957hUO
>> ftYYJoGHlMHiHR1OPas6T3lk6YKF9uvfIDTE3OKw2obHbbRz3u82xoWMRGW503MN
>> 7WpkpAP7oZ9RgqIWFVhatWy5f+7GFL0akEi4o2tsZHhYlPau7YWo+nToTd87itwt
>> GBaWJipzge4s13VkhAE+jWFO35Fvwi8uNZ7UgpuKMBECEjkGbtzBTq2MjSF5G8wc
>> yPEod5jby/Iqb7DkGPVG
>> =6DnF
>> -----END PGP SIGNATURE-----
>>

^ permalink raw reply

* Re: [PATCH stable v4.4 00/52] powerpc spectre backports for 4.4
From: Michael Ellerman @ 2019-04-28  6:17 UTC (permalink / raw)
  To: Greg KH; +Cc: npiggin, diana.craciun, linuxppc-dev, stable, msuchanek
In-Reply-To: <20190421163421.GA8449@kroah.com>

Greg KH <gregkh@linuxfoundation.org> writes:

> On Mon, Apr 22, 2019 at 12:19:45AM +1000, Michael Ellerman wrote:
>> -----BEGIN PGP SIGNED MESSAGE-----
>> Hash: SHA1
>> 
>> Hi Greg/Sasha,
>> 
>> Please queue up these powerpc patches for 4.4 if you have no objections.
>
> why?  Do you, or someone else, really care about spectre issues in 4.4?
> Who is using ppc for 4.4 besides a specific enterprise distro (and they
> don't seem to be pulling in my stable updates anyway...)?

Someone asked for it, but TBH I can't remember who it was. I can chase
it up if you like.

cheers

^ permalink raw reply

* Re: [PATCH V3] ASoC: fsl_esai: Add pm runtime function
From: S.j. Wang @ 2019-04-28  2:28 UTC (permalink / raw)
  To: Mark Brown
  Cc: alsa-devel@alsa-project.org, timur@kernel.org,
	Xiubo.Lee@gmail.com, festevam@gmail.com,
	linux-kernel@vger.kernel.org, nicoleotsuka@gmail.com,
	linuxppc-dev@lists.ozlabs.org

Hi  Mark

> On Fri, Apr 26, 2019 at 10:51:15AM +0000, S.j. Wang wrote:
> > > On Mon, Apr 22, 2019 at 02:31:55AM +0000, S.j. Wang wrote:
> > > > Add pm runtime support and move clock handling there.
> > > > Close the clocks at suspend to reduce the power consumption.
> 
> > > > fsl_esai_suspend is replaced by pm_runtime_force_suspend.
> > > > fsl_esai_resume is replaced by pm_runtime_force_resume.
> 
> > > This doesn't apply against current code, please check and resend.
> 
> > Which branch are you using?  I tried for-next and for-linus; both
> > applied successfully.
> 
> I'm applying against for-5.2, though if it depends on a patch queued for
> 5.1 that's fine, I can just merge that up - please just resend.  I think I did try
> merging 5.1 though...

I think it may be caused by the patch "ASoC: fsl_esai: Fix missing break
in switch statement", so I have resent them both based on for-5.2.

best regards
wang shengjiu

^ permalink raw reply

* [PATCH V4] ASoC: fsl_esai: Add pm runtime function
From: S.j. Wang @ 2019-04-28  2:24 UTC (permalink / raw)
  To: timur@kernel.org, nicoleotsuka@gmail.com, Xiubo.Lee@gmail.com,
	festevam@gmail.com, broonie@kernel.org,
	alsa-devel@alsa-project.org
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org

Add pm runtime support and move clock handling there.
Close the clocks at suspend to reduce the power consumption.

fsl_esai_suspend is replaced by pm_runtime_force_suspend.
fsl_esai_resume is replaced by pm_runtime_force_resume.

Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
---
Changes in v4
-resend based on for-5.2

Changes in v3
-refine the commit comments.
-add acked-by

Changes in v2
-refine the commit comments.
-move regcache_mark_dirty to runtime suspend.

 sound/soc/fsl/fsl_esai.c | 141 ++++++++++++++++++++++++++---------------------
 1 file changed, 77 insertions(+), 64 deletions(-)

diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c
index bad0dfed6b68..10d2210c91ef 100644
--- a/sound/soc/fsl/fsl_esai.c
+++ b/sound/soc/fsl/fsl_esai.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/of_irq.h>
 #include <linux/of_platform.h>
+#include <linux/pm_runtime.h>
 #include <sound/dmaengine_pcm.h>
 #include <sound/pcm_params.h>
 
@@ -466,30 +467,6 @@ static int fsl_esai_startup(struct snd_pcm_substream *substream,
 			    struct snd_soc_dai *dai)
 {
 	struct fsl_esai *esai_priv = snd_soc_dai_get_drvdata(dai);
-	int ret;
-
-	/*
-	 * Some platforms might use the same bit to gate all three or two of
-	 * clocks, so keep all clocks open/close at the same time for safety
-	 */
-	ret = clk_prepare_enable(esai_priv->coreclk);
-	if (ret)
-		return ret;
-	if (!IS_ERR(esai_priv->spbaclk)) {
-		ret = clk_prepare_enable(esai_priv->spbaclk);
-		if (ret)
-			goto err_spbaclk;
-	}
-	if (!IS_ERR(esai_priv->extalclk)) {
-		ret = clk_prepare_enable(esai_priv->extalclk);
-		if (ret)
-			goto err_extalck;
-	}
-	if (!IS_ERR(esai_priv->fsysclk)) {
-		ret = clk_prepare_enable(esai_priv->fsysclk);
-		if (ret)
-			goto err_fsysclk;
-	}
 
 	if (!dai->active) {
 		/* Set synchronous mode */
@@ -506,16 +483,6 @@ static int fsl_esai_startup(struct snd_pcm_substream *substream,
 
 	return 0;
 
-err_fsysclk:
-	if (!IS_ERR(esai_priv->extalclk))
-		clk_disable_unprepare(esai_priv->extalclk);
-err_extalck:
-	if (!IS_ERR(esai_priv->spbaclk))
-		clk_disable_unprepare(esai_priv->spbaclk);
-err_spbaclk:
-	clk_disable_unprepare(esai_priv->coreclk);
-
-	return ret;
 }
 
 static int fsl_esai_hw_params(struct snd_pcm_substream *substream,
@@ -576,20 +543,6 @@ static int fsl_esai_hw_params(struct snd_pcm_substream *substream,
 	return 0;
 }
 
-static void fsl_esai_shutdown(struct snd_pcm_substream *substream,
-			      struct snd_soc_dai *dai)
-{
-	struct fsl_esai *esai_priv = snd_soc_dai_get_drvdata(dai);
-
-	if (!IS_ERR(esai_priv->fsysclk))
-		clk_disable_unprepare(esai_priv->fsysclk);
-	if (!IS_ERR(esai_priv->extalclk))
-		clk_disable_unprepare(esai_priv->extalclk);
-	if (!IS_ERR(esai_priv->spbaclk))
-		clk_disable_unprepare(esai_priv->spbaclk);
-	clk_disable_unprepare(esai_priv->coreclk);
-}
-
 static int fsl_esai_trigger(struct snd_pcm_substream *substream, int cmd,
 			    struct snd_soc_dai *dai)
 {
@@ -658,7 +611,6 @@ static int fsl_esai_trigger(struct snd_pcm_substream *substream, int cmd,
 
 static const struct snd_soc_dai_ops fsl_esai_dai_ops = {
 	.startup = fsl_esai_startup,
-	.shutdown = fsl_esai_shutdown,
 	.trigger = fsl_esai_trigger,
 	.hw_params = fsl_esai_hw_params,
 	.set_sysclk = fsl_esai_set_dai_sysclk,
@@ -947,6 +899,10 @@ static int fsl_esai_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	pm_runtime_enable(&pdev->dev);
+
+	regcache_cache_only(esai_priv->regmap, true);
+
 	ret = imx_pcm_dma_init(pdev, IMX_ESAI_DMABUF_SIZE);
 	if (ret)
 		dev_err(&pdev->dev, "failed to init imx pcm dma: %d\n", ret);
@@ -954,6 +910,13 @@ static int fsl_esai_probe(struct platform_device *pdev)
 	return ret;
 }
 
+static int fsl_esai_remove(struct platform_device *pdev)
+{
+	pm_runtime_disable(&pdev->dev);
+
+	return 0;
+}
+
 static const struct of_device_id fsl_esai_dt_ids[] = {
 	{ .compatible = "fsl,imx35-esai", },
 	{ .compatible = "fsl,vf610-esai", },
@@ -961,22 +924,35 @@ static int fsl_esai_probe(struct platform_device *pdev)
 };
 MODULE_DEVICE_TABLE(of, fsl_esai_dt_ids);
 
-#ifdef CONFIG_PM_SLEEP
-static int fsl_esai_suspend(struct device *dev)
-{
-	struct fsl_esai *esai = dev_get_drvdata(dev);
-
-	regcache_cache_only(esai->regmap, true);
-	regcache_mark_dirty(esai->regmap);
-
-	return 0;
-}
-
-static int fsl_esai_resume(struct device *dev)
+#ifdef CONFIG_PM
+static int fsl_esai_runtime_resume(struct device *dev)
 {
 	struct fsl_esai *esai = dev_get_drvdata(dev);
 	int ret;
 
+	/*
+	 * Some platforms might use the same bit to gate all three or two of
+	 * clocks, so keep all clocks open/close at the same time for safety
+	 */
+	ret = clk_prepare_enable(esai->coreclk);
+	if (ret)
+		return ret;
+	if (!IS_ERR(esai->spbaclk)) {
+		ret = clk_prepare_enable(esai->spbaclk);
+		if (ret)
+			goto err_spbaclk;
+	}
+	if (!IS_ERR(esai->extalclk)) {
+		ret = clk_prepare_enable(esai->extalclk);
+		if (ret)
+			goto err_extalclk;
+	}
+	if (!IS_ERR(esai->fsysclk)) {
+		ret = clk_prepare_enable(esai->fsysclk);
+		if (ret)
+			goto err_fsysclk;
+	}
+
 	regcache_cache_only(esai->regmap, false);
 
 	/* FIFO reset for safety */
@@ -987,22 +963,59 @@ static int fsl_esai_resume(struct device *dev)
 
 	ret = regcache_sync(esai->regmap);
 	if (ret)
-		return ret;
+		goto err_regcache_sync;
 
 	/* FIFO reset done */
 	regmap_update_bits(esai->regmap, REG_ESAI_TFCR, ESAI_xFCR_xFR, 0);
 	regmap_update_bits(esai->regmap, REG_ESAI_RFCR, ESAI_xFCR_xFR, 0);
 
 	return 0;
+
+err_regcache_sync:
+	if (!IS_ERR(esai->fsysclk))
+		clk_disable_unprepare(esai->fsysclk);
+err_fsysclk:
+	if (!IS_ERR(esai->extalclk))
+		clk_disable_unprepare(esai->extalclk);
+err_extalclk:
+	if (!IS_ERR(esai->spbaclk))
+		clk_disable_unprepare(esai->spbaclk);
+err_spbaclk:
+	clk_disable_unprepare(esai->coreclk);
+
+	return ret;
 }
-#endif /* CONFIG_PM_SLEEP */
+
+static int fsl_esai_runtime_suspend(struct device *dev)
+{
+	struct fsl_esai *esai = dev_get_drvdata(dev);
+
+	regcache_cache_only(esai->regmap, true);
+	regcache_mark_dirty(esai->regmap);
+
+	if (!IS_ERR(esai->fsysclk))
+		clk_disable_unprepare(esai->fsysclk);
+	if (!IS_ERR(esai->extalclk))
+		clk_disable_unprepare(esai->extalclk);
+	if (!IS_ERR(esai->spbaclk))
+		clk_disable_unprepare(esai->spbaclk);
+	clk_disable_unprepare(esai->coreclk);
+
+	return 0;
+}
+#endif /* CONFIG_PM */
 
 static const struct dev_pm_ops fsl_esai_pm_ops = {
-	SET_SYSTEM_SLEEP_PM_OPS(fsl_esai_suspend, fsl_esai_resume)
+	SET_RUNTIME_PM_OPS(fsl_esai_runtime_suspend,
+			   fsl_esai_runtime_resume,
+			   NULL)
+	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_force_suspend,
+				pm_runtime_force_resume)
 };
 
 static struct platform_driver fsl_esai_driver = {
 	.probe = fsl_esai_probe,
+	.remove = fsl_esai_remove,
 	.driver = {
 		.name = "fsl-esai-dai",
 		.pm = &fsl_esai_pm_ops,
-- 
1.9.1


^ permalink raw reply related

* [PATCH V6] ASoC: fsl_esai: Fix missing break in switch statement
From: S.j. Wang @ 2019-04-28  2:24 UTC (permalink / raw)
  To: timur@kernel.org, nicoleotsuka@gmail.com, Xiubo.Lee@gmail.com,
	festevam@gmail.com, broonie@kernel.org,
	alsa-devel@alsa-project.org
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org

case ESAI_HCKT_EXTAL and case ESAI_HCKR_EXTAL should be
independent of each other, so replace fall-through with break.

Fixes: 43d24e76b698 ("ASoC: fsl_esai: Add ESAI CPU DAI driver")
Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
Acked-by: Nicolin Chen <nicoleotsuka@gmail.com>
Cc: <stable@vger.kernel.org>
---
Changes in V6
- resend based on for-5.2

Changes in v5
- remove new line after Fixes

Changes in v4
- Add acked-by

Changes in v3
- Update subject line and cc stable

Changes in v2
- Fix "Fixes" tag

 sound/soc/fsl/fsl_esai.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/fsl/fsl_esai.c b/sound/soc/fsl/fsl_esai.c
index c7410bbfd2af..bad0dfed6b68 100644
--- a/sound/soc/fsl/fsl_esai.c
+++ b/sound/soc/fsl/fsl_esai.c
@@ -251,7 +251,7 @@ static int fsl_esai_set_dai_sysclk(struct snd_soc_dai *dai, int clk_id,
 		break;
 	case ESAI_HCKT_EXTAL:
 		ecr |= ESAI_ECR_ETI;
-		/* fall through */
+		break;
 	case ESAI_HCKR_EXTAL:
 		ecr |= esai_priv->synchronous ? ESAI_ECR_ETI : ESAI_ECR_ERI;
 		break;
-- 
1.9.1


^ permalink raw reply related

* Re: [PATCH 2/3 v2] ASoC: fsl_sai: Add support for runtime pm
From: Mark Brown @ 2019-04-27 17:17 UTC (permalink / raw)
  To: Daniel Baluta
  Cc: alsa-devel@alsa-project.org, linuxppc-dev@lists.ozlabs.org,
	timur@kernel.org, Xiubo.Lee@gmail.com, festevam@gmail.com,
	S.j. Wang, tiwai@suse.com, lgirdwood@gmail.com, perex@perex.cz,
	nicoleotsuka@gmail.com, dl-linux-imx, Daniel Baluta,
	linux-kernel@vger.kernel.org
In-Reply-To: <CAEnQRZCPPr1iHvKEj=fOsE8A2iW=XZd=CaoFqAoUqJftg4pN9Q@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 295 bytes --]

On Fri, Apr 26, 2019 at 03:10:10PM +0300, Daniel Baluta wrote:

> The only patch left in the series that needs to be applied is this:

> https://www.spinics.net/lists/alsa-devel/msg89733.html

> I will reply also to that email, to be easier for you to find it.

Content free pings don't help...

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH V3] ASoC: fsl_esai: Add pm runtime function
From: Mark Brown @ 2019-04-27 17:17 UTC (permalink / raw)
  To: S.j. Wang
  Cc: alsa-devel@alsa-project.org, timur@kernel.org,
	Xiubo.Lee@gmail.com, festevam@gmail.com,
	linux-kernel@vger.kernel.org, nicoleotsuka@gmail.com,
	linuxppc-dev@lists.ozlabs.org
In-Reply-To: <VE1PR04MB64797233B0BFE833550094C3E33E0@VE1PR04MB6479.eurprd04.prod.outlook.com>

[-- Attachment #1: Type: text/plain, Size: 721 bytes --]

On Fri, Apr 26, 2019 at 10:51:15AM +0000, S.j. Wang wrote:
> > On Mon, Apr 22, 2019 at 02:31:55AM +0000, S.j. Wang wrote:
> > > Add pm runtime support and move clock handling there.
> > > Close the clocks at suspend to reduce the power consumption.

> > > fsl_esai_suspend is replaced by pm_runtime_force_suspend.
> > > fsl_esai_resume is replaced by pm_runtime_force_resume.

> > This doesn't apply against current code, please check and resend.

> Which branch are you using?  I tried for-next and for-linus; both
> applied successfully.

I'm applying against for-5.2, though if it depends on a patch queued for
5.1 that's fine, I can just merge that up - please just resend.  I think
I did try merging 5.1 though...

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox