LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
From: Xiongwei Song @ 2021-08-06 13:14 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: ravi.bangoria, Xiongwei Song, oleg, Linux Kernel Mailing List,
	efremov, Paul Mackerras, npiggin, aneesh.kumar, peterx, PowerPC,
	akpm, sandipan
In-Reply-To: <874kc3njxh.fsf@mpe.ellerman.id.au>

On Fri, Aug 6, 2021 at 2:53 PM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> sxwjean@me.com writes:
> > From: Xiongwei Song <sxwjean@gmail.com>
> >
> > Create an anonymous union for dsisr and esr regsiters, we can reference
> > esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y.
> > Otherwise, reference dsisr. This makes code more clear.
> >
> > Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
> > ---
> >  arch/powerpc/include/asm/ptrace.h          |  5 ++++-
> >  arch/powerpc/include/uapi/asm/ptrace.h     |  5 ++++-
> >  arch/powerpc/kernel/process.c              |  2 +-
> >  arch/powerpc/kernel/ptrace/ptrace.c        |  2 ++
> >  arch/powerpc/kernel/traps.c                |  2 +-
> >  arch/powerpc/mm/fault.c                    | 16 ++++++++++++++--
> >  arch/powerpc/platforms/44x/machine_check.c |  4 ++--
> >  arch/powerpc/platforms/4xx/machine_check.c |  2 +-
> >  8 files changed, 29 insertions(+), 9 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
> > index 3e5d470a6155..c252d04b1206 100644
> > --- a/arch/powerpc/include/asm/ptrace.h
> > +++ b/arch/powerpc/include/asm/ptrace.h
> > @@ -44,7 +44,10 @@ struct pt_regs
> >  #endif
> >                       unsigned long trap;
> >                       unsigned long dar;
> > -                     unsigned long dsisr;
> > +                     union {
> > +                             unsigned long dsisr;
> > +                             unsigned long esr;
> > +                     };
>
> I don't mind doing that.
>
> >                       unsigned long result;
> >               };
> >       };
> > diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h
> > index 7004cfea3f5f..e357288b5f34 100644
> > --- a/arch/powerpc/include/uapi/asm/ptrace.h
> > +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> > @@ -53,7 +53,10 @@ struct pt_regs
> >       /* N.B. for critical exceptions on 4xx, the dar and dsisr
> >          fields are overloaded to hold srr0 and srr1. */
> >       unsigned long dar;              /* Fault registers */
> > -     unsigned long dsisr;            /* on 4xx/Book-E used for ESR */
> > +     union {
> > +             unsigned long dsisr;            /* on Book-S used for DSISR */
> > +             unsigned long esr;              /* on 4xx/Book-E used for ESR */
> > +     };
> >       unsigned long result;           /* Result of a system call */
> >  };
>
> But I'm not sure about the use of anonymous unions in UAPI headers. Old
> compilers don't support them, so there's a risk of breakage.
>
> I'd rather we didn't touch the uapi version.

Ok. Will discard the change.

>
>
> > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> > index 185beb290580..f74af8f9133c 100644
> > --- a/arch/powerpc/kernel/process.c
> > +++ b/arch/powerpc/kernel/process.c
> > @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs)
> >           trap == INTERRUPT_DATA_STORAGE ||
> >           trap == INTERRUPT_ALIGNMENT) {
> >               if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> > -                     pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
> > +                     pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr);
> >               else
> >                       pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
> >       }
> > diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
> > index 0a0a33eb0d28..00789ad2c4a3 100644
> > --- a/arch/powerpc/kernel/ptrace/ptrace.c
> > +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> > @@ -375,6 +375,8 @@ void __init pt_regs_check(void)
> >                    offsetof(struct user_pt_regs, dar));
> >       BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) !=
> >                    offsetof(struct user_pt_regs, dsisr));
> > +     BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
> > +                  offsetof(struct user_pt_regs, esr));
> >       BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
> >                    offsetof(struct user_pt_regs, result));
> >
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> > index dfbce527c98e..2164f5705a0b 100644
> > --- a/arch/powerpc/kernel/traps.c
> > +++ b/arch/powerpc/kernel/traps.c
> > @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs)
> >  #ifdef CONFIG_PPC_ADV_DEBUG_REGS
> >  /* On 4xx, the reason for the machine check or program exception
> >     is in the ESR. */
> > -#define get_reason(regs)     ((regs)->dsisr)
> > +#define get_reason(regs)     ((regs)->esr)
> >  #define REASON_FP            ESR_FP
> >  #define REASON_ILLEGAL               (ESR_PIL | ESR_PUO)
> >  #define REASON_PRIVILEGED    ESR_PPR
> > diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> > index a8d0ce85d39a..62953d4e7c93 100644
> > --- a/arch/powerpc/mm/fault.c
> > +++ b/arch/powerpc/mm/fault.c
> > @@ -541,7 +541,11 @@ static __always_inline void __do_page_fault(struct pt_regs *regs)
> >  {
> >       long err;
> >
> > -     err = ___do_page_fault(regs, regs->dar, regs->dsisr);
> > +     if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> > +             err = ___do_page_fault(regs, regs->dar, regs->esr);
> > +     else
> > +             err = ___do_page_fault(regs, regs->dar, regs->dsisr);
>
> As Christophe said, I don't thinks this is an improvement.
>
> It makes the code less readable. If anyone is confused about what is
> passed to ___do_page_fault() they can either read the comment above it,
> or look at the definition of pt_regs to see that esr and dsisr share
> storage.

Ok, thanks a lot.  Will send v2.

Regards,
Xiongwei

>
> cheers

^ permalink raw reply

* Re: [PATCH] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs
From: Cédric Le Goater @ 2021-08-06 11:50 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Laurent Vivier, Srikar Dronamraju, Geetika Moolchandani, stable,
	David Gibson
In-Reply-To: <20210629131542.743888-1-clg@kaod.org>

On 6/29/21 3:15 PM, Cédric Le Goater wrote:
> On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at
> runtime. Today, the IPI is not created for such nodes, and hot-plugged
> CPUs use a bogus IPI, which leads to soft lockups.
> 
> We could create the node IPI on demand but it is a bit complex because
> this code would be called under bringup_up() and some IRQ locking is
> being done. The simplest solution is to create the IPIs for all nodes
> at startup.
> 
> Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node")
> Cc: stable@vger.kernel.org # v5.13
> Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
> Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
> Signed-off-by: Cédric Le Goater <clg@kaod.org>
> ---
> 
> This patch breaks old versions of irqbalance (<= v1.4). Possible nodes
> are collected from /sys/devices/system/node/ but CPU-less nodes are
> not listed there. When interrupts are scanned, the link representing
> the node structure is NULL and segfault occurs.

This is an irqbalance regression due to :

  https://github.com/Irqbalance/irqbalance/pull/172

I will report through an issue. 

Anyhow, there is a better approach which is to allocate IPIs for all 
nodes at boot time and do the mapping on demand. Removing the mapping 
on last use seems more complex though. 

I will send a v2 after some tests.  

Thanks,

C.

> Version 1.7 seems immune. 
> 
> ---
>  arch/powerpc/sysdev/xive/common.c | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index f3b16ed48b05..5d2c58dba57e 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -1143,10 +1143,6 @@ static int __init xive_request_ipi(void)
>  		struct xive_ipi_desc *xid = &xive_ipis[node];
>  		struct xive_ipi_alloc_info info = { node };
>  
> -		/* Skip nodes without CPUs */
> -		if (cpumask_empty(cpumask_of_node(node)))
> -			continue;
> -
>  		/*
>  		 * Map one IPI interrupt per node for all cpus of that node.
>  		 * Since the HW interrupt number doesn't have any meaning,
> 


^ permalink raw reply

* Re: [PATCH printk v1 03/10] kgdb: delay roundup if holding printk cpulock
From: Daniel Thompson @ 2021-08-06 12:06 UTC (permalink / raw)
  To: John Ogness
  Cc: Gautham R. Shenoy, Douglas Anderson, Srikar Dronamraju,
	Peter Zijlstra, linux-kernel, Paul Mackerras, H. Peter Anvin,
	Chengyang Fan, Bhaskar Chowdhury, x86, Ingo Molnar,
	kgdb-bugreport, Petr Mladek, Nicholas Piggin, Borislav Petkov,
	Steven Rostedt, Thomas Gleixner, Gustavo A. R. Silva,
	Sergey Senozhatsky, Jason Wessel, linuxppc-dev,
	Cédric Le Goater
In-Reply-To: <87tuk4lfj0.fsf@jogness.linutronix.de>

On Thu, Aug 05, 2021 at 05:52:43AM +0206, John Ogness wrote:
> On 2021-08-04, Daniel Thompson <daniel.thompson@linaro.org> wrote:
> > On Wed, Aug 04, 2021 at 02:12:22PM +0200, Petr Mladek wrote:
> >> On Wed 2021-08-04 12:31:59, Daniel Thompson wrote:
> >> > On Tue, Aug 03, 2021 at 05:36:32PM +0206, John Ogness wrote:
> >> > > On 2021-08-03, Daniel Thompson <daniel.thompson@linaro.org> wrote:
> >> > > > On Tue, Aug 03, 2021 at 03:18:54PM +0206, John Ogness wrote:
> >> > > >> kgdb makes use of its own cpulock (@dbg_master_lock, @kgdb_active)
> >> > > >> during cpu roundup. This will conflict with the printk cpulock.
> >> > > >
> >> > > > When the full vision is realized what will be the purpose of the printk
> >> > > > cpulock?
> >> > > >
> >> > > > I'm asking largely because it's current role is actively unhelpful
> >> > > > w.r.t. kdb. It is possible that cautious use of in_dbg_master() might
> >> > > > be a better (and safer) solution. However it sounds like there is a
> >> > > > larger role planned for the printk cpulock...
> >> > > 
> >> > > The printk cpulock is used as a synchronization mechanism for
> >> > > implementing atomic consoles, which need to be able to safely interrupt
> >> > > the console write() activity at any time and immediately continue with
> >> > > their own printing. The ultimate goal is to move all console printing
> >> > > into per-console dedicated kthreads, so the primary function of the
> >> > > printk cpulock is really to immediately _stop_ the CPU/kthread
> >> > > performing write() in order to allow write_atomic() (from any context on
> >> > > any CPU) to safely and reliably take over.
> >> > 
> >> > I see.
> >> > 
> >> > Is there any mileage in allowing in_dbg_master() to suppress taking
> >> > the console lock?
> >> > 
> >> > There's a couple of reasons to worry about the current approach.
> >> > 
> >> > The first is that we don't want this code to trigger in the case when
> >> > kgdb is enabled and kdb is not since it is only kdb (a self-hosted
> >> > debugger) than uses the consoles. This case is relatively trivial to
> >> > address since we can rename it kdb_roundup_delay() and alter the way it
> >> > is conditionally compiled.
> 
> Well, _I_ want this code to trigger even without kdb. The printk cpulock
> is meant to be the innermost locking for the entire kernel. No code is
> allowed to block/spin on any kind of lock if holding the printk
> cpulock. This is the only way to guarantee the functionality of the
> atomic consoles.
>
> For example, if the kernel were to crash while inside kgdb code, we want
> to see the backtrace.

That would certainly help me debug any such problems in kgdb ;-) .


> Since kgdb _does_ take locks (spinning on @dbg_slave_lock during roundup
> and the master's own cpu lock as a retry loop on @dbg_master_lock),
> clearly it is not allowed to hold the printk cpulock. The simplest
> solution I could find was just to make sure kgdb_cpu_enter() isn't
> called while holding the printk cpulock.

We might have to come back to this. I'm pretty certain your patch
does not currently achieve this goal.


> >> > The second is more of a problem however. kdb will only call into the
> >> > console code from the debug master. By default this is the CPU that
> >> > takes the debug trap so initial prints will work fine. However it is
> >> > possible to switch to a different master (so we can read per-CPU
> >> > registers and things like that). This will result in one of the CPUs
> >> > that did the IPI round up calling into console code and this is unsafe
> >> > in that instance.
> 
> It is only unsafe if a CPU enters "kgdb/kdb context" while holding the
> printk cpulock. That is what I want to prevent.

Currently you can preventing this only for CPUs that enter the debugger
via an IPI. CPUs that enter due to a breakpoint (and there can be more
than one at a time) cannot just continue until the lock is dropped
since they would end up re-executing the breakpoint instruction.


> >> > There are a couple of tricks we could adopt to work around this but
> >> > given the slightly odd calling context for kdb (all CPUs quiesced, no
> >> > log interleaving possible) it sounds like it would remain safe to
> >> > bypass the lock if in_dbg_master() is true.
> >> > 
> >> > Bypassing an inconvenient lock might sound icky but:
> >> > 
> >> > 1. If the lock is not owned by any CPU then what kdb will do is safe.
> 
> No. The printk cpulock exists for low-level synchronization. The atomic
> consoles need this synchronization. (For example, the 8250 needs this
> for correct tracking of its interrupt register, even for
> serial8250_put_poll_char().)

What I mean is that because kdb is mono-threaded (even on SMP systems
due to the quiescing of other CPUs) then if the lock is not taken when
we enter kdb then it is safe for kdb to contend for the lock in the
normal way since it cannot deadlock.

BTW the synchronization in question is the need for strict nesting, is
that right? (e.g.  that each context that recursively acquires the lock
will release it in strict reverse order?).


> >> > 2. If the lock is owned by any CPU then we have quiesced it anyway
> >> >    and this makes is safe for the owning CPU to share its ownership
> >> >    (since it isn't much different to recursive acquisition on a single
> >> >    CPU)
> 
> Quiescing the printk cpulock is not permitted.

Sorry I wasn't quite clear in phrasing here. I don't think of it as
quiescing the lock, I think of it as quiescing the CPU that owns the
lock.

If any CPU that owns the lock *and* all CPUs except the debug master are
quiesced then allowing the debug master to take the lock is essentially
a special case of recursive acquisition and it will nest correctly.


> Just because it is kdb, does not mean that the atomic consoles were
> interrupted in a convenient place. The whole purpose of the atomic
> consoles is so that we can have guaranteed console output from _any_
> context and _any_ line of code in the kernel.
> 
> >> I think about the following:
> >> 
> >> void kgdb_roundup_cpus(void)
> >> {
> >> 	__printk_cpu_lock();
> >> 	__kgdb_roundup_cpus();
> >> }
> >> 
> >> , where __printk_cpu_lock() waits/takes printk_cpu_lock()
> >> 	__kgdb_roundup_cpus() is the original kgdb_roundup_cpus();
> >> 
> >> 
> >> The idea is that kgdb_roundup_cpus() caller takes the printk_cpu lock.
> >> The owner will be well defined.
> >> 
> >> As a result any other CPU will not be able to take the printk_cpu lock
> >> as long as it is owned by the kgdb lock. But as you say, kgdb will
> >> make sure that everything is serialized at this stage. So that
> >> the original raw_printk_cpu_lock_irqsave() might just disable
> >> IRQs when called under debugger.
> >> 
> >> Does it make any sense?
> >
> > Yes but I think it is still has problems.
> >
> > Primarily is doesn't solve the issue I raised. It would still be unsafe
> > to change debug master: we can guarantee the initial master owns the
> > lock but if it has been multiply acquired we cannot transfer ownership
> > when we want to change master.
> >
> > Additionally it will delay the round up of cores that do not own the
> > lock. The quiescing is never atomic and the operator needs to know
> > that but the longer CPUs are allows to execute for the more confusing
> > things can become for the operator.
> >
> > Finally on machines without an NMI this could cause trouble with the
> > interrupt disable in raw_printk_cpu_lock_irqsave() (or any outer level
> > interrupt disable). If the master get the lock then the other processes
> > will become incapable of being rounded up if they are waiting for the
> > printk lock).
> 
> I am also not happy with such a solution. Aside from Daniel's comments,
> it also violates the basic principle of the printk cpulock by allowing
> further locking while holding the print cpulock. That is a recipe for
> deadlock.
> 
> >> I have to say that it is a bit hairy. But it looks slightly better
> >> than the delayed/repeated IPI proposed by this patch.
> >
> > I'd like to reserve judgement for now which one is least worst...
> > largely because if the purpose of the lock simply to prevent interleaving
> > of console output then the debugger quiescing code should already have
> > this covered.
> >
> > It leaves me wondering if a change like the one below is sufficient
> > (based on code without John's patches but hopefully still clear enough).
> > I've given the new code it's own branch which it doesn't, strictly
> > speaking, need but it is easier to comment this way... and perhaps also
> > just a little easier for people who have not set CONFIG_KGDB to
> > ignore ;-).
> >
> > ~~~
> > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> > index 142a58d124d9..41a7e103bb66 100644
> > --- a/kernel/printk/printk.c
> > +++ b/kernel/printk/printk.c
> > @@ -3599,6 +3599,18 @@ int __printk_cpu_trylock(void)
> >                 /* This CPU is already the owner. */
> >                 atomic_inc(&printk_cpulock_nested);
> >                 return 1;
> > +       } else if (in_dbg_master()) {
> > +               /*
> > +                * If we are executing as the primary CPU and with the debugger
> > +                * active than all other CPUs in the system are quiesced by
> > +                * the time kdb winds up calling this function. To execute this
> > +                * branch then the lock must be owned by one of the quiesced CPUs.
> > +                * Happily, because it is quiesced and cannot release it, it is
> > +                * safe for us to allow the lock to be taken from a different CPU!
> > +                * The lock will be released prior to resuming the real owner.
> > +                */
> > +               atomic_inc(&printk_cpulock_nested);
> > +               return 1;
> >         }
> >  
> >         return 0;
> > ~~~
> 
> Being in kgdb/kdb context is similar to being in atomic console
> context. (Of course, they are both using cpu locks.) But the contexts
> are not the same. It is incorrect to handle them as the same.
> 
> We need to decide who is inside of who. Either printk is the innermost,
> in which case the printk cpulock cannot be held when calling
> kgdb_cpu_enter().

It is difficult to prevent this for the breakpoint cases... although
since everything about your current work is difficult I don't expect
that to be a sufficient argument on its own!


> Or kgdb is the innermost, meaning that the atomic
> consoles are no longer atomic/reliable while in kgdb.
> 
> I prefer and am pushing for the first, but am willing to accept the
> second (i.e. that kgdb is the innermost function of the kernel).

I think it will always be the case that we might execute breakpoints in
an NMI context since the collateral damage from forbidding breakpoints
on all API that *might* be called from NMI is likely to constrain the
not-NMI debugging experience too much. However it *is* possible to defer
breakpoints: we could defer them by calling into the
out-of-line-single-step logic that is needed to support kprobes.  I
dislike this approach since there is no way to fixup the PC so when
we eventually stop then gdb would have trouble figuring out
why the system has stopped.

However taking on board what you are saying about innermost functions
I think there might be a we could look into that is much nicer from an
analysis point of view than relying in in_dbg_master() to implicitly
borrow the printk lock.

Would you consider a means for kgdb to *explicitly* allow a slave CPU
to donate ownership to the debug master as part of it's spin loop (e.g.
explicitly transfer ownership if and only if we are quiesced). This
has a number of nice properties:

1. The ownership transfer happens *after* we have decided who the
   master actually is and before that point everything works as
   normal!

2. Safe-nesting is guaranteed by the slave CPUs exception stack.

3. We can print (and expect it to be seen) pretty much anywhere in the
   master code path (including the ones before we find out who will be
   master since that happens before the IPIs) with no trouble.

3. Handling change of master is easy... we can re-donate the lock
   to the new master using the same or similar API.

4. We can print anywhere in the slave code *except* for the tight
   loop we run after donating ownership to the master and the code
   after an former master CPU donates the lock to the next master
   and before the former master drops into the slave loop.

5. Apart from the function to donate ownership all the nasty code
   to handle it ends up in kgdb where is belongs rather than smeared
   in your lock code.

I can't decide if this makes a tiny piece of kgdb inner-most or not
but it is certainly much easier to analyse how kgdb and atomic consoles
interact.


> > PS In the interested of full disclosure there is a special case
> >    in the debugger to allow it to try to cope if it fails to
> >    quiesce a CPU and I deliberately omitted this from the long
> >    comment above. That special case is expected to be unstable
> >    but since the alternative is likely to be a permanent deadlock
> >    without any indication of why we choose to take the risk of
> >    continuing. Personally I don't recommend reasoning about
> >    console safety based on this emergency case hence omitting the
> >    comment.

The above idea of explicitly transferring lock ownership also allows us
to analyse this case (where as the in_dbg_master() approach meant it was
too hard). If the CPU cannot be rounded up (will not react to the NMI
IPI) *and* it owns the printk lock and won't give it back then kdb will
deadlock. Given your goals w.r.t. reliability of atomic consoles then I
am more than happy to live with this!


Daniel.

^ permalink raw reply

* Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
From: Nicholas Piggin @ 2021-08-06 10:42 UTC (permalink / raw)
  To: Athira Rajeev; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <4600EC62-5505-4856-AE23-939ED62287B3@linux.vnet.ibm.com>

Excerpts from Athira Rajeev's message of August 6, 2021 7:28 pm:
> 
> 
>> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> It can be useful in simulators (with very constrained environments)
>> to allow some PMCs to run from boot so they can be sampled directly
>> by a test harness, rather than having to run perf.
>> 
>> A previous change freezes counters at boot by default, so provide
>> a boot time option to un-freeze (plus a bit more flexibility).
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> .../admin-guide/kernel-parameters.txt         |  7 ++++
>> arch/powerpc/perf/core-book3s.c               | 35 +++++++++++++++++++
>> 2 files changed, 42 insertions(+)
>> 
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index bdb22006f713..96b7d0ebaa40 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -4089,6 +4089,13 @@
>> 			Override pmtimer IOPort with a hex value.
>> 			e.g. pmtmr=0x508
>> 
>> +	pmu=		[PPC] Manually enable the PMU.
>> +			Enable the PMU by setting MMCR0 to 0 (clear FC bit).
>> +			This option is implemented for Book3S processors.
>> +			If a number is given, then MMCR1 is set to that number,
>> +			otherwise (e.g., 'pmu=on'), it is left 0. The perf
>> +			subsystem is disabled if this option is used.
>> +
>> 	pm_debug_messages	[SUSPEND,KNL]
>> 			Enable suspend/resume debug messages during boot up.
>> 
>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>> index 65795cadb475..e7cef4fe17d7 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu)
>> }
>> 
>> #ifdef CONFIG_PPC64
>> +static bool pmu_override = false;
>> +static unsigned long pmu_override_val;
>> +static void do_pmu_override(void *data)
>> +{
>> +	ppc_set_pmu_inuse(1);
>> +	if (pmu_override_val)
>> +		mtspr(SPRN_MMCR1, pmu_override_val);
>> +	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
> 
> Hi Nick
> 
> Here, we are not doing any validity check for the value used to set MMCR1. 
> For advanced users, the option to pass value for MMCR1 is fine. But other cases, it could result in
> invalid event getting used. Do we need to restrict this boot time option for only PMC5/6 ?

Depends what would be useful. We don't have to prevent the admin shooting 
themselves in the foot with options like this, but if we can make it 
safer without making it less useful then that's always a good option.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
From: Nicholas Piggin @ 2021-08-06 10:38 UTC (permalink / raw)
  To: kvm-ppc, Madhavan Srinivasan; +Cc: linuxppc-dev
In-Reply-To: <e7bb1311-3b50-dcc2-7fb0-1773558e9abc@linux.ibm.com>

Excerpts from Madhavan Srinivasan's message of August 6, 2021 5:33 pm:
> 
> On 7/26/21 9:19 AM, Nicholas Piggin wrote:
>> It can be useful in simulators (with very constrained environments)
>> to allow some PMCs to run from boot so they can be sampled directly
>> by a test harness, rather than having to run perf.
>>
>> A previous change freezes counters at boot by default, so provide
>> a boot time option to un-freeze (plus a bit more flexibility).
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>   .../admin-guide/kernel-parameters.txt         |  7 ++++
>>   arch/powerpc/perf/core-book3s.c               | 35 +++++++++++++++++++
>>   2 files changed, 42 insertions(+)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index bdb22006f713..96b7d0ebaa40 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -4089,6 +4089,13 @@
>>   			Override pmtimer IOPort with a hex value.
>>   			e.g. pmtmr=0x508
>>
>> +	pmu=		[PPC] Manually enable the PMU.
> 
> 
> This is bit confusing, IIUC, we are manually disabling the perf 
> registration
> with this option and not pmu.
> If this option is used, we will unfreeze the
> MMCR0_FC (only in the HV_mode) and not register perf subsystem.

With the previous patch, this option un-freezes the PMU
(and disables perf).

> Since this option is valid only for HV_mode, canwe call it
> kvm_disable_perf or kvm_dis_perf.

It's only disabled for guests because it would require a bit
of logic to set pmcregs_in_use when we register our lppaca. We could
add that if needed, but the intention is for use on BML, not exactly
KVM specific.

I can add HV restriction to the help text. And we could rename the 
option. free_run_pmu= or something?

Thanks,
Nick

> 
> 
>> +			Enable the PMU by setting MMCR0 to 0 (clear FC bit).
>> +			This option is implemented for Book3S processors.
>> +			If a number is given, then MMCR1 is set to that number,
>> +			otherwise (e.g., 'pmu=on'), it is left 0. The perf
>> +			subsystem is disabled if this option is used.
>> +
>>   	pm_debug_messages	[SUSPEND,KNL]
>>   			Enable suspend/resume debug messages during boot up.
>>
>> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
>> index 65795cadb475..e7cef4fe17d7 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu)
>>   }
>>
>>   #ifdef CONFIG_PPC64
>> +static bool pmu_override = false;
>> +static unsigned long pmu_override_val;
>> +static void do_pmu_override(void *data)
>> +{
>> +	ppc_set_pmu_inuse(1);
>> +	if (pmu_override_val)
>> +		mtspr(SPRN_MMCR1, pmu_override_val);
>> +	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
>> +}
>> +
>>   static int __init init_ppc64_pmu(void)
>>   {
>> +	if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
>> +		printk(KERN_WARNING "perf: disabling perf due to pmu= command line option.\n");
>> +		on_each_cpu(do_pmu_override, NULL, 1);
>> +		return 0;
>> +	}
>> +
>>   	/* run through all the pmu drivers one at a time */
>>   	if (!init_power5_pmu())
>>   		return 0;
>> @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void)
>>   		return init_generic_compat_pmu();
>>   }
>>   early_initcall(init_ppc64_pmu);
>> +
>> +static int __init pmu_setup(char *str)
>> +{
>> +	unsigned long val;
>> +
>> +	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
>> +		return 0;
>> +
>> +	pmu_override = true;
>> +
>> +	if (kstrtoul(str, 0, &val))
>> +		val = 0;
>> +
>> +	pmu_override_val = val;
>> +
>> +	return 1;
>> +}
>> +__setup("pmu=", pmu_setup);
>> +
>>   #endif
> 

^ permalink raw reply

* Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
From: Nicholas Piggin @ 2021-08-06 10:32 UTC (permalink / raw)
  To: kvm-ppc, Michael Ellerman; +Cc: linuxppc-dev
In-Reply-To: <871r77ni1g.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of August 6, 2021 5:34 pm:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S
>> HV: Always save guest pmu for guest capable of nesting").
>>
>> Nested capable guests running with the earlier commit ("KVM: PPC: Book3S
>> HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU
>> in-use status of their guests, which means the parent does not need to
>> unconditionally save the PMU for nested capable guests.
>>
>> This will cause the PMU to break for nested guests when running older
>> nested hypervisor guests under a kernel with this change. It's unclear
>> there's an easy way to avoid that, so this could wait for a release or
>> so for the fix to filter into stable kernels.
> 
> I'm not sure PMU inside nested guests is getting much use, but I don't
> think we can break this quite so casually :)
> 
> Especially as the failure mode will be PMU counts that don't match
> reality, which is hard to diagnose. It took nearly a year for us to find
> the original bug.
> 
> I think we need to hold this back for a while.
> 
> We could put it under a CONFIG option, and then flip that option to off
> at some point in the future.

Okay that might be a good compromise, I'll do that.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v1 11/55] powerpc/time: add API for KVM to re-arm the host timer/decrementer
From: Nicholas Piggin @ 2021-08-06 10:30 UTC (permalink / raw)
  To: Christophe Leroy, kvm-ppc; +Cc: linuxppc-dev
In-Reply-To: <370398a9-4429-285e-4a0f-33759f39b2fc@csgroup.eu>

Excerpts from Christophe Leroy's message of August 5, 2021 5:22 pm:
> 
> 
> Le 26/07/2021 à 05:49, Nicholas Piggin a écrit :
>> Rather than have KVM look up the host timer and fiddle with the
>> irq-work internal details, have the powerpc/time.c code provide a
>> function for KVM to re-arm the Linux timer code when exiting a
>> guest.
>> 
>> This is implementation has an improvement over existing code of
>> marking a decrementer interrupt as soft-pending if a timer has
>> expired, rather than setting DEC to a -ve value, which tended to
>> cause host timers to take two interrupts (first hdec to exit the
>> guest, then the immediate dec).
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>   arch/powerpc/include/asm/time.h | 16 +++-------
>>   arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
>>   arch/powerpc/kvm/book3s_hv.c    |  7 ++---
>>   3 files changed, 49 insertions(+), 26 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
>> index 69b6be617772..924b2157882f 100644
>> --- a/arch/powerpc/include/asm/time.h
>> +++ b/arch/powerpc/include/asm/time.h
>> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
>>   extern void secondary_cpu_time_init(void);
>>   extern void __init time_init(void);
>>   
>> -#ifdef CONFIG_PPC64
>> -static inline unsigned long test_irq_work_pending(void)
>> -{
>> -	unsigned long x;
>> -
>> -	asm volatile("lbz %0,%1(13)"
>> -		: "=r" (x)
>> -		: "i" (offsetof(struct paca_struct, irq_work_pending)));
>> -	return x;
>> -}
>> -#endif
>> -
>>   DECLARE_PER_CPU(u64, decrementers_next_tb);
>>   
>>   static inline u64 timer_get_next_tb(void)
>> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
>>   	return __this_cpu_read(decrementers_next_tb);
>>   }
>>   
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +void timer_rearm_host_dec(u64 now);
>> +#endif
>> +
>>   /* Convert timebase ticks to nanoseconds */
>>   unsigned long long tb_to_ns(unsigned long long tb_ticks);
>>   
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 72d872b49167..016828b7401b 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc);
>>    * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
>>    */
>>   #ifdef CONFIG_PPC64
>> +static inline unsigned long test_irq_work_pending(void)
>> +{
>> +	unsigned long x;
>> +
>> +	asm volatile("lbz %0,%1(13)"
>> +		: "=r" (x)
>> +		: "i" (offsetof(struct paca_struct, irq_work_pending)));
> 
> Can we just use READ_ONCE() instead of hard coding the read ?

Good question, probably yes. Probably calls for its own patch series, 
e.g., hw_irq.h has all similar paca accesses.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v1 02/55] KVM: PPC: Book3S HV P9: Fixes for TM softpatch interrupt
From: Nicholas Piggin @ 2021-08-06 10:25 UTC (permalink / raw)
  To: kvm-ppc, Michael Ellerman; +Cc: linuxppc-dev
In-Reply-To: <87a6lvnzin.fsf@mpe.ellerman.id.au>

Excerpts from Michael Ellerman's message of August 6, 2021 11:16 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> The softpatch interrupt sets HSRR0 to the faulting instruction +4, so
>> it should subtract 4 for the faulting instruction address. Also have it
>> emulate and deliver HFAC interrupts correctly, which is important for
>> nested HV and facility demand-faulting in future.
> 
> The nip being off by 4 sounds bad. But I guess it's not that big a deal
> because it's only used for reporting the instruction address?

Yeah currently I think so. It's not that bad of a bug.

> 
> Would also be good to have some more explanation of why it's OK to
> change from illegal to HFAC, which is a guest visible change.

Good point. Again for now it doesn't really matter because the HFAC
handler turns everything (except msgsndp) into a sigill anyway, so
becomes important when we start using HFACs. Put that way I'll probably
split it out.

> 
>> diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
>> index cc90b8b82329..e4fd4a9dee08 100644
>> --- a/arch/powerpc/kvm/book3s_hv_tm.c
>> +++ b/arch/powerpc/kvm/book3s_hv_tm.c
>> @@ -74,19 +74,23 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
>>  	case PPC_INST_RFEBB:
>>  		if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) {
>>  			/* generate an illegal instruction interrupt */
>> +			vcpu->arch.regs.nip -= 4;
>>  			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
>>  			return RESUME_GUEST;
>>  		}
>>  		/* check EBB facility is available */
>>  		if (!(vcpu->arch.hfscr & HFSCR_EBB)) {
>> -			/* generate an illegal instruction interrupt */
>> -			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
>> -			return RESUME_GUEST;
>> +			vcpu->arch.regs.nip -= 4;
>> +			vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE;
>> +			vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56;
>> +			vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL;
>> +			return -1; /* rerun host interrupt handler */
> 
> This is EBB not TM. Probably OK to leave it in this patch as long as
> it's mentioned in the change log?

It is, but you can get a softpatch interrupt on rfebb changing TM state. 
Although I haven't actually tested to see if you get a softpatch when
HFSCR disables EBB or the hardware just gives you the HFAC. For that 
matter, same for all the other facility tests.

Thanks,
Nick

> 
>>  		}
>>  		if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) {
>>  			/* generate a facility unavailable interrupt */
>> -			vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) |
>> -				((u64)FSCR_EBB_LG << 56);
>> +			vcpu->arch.regs.nip -= 4;
>> +			vcpu->arch.fscr &= ~FSCR_INTR_CAUSE;
>> +			vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56;
> 
> Same.
> 
> 
> cheers
> 

^ permalink raw reply

* Re: [PATCH v2 2/3] KVM: PPC: Book3S HV: Add sanity check to copy_tofrom_guest
From: Nicholas Piggin @ 2021-08-06 10:10 UTC (permalink / raw)
  To: Fabiano Rosas, kvm-ppc; +Cc: christophe.leroy, linuxppc-dev
In-Reply-To: <20210805212616.2641017-3-farosas@linux.ibm.com>

Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am:
> Both paths into __kvmhv_copy_tofrom_guest_radix ensure that we arrive
> with an effective address that is smaller than our total addressable
> space and addresses quadrant 0.
> 
> - The H_COPY_TOFROM_GUEST hypercall path rejects the call with
> H_PARAMETER if the effective address has any of the twelve most
> significant bits set.
> 
> - The kvmhv_copy_tofrom_guest_radix path clears the top twelve bits
> before calling the internal function.
> 
> Although the callers make sure that the effective address is sane, any
> future use of the function is exposed to a programming error, so add a
> sanity check.

We possibly should put these into #defines in radix pgtable headers 
somewhere but KVM already open codes them so this is good for now.

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

> 
> Suggested-by: Nicholas Piggin <npiggin@gmail.com>
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 44eb7b1ef289..1b1c9e9e539b 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
>  					  (to != NULL) ? __pa(to): 0,
>  					  (from != NULL) ? __pa(from): 0, n);
>  
> +	if (eaddr & (0xFFFUL << 52))
> +		return ret;
> +
>  	quadrant = 1;
>  	if (!pid)
>  		quadrant = 2;
> -- 
> 2.29.2
> 
> 

^ permalink raw reply

* Re: [PATCH v2 1/3] KVM: PPC: Book3S HV: Fix copy_tofrom_guest routines
From: Nicholas Piggin @ 2021-08-06 10:09 UTC (permalink / raw)
  To: Fabiano Rosas, kvm-ppc; +Cc: christophe.leroy, linuxppc-dev
In-Reply-To: <20210805212616.2641017-2-farosas@linux.ibm.com>

Excerpts from Fabiano Rosas's message of August 6, 2021 7:26 am:
> The __kvmhv_copy_tofrom_guest_radix function was introduced along with
> nested HV guest support. It uses the platform's Radix MMU quadrants to
> provide a nested hypervisor with fast access to its nested guests
> memory (H_COPY_TOFROM_GUEST hypercall). It has also since been added
> as a fast path for the kvmppc_ld/st routines which are used during
> instruction emulation.
> 
> The commit def0bfdbd603 ("powerpc: use probe_user_read() and
> probe_user_write()") changed the low level copy function from
> raw_copy_from_user to probe_user_read, which adds a check to
> access_ok. In powerpc that is:
> 
>  static inline bool __access_ok(unsigned long addr, unsigned long size)
>  {
>          return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr;
>  }
> 
> and TASK_SIZE_MAX is 0x0010000000000000UL for 64-bit, which means that
> setting the two MSBs of the effective address (which correspond to the
> quadrant) now cause access_ok to reject the access.
> 
> This was not caught earlier because the most common code path via
> kvmppc_ld/st contains a fallback (kvm_read_guest) that is likely to
> succeed for L1 guests. For nested guests there is no fallback.
> 
> Another issue is that probe_user_read (now __copy_from_user_nofault)
> does not return the number of bytes not copied in case of failure, so
> the destination memory is not being cleared anymore in
> kvmhv_copy_from_guest_radix:
> 
>  ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
>  if (ret > 0)                            <-- always false!
>          memset(to + (n - ret), 0, ret);
> 
> This patch fixes both issues by skipping access_ok and open-coding the
> low level __copy_to/from_user_inatomic.
> 
> Fixes: def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()")

Reviewed-by: Nicholas Piggin <npiggin@gmail.com>

> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index b5905ae4377c..44eb7b1ef289 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -65,10 +65,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
>  	}
>  	isync();
>  
> +	pagefault_disable();
>  	if (is_load)
> -		ret = copy_from_user_nofault(to, (const void __user *)from, n);
> +		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
>  	else
> -		ret = copy_to_user_nofault((void __user *)to, from, n);
> +		ret = __copy_to_user_inatomic((void __user *)to, from, n);
> +	pagefault_enable();
>  
>  	/* switch the pid first to avoid running host with unallocated pid */
>  	if (quadrant == 1 && pid != old_pid)
> -- 
> 2.29.2
> 
> 

^ permalink raw reply

* Re: Debian SID kernel doesn't boot on PowerBook 3400c
From: Christophe Leroy @ 2021-08-06  9:58 UTC (permalink / raw)
  To: Finn Thain; +Cc: debian-powerpc, linuxppc-dev, Stan Johnson, Nicholas Piggin
In-Reply-To: <c031a1e7-fde7-7c39-d9ff-404157cfc0df@linux-m68k.org>



Le 06/08/2021 à 11:43, Finn Thain a écrit :
> On Fri, 6 Aug 2021, Christophe Leroy wrote:
> 
>>>>>>
>>>>>> Can you check if they DO NOT happen at preceding commit c16728835~
>>>>>>
>>>>
>>>> $ git checkout c16728835~
>>>> Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C
>>>> HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap
>>>> save/restore/check helpers
>>>> $ git am ../message.mbox
>>>> warning: Patch sent with format=flowed; space at the end of lines might be
>>>> lost.
>>>> Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
>>>> $ cp ../dot-config-powermac-5.13 .config
>>>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean
>>>> olddefconfig vmlinux
>>>>
>>>> Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045
>>>>
>>>> 3) PB 3400c
>>>> Hangs at boot (Mac OS screen)
>>>>
>>>> 4) Wallstreet
>>>> X fails, errors in console log (different than test 2), see
>>>> Wallstreet_console-2.txt.
>>>>
>>>
>>> This log shows that the errors "xfce4-session[1775]: bus error (7)" and
>>> "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit
>>> c16728835eec ("powerpc/32: Manage KUAP in C").
>>
>> As mentionned by Nic, this is due to r11 being cloberred. For the time being
>> the only r11 clobber identified is the one I have provided a fix for. I'm
>> wondering whether it was applied for all further tests or not.
>>
> 
> Your fix was applied to this build with "git am ../message.mbox".

Ok good.

> 
>> ...
>>>>
>>>>>
>>>>>> Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG
>>>> ...
>>>>
>>>> $scripts/config -e CONFIG_PPC_KUAP
>>>> $ scripts/config -e CONFIG_PPC_KUAP_DEBUG
>>>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean
>>>> olddefconfig vmlinux
>>>> $ grep CONFIG_PPC_KUAP .config
>>>> CONFIG_PPC_KUAP=y
>>>> CONFIG_PPC_KUAP_DEBUG=y
>>>>
>>>> Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752
>>>>
>>>> 9) PB 3400c
>>>> Hangs at boot (Mac OS screen)
>>>>
>>>> 10) Wallstreet
>>>> X failed at first login, worked at second login, one error in console
>>>> log ("BUG: Unable to handle kernel instruction fetch"), see
>>>> Wallstreet_console-5.txt.
>>>>
>>>
>>> One might expect to see "Kernel attempted to write user page (b3399774) -
>>> exploit attempt?" again here (see c16728835eec build above) but instead
>>> this log says "Oops: Kernel access of bad area, sig: 11".
>>
>> Maybe the test should be done a second time. As r11 is garbage it may or
>> may not be a user address. If it is a user address the we get "Kernel
>> attempted to write user page". If it is a random kernel address, we
>> likely get "Kernel access of bad area" instead.
>>
> 
> Your fix was applied here also.
> 

Anyway, it would be worth trying to boot a few times more with the same kernel, because as I said 
the value is random, so it may or may not hit userspace, hence the possible difference of message, 
either "Kernel attempted to write user page" or "Kernel access of bad area" depending on whether the 
address is a user address or not.

I have cooked a tentative fix for that KUAP stuff.
Could you try the branch 'bugtest' at https://github.com/chleroy/linux.git

Thanks
Christophe

^ permalink raw reply

* Re: Debian SID kernel doesn't boot on PowerBook 3400c
From: Finn Thain @ 2021-08-06  9:43 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: debian-powerpc, linuxppc-dev, Stan Johnson, Nicholas Piggin
In-Reply-To: <ca0ded24-9fa0-fae4-89cf-20fc1959f69d@csgroup.eu>

On Fri, 6 Aug 2021, Christophe Leroy wrote:

> > > > > 
> > > > > Can you check if they DO NOT happen at preceding commit c16728835~
> > > > > 
> > > 
> > > $ git checkout c16728835~
> > > Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C
> > > HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap
> > > save/restore/check helpers
> > > $ git am ../message.mbox
> > > warning: Patch sent with format=flowed; space at the end of lines might be
> > > lost.
> > > Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
> > > $ cp ../dot-config-powermac-5.13 .config
> > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean
> > > olddefconfig vmlinux
> > > 
> > > Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045
> > > 
> > > 3) PB 3400c
> > > Hangs at boot (Mac OS screen)
> > > 
> > > 4) Wallstreet
> > > X fails, errors in console log (different than test 2), see
> > > Wallstreet_console-2.txt.
> > > 
> > 
> > This log shows that the errors "xfce4-session[1775]: bus error (7)" and
> > "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit
> > c16728835eec ("powerpc/32: Manage KUAP in C").
> 
> As mentionned by Nic, this is due to r11 being cloberred. For the time being
> the only r11 clobber identified is the one I have provided a fix for. I'm
> wondering whether it was applied for all further tests or not.
> 

Your fix was applied to this build with "git am ../message.mbox".

> ...
> > > 
> > > > 
> > > > > Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG
> > > ...
> > > 
> > > $scripts/config -e CONFIG_PPC_KUAP
> > > $ scripts/config -e CONFIG_PPC_KUAP_DEBUG
> > > $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean
> > > olddefconfig vmlinux
> > > $ grep CONFIG_PPC_KUAP .config
> > > CONFIG_PPC_KUAP=y
> > > CONFIG_PPC_KUAP_DEBUG=y
> > > 
> > > Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752
> > > 
> > > 9) PB 3400c
> > > Hangs at boot (Mac OS screen)
> > > 
> > > 10) Wallstreet
> > > X failed at first login, worked at second login, one error in console
> > > log ("BUG: Unable to handle kernel instruction fetch"), see
> > > Wallstreet_console-5.txt.
> > > 
> > 
> > One might expect to see "Kernel attempted to write user page (b3399774) -
> > exploit attempt?" again here (see c16728835eec build above) but instead
> > this log says "Oops: Kernel access of bad area, sig: 11".
> 
> Maybe the test should be done a second time. As r11 is garbage it may or 
> may not be a user address. If it is a user address the we get "Kernel 
> attempted to write user page". If it is a random kernel address, we 
> likely get "Kernel access of bad area" instead.
> 

Your fix was applied here also.

^ permalink raw reply

* Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
From: Athira Rajeev @ 2021-08-06  9:28 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20210726035036.739609-17-npiggin@gmail.com>



> On 26-Jul-2021, at 9:19 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> It can be useful in simulators (with very constrained environments)
> to allow some PMCs to run from boot so they can be sampled directly
> by a test harness, rather than having to run perf.
> 
> A previous change freezes counters at boot by default, so provide
> a boot time option to un-freeze (plus a bit more flexibility).
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> .../admin-guide/kernel-parameters.txt         |  7 ++++
> arch/powerpc/perf/core-book3s.c               | 35 +++++++++++++++++++
> 2 files changed, 42 insertions(+)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bdb22006f713..96b7d0ebaa40 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4089,6 +4089,13 @@
> 			Override pmtimer IOPort with a hex value.
> 			e.g. pmtmr=0x508
> 
> +	pmu=		[PPC] Manually enable the PMU.
> +			Enable the PMU by setting MMCR0 to 0 (clear FC bit).
> +			This option is implemented for Book3S processors.
> +			If a number is given, then MMCR1 is set to that number,
> +			otherwise (e.g., 'pmu=on'), it is left 0. The perf
> +			subsystem is disabled if this option is used.
> +
> 	pm_debug_messages	[SUSPEND,KNL]
> 			Enable suspend/resume debug messages during boot up.
> 
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 65795cadb475..e7cef4fe17d7 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu)
> }
> 
> #ifdef CONFIG_PPC64
> +static bool pmu_override = false;
> +static unsigned long pmu_override_val;
> +static void do_pmu_override(void *data)
> +{
> +	ppc_set_pmu_inuse(1);
> +	if (pmu_override_val)
> +		mtspr(SPRN_MMCR1, pmu_override_val);
> +	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);

Hi Nick

Here, we are not doing any validity check for the value used to set MMCR1. 
For advanced users, the option to pass value for MMCR1 is fine. But other cases, it could result in
invalid event getting used. Do we need to restrict this boot time option for only PMC5/6 ?
 
Thanks
Athira

> +}
> +
> static int __init init_ppc64_pmu(void)
> {
> +	if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
> +		printk(KERN_WARNING "perf: disabling perf due to pmu= command line option.\n");
> +		on_each_cpu(do_pmu_override, NULL, 1);
> +		return 0;
> +	}
> +
> 	/* run through all the pmu drivers one at a time */
> 	if (!init_power5_pmu())
> 		return 0;
> @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void)
> 		return init_generic_compat_pmu();
> }
> early_initcall(init_ppc64_pmu);
> +
> +static int __init pmu_setup(char *str)
> +{
> +	unsigned long val;
> +
> +	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
> +		return 0;
> +
> +	pmu_override = true;
> +
> +	if (kstrtoul(str, 0, &val))
> +		val = 0;
> +
> +	pmu_override_val = val;
> +
> +	return 1;
> +}
> +__setup("pmu=", pmu_setup);
> +
> #endif
> -- 
> 2.23.0
> 


^ permalink raw reply

* Re: [PATCH] powerpc/kprobes: Fix kprobe Oops happens in booke
From: Pu Lehui @ 2021-08-06  8:59 UTC (permalink / raw)
  To: Christophe Leroy, oleg, mpe, benh, paulus, naveen.n.rao, mhiramat,
	peterz, npiggin, ruscur
  Cc: zhangjinhao2, xukuohai, linuxppc-dev, linux-kernel
In-Reply-To: <021cf081-77a9-8e4e-a246-4faaf3937dbe@csgroup.eu>



On 2021/8/5 17:51, Christophe Leroy wrote:
> 
> 
> Le 04/08/2021 à 16:37, Pu Lehui a écrit :
>> When using kprobe on powerpc booke series processor, Oops happens
>> as show bellow:
>>
>> [   35.861352] Oops: Exception in kernel mode, sig: 5 [#1]
>> [   35.861676] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
>> [   35.861905] Modules linked in:
>> [   35.862144] CPU: 0 PID: 76 Comm: sh Not tainted 
>> 5.14.0-rc3-00060-g7e96bf476270 #18
>> [   35.862610] NIP:  c0b96470 LR: c00107b4 CTR: c0161c80
>> [   35.862805] REGS: c387fe70 TRAP: 0700   Not tainted 
>> (5.14.0-rc3-00060-g7e96bf476270)
>> [   35.863198] MSR:  00029002 <CE,EE,ME>  CR: 24022824  XER: 20000000
>> [   35.863577]
>> [   35.863577] GPR00: c0015218 c387ff20 c313e300 c387ff50 00000004 
>> 40000002 40000000 0a1a2cce
>> [   35.863577] GPR08: 00000000 00000004 00000000 59764000 24022422 
>> 102490c2 00000000 00000000
>> [   35.863577] GPR16: 00000000 00000000 00000040 10240000 10240000 
>> 10240000 10240000 10220000
>> [   35.863577] GPR24: ffffffff 10240000 00000000 00000000 bfc655e8 
>> 00000800 c387ff50 00000000
>> [   35.865367] NIP [c0b96470] schedule+0x0/0x130
>> [   35.865606] LR [c00107b4] interrupt_exit_user_prepare_main+0xf4/0x100
>> [   35.865974] Call Trace:
>> [   35.866142] [c387ff20] [c0053224] irq_exit+0x114/0x120 (unreliable)
>> [   35.866472] [c387ff40] [c0015218] interrupt_return+0x14/0x13c
>> [   35.866728] --- interrupt: 900 at 0x100af3dc
>> [   35.866963] NIP:  100af3dc LR: 100de020 CTR: 00000000
>> [   35.867177] REGS: c387ff50 TRAP: 0900   Not tainted 
>> (5.14.0-rc3-00060-g7e96bf476270)
>> [   35.867488] MSR:  0002f902 <CE,EE,PR,FP,ME>  CR: 20022422  XER: 
>> 20000000
>> [   35.867808]
>> [   35.867808] GPR00: c001509c bfc65570 1024b4d0 00000000 100de020 
>> 20022422 bfc655a8 100af3dc
>> [   35.867808] GPR08: 0002f902 00000000 00000000 00000000 72656773 
>> 102490c2 00000000 00000000
>> [   35.867808] GPR16: 00000000 00000000 00000040 10240000 10240000 
>> 10240000 10240000 10220000
>> [   35.867808] GPR24: ffffffff 10240000 00000000 00000000 bfc655e8 
>> 10245910 ffffffff 00000001
>> [   35.869406] NIP [100af3dc] 0x100af3dc
>> [   35.869578] LR [100de020] 0x100de020
>> [   35.869751] --- interrupt: 900
>> [   35.870001] Instruction dump:
>> [   35.870283] 40c20010 815e0518 714a0100 41e2fd04 39200000 913e00c0 
>> 3b1e0450 4bfffd80
>> [   35.870666] 0fe00000 92a10024 4bfff1a9 60000000 <7fe00008> 7c0802a6 
>> 93e1001c 7c5f1378
>> [   35.871339] ---[ end trace 23ff848139efa9b9 ]---
>>
>> There is no real mode for booke arch and the MMU translation is
>> always on. The corresponding MSR_IS/MSR_DS bit in booke is used
>> to switch the address space, but not for real mode judgment.
> 
> Can you explain more the link between that explanation and the Oops 
> itself ?
> 
In fact, the same Oops appears when any probed function is hit, like 
do_nanosleep

/ # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events
/ # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable
/ # sleep 1
[   50.076730] Oops: Exception in kernel mode, sig: 5 [#1]
[   50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500
[   50.077221] Modules linked in:
[   50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 
5.14.0-rc4-00022-g251a1524293d #21
[   50.077887] NIP:  c0b9c4e0 LR: c00ebecc CTR: 00000000
[   50.078067] REGS: c3883de0 TRAP: 0700   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.078349] MSR:  00029000 <CE,EE,ME>  CR: 24000228  XER: 20000000
[   50.078675]
[   50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 00000001 
00000000 c3883ecc 00000001
[   50.078675] GPR08: c100598c c00ea250 00000004 00000000 24000222 
102490c2 bff4180c 101e60d4
[   50.078675] GPR16: 00000000 102454ac 00000040 10240000 10241100 
102410f8 10240000 00500000
[   50.078675] GPR24: 00000002 00000000 c3883ea0 00000001 00000000 
0000c350 3b9b8d50 00000000
[   50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190
[   50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0
[   50.080638] Call Trace:
[   50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 
(unreliable)
[   50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110
[   50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28
[   50.081541] --- interrupt: c00 at 0x100a4d08
[   50.081749] NIP:  100a4d08 LR: 101b5234 CTR: 00000003
[   50.081931] REGS: c3883f50 TRAP: 0c00   Not tainted 
(5.14.0-rc4-00022-g251a1524293d)
[   50.082183] MSR:  0002f902 <CE,EE,PR,FP,ME>  CR: 24000222  XER: 00000000
[   50.082457]
[   50.082457] GPR00: 000000a2 bf980040 1024b4d0 bf980084 bf980084 
64000000 00555345 fefefeff
[   50.082457] GPR08: 7f7f7f7f 101e0000 00000069 00000003 28000422 
102490c2 bff4180c 101e60d4
[   50.082457] GPR16: 00000000 102454ac 00000040 10240000 10241100 
102410f8 10240000 00500000
[   50.082457] GPR24: 00000002 bf9803f4 10240000 00000000 00000000 
100039e0 00000000 102444e8
[   50.083789] NIP [100a4d08] 0x100a4d08
[   50.083917] LR [101b5234] 0x101b5234
[   50.084042] --- interrupt: c00
[   50.084238] Instruction dump:
[   50.084483] 4bfffc40 60000000 60000000 60000000 9421fff0 39400402 
914200c0 38210010
[   50.084841] 4bfffc20 00000000 00000000 00000000 <7fe00008> 7c0802a6 
7c892378 93c10048
[   50.085487] ---[ end trace f6fffe98e2fa8f3e ]---
[   50.085678]
Trace/breakpoint trap

In current code, kprobe_handler() will be called by 'program check 
exception' when a probe is hit,
and kprobe_handler() will check whether it is in real-mode according to 
MSR_IR/MSR_DR bit.
When in real-mode(MSR_IR/MSR_DR are 0), the following process will be 
executed to trigger Oops:
__exception() -> exception_common() -> die("Exception in kernel mode")

But for booke arch, the corresponding bits, which is called 
MSR_IS/MSR_DS, are used for switching
the non-privileged and privileged virtual address spaces, and users can 
change both spaces by
setting MSR_IS/MSR_DS bit. So, when kernel in privileged 
space(MSR_IS/MSR_DS are 0), kprobe trap will
meet a Oops.

And also, the MMU of booke is always enabled, so when other trap 
appears, the problem mentioned in
21f8b2fa3ca5 will never met.
>>
>> Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in 
>> real mode")
>> Signed-off-by: Pu Lehui <pulehui@huawei.com>
>> ---
>>   arch/powerpc/include/asm/ptrace.h | 6 ++++++
>>   arch/powerpc/kernel/kprobes.c     | 5 +----
>>   2 files changed, 7 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/ptrace.h 
>> b/arch/powerpc/include/asm/ptrace.h
>> index 3e5d470a6155..4aec1a97024b 100644
>> --- a/arch/powerpc/include/asm/ptrace.h
>> +++ b/arch/powerpc/include/asm/ptrace.h
>> @@ -187,6 +187,12 @@ static inline unsigned long frame_pointer(struct 
>> pt_regs *regs)
>>   #define user_mode(regs) (((regs)->msr & MSR_PR) != 0)
>>   #endif
>> +#ifdef CONFIG_BOOKE
>> +#define real_mode(regs)    0
>> +#else
>> +#define real_mode(regs)    (!((regs)->msr & MSR_IR) || !((regs)->msr 
>> & MSR_DR))
>> +#endif
>> +
> 
> You don't need an #ifdef stuff here, you can base your testing on 
> IS_ENABLED(CONFIG_BOOKE)
>
I'll fix in v2.
>>   #define force_successful_syscall_return()   \
>>       do { \
>>           set_thread_flag(TIF_NOERROR); \
>> diff --git a/arch/powerpc/kernel/kprobes.c 
>> b/arch/powerpc/kernel/kprobes.c
>> index cbc28d1a2e1b..fac9a5974718 100644
>> --- a/arch/powerpc/kernel/kprobes.c
>> +++ b/arch/powerpc/kernel/kprobes.c
>> @@ -289,10 +289,7 @@ int kprobe_handler(struct pt_regs *regs)
>>       unsigned int *addr = (unsigned int *)regs->nip;
>>       struct kprobe_ctlblk *kcb;
>> -    if (user_mode(regs))
>> -        return 0;
>> -
>> -    if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))
>> +    if (user_mode(regs) || real_mode(regs))
>>           return 0;
>>       /*
>>
> .

^ permalink raw reply

* Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
From: Puvichakravarthy Ramachandran @ 2021-08-06  7:56 UTC (permalink / raw)
  To: aneesh.kumar; +Cc: linuxppc-dev, npiggin

> With shared mapping, even though we are unmapping a large range, the 
kernel
> will force a TLB flush with ptl lock held to avoid the race mentioned in
> commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and 
memory freeing parts")
> This results in the kernel issuing a high number of TLB flushes even for 
a large
> range. This can be improved by making sure the kernel switch to pid 
based flush if the
> kernel is unmapping a 2M range.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
> index aefc100d79a7..21d0f098e43b 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>   * invalidating a full PID, so it has a far lower threshold to change 
from
>   * individual page flushes to full-pid flushes.
>   */
> -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
> +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32;
>  static unsigned long tlb_local_single_page_flush_ceiling __read_mostly 
= POWER9_TLB_SETS_RADIX * 2;
> 
>  static inline void __radix__flush_tlb_range(struct mm_struct *mm,
> @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>       if (fullmm)
>               flush_pid = true;
>       else if (type == FLUSH_TYPE_GLOBAL)
> -             flush_pid = nr_pages > tlb_single_page_flush_ceiling;
> +             flush_pid = nr_pages >= tlb_single_page_flush_ceiling;
>       else
>               flush_pid = nr_pages > 
tlb_local_single_page_flush_ceiling;

Additional details on the test environment. This was tested on a 2 Node/8 
socket Power10 system.
The LPAR had 105 cores and the LPAR spanned across all the sockets. 

# perf stat -I 1000 -a -e cycles,instructions -e 
"{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e 
"{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1  -t 1
 Rate of work: = 176 
#           time             counts unit events
     1.029206442         4198594519      cycles  
     1.029206442         2458254252      instructions              # 0.59 
insn per cycle 
     1.029206442         3004031488      PM_EXEC_STALL   
     1.029206442         1798186036      PM_EXEC_STALL_TLBIE   
 Rate of work: = 181 
     2.054288539         4183883450      cycles  
     2.054288539         2472178171      instructions              # 0.59 
insn per cycle 
     2.054288539         3014609313      PM_EXEC_STALL   
     2.054288539         1797851642      PM_EXEC_STALL_TLBIE   
 Rate of work: = 180 
     3.078306883         4171250717      cycles  
     3.078306883         2468341094      instructions              # 0.59 
insn per cycle 
     3.078306883         2993036205      PM_EXEC_STALL   
     3.078306883         1798181890      PM_EXEC_STALL_TLBIE   
.
. 

# cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling
34

# echo 32 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling

# perf stat -I 1000 -a -e cycles,instructions -e 
"{cpu/config=0x030008,name=PM_EXEC_STALL/}" -e 
"{cpu/config=0x02E01C,name=PM_EXEC_STALL_TLBIE/}" ./tlbie -i 10 -c 1  -t 1
 Rate of work: = 313 
#           time             counts unit events
     1.030310506         4206071143      cycles  
     1.030310506         4314716958      instructions              # 1.03 
insn per cycle 
     1.030310506         2157762167      PM_EXEC_STALL   
     1.030310506          110825573      PM_EXEC_STALL_TLBIE   
 Rate of work: = 322 
     2.056034068         4331745630      cycles  
     2.056034068         4531658304      instructions              # 1.05 
insn per cycle 
     2.056034068         2288971361      PM_EXEC_STALL   
     2.056034068          111267927      PM_EXEC_STALL_TLBIE   
 Rate of work: = 321 
     3.081216434         4327050349      cycles  
     3.081216434         4379679508      instructions              # 1.01 
insn per cycle 
     3.081216434         2252602550      PM_EXEC_STALL   
     3.081216434          110974887      PM_EXEC_STALL_TLBIE   
.
.
 

Regards,
Puvichakravarthy Ramachandran





^ permalink raw reply

* Re: [PATCH v2 0/6] PCI: Drop duplicated tracking of a pci_dev's bound driver
From: Uwe Kleine-König @ 2021-08-06  6:46 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Mark Rutland, Giovanni Cabiddu, Rafał Miłecki,
	Peter Zijlstra, linux-pci, Alexander Duyck, x86, oss-drivers,
	netdev, Paul Mackerras, H. Peter Anvin, Jiri Olsa,
	Thomas Gleixner, Taras Chornyi, Stefano Stabellini, Herbert Xu,
	linux-scsi, Sathya Prakash, qat-linux, Alexander Shishkin,
	Ingo Molnar, Jakub Kicinski, Yisen Zhuang,
	Suganath Prabu Subramani, Fiona Trahe, Oliver O'Halloran,
	Andrew Donnellan, Mathias Nyman, Konrad Rzeszutek Wilk,
	Ido Schimmel, Arnaldo Carvalho de Melo, Frederic Barrat,
	Borislav Petkov, Michael Buesch, Jiri Pirko, Bjorn Helgaas,
	Namhyung Kim, Boris Ostrovsky, Andy Shevchenko, Juergen Gross,
	Salil Mehta, Sreekanth Reddy, xen-devel, Vadym Kochan,
	MPT-FusionLinux.pdl, linux-usb, linux-wireless, linux-kernel,
	linux-perf-users, Zhou Wang, Arnd Bergmann, linux-crypto, kernel,
	Greg Kroah-Hartman, Simon Horman, Wojciech Ziemba, linuxppc-dev,
	David S. Miller
In-Reply-To: <20210805234234.GA1797883@bjorn-Precision-5520>

[-- Attachment #1: Type: text/plain, Size: 3697 bytes --]

Hello Bjorn,

On Thu, Aug 05, 2021 at 06:42:34PM -0500, Bjorn Helgaas wrote:
> On Tue, Aug 03, 2021 at 12:01:44PM +0200, Uwe Kleine-König wrote:
> > Hello,
> > 
> > changes since v1 (https://lore.kernel.org/linux-pci/20210729203740.1377045-1-u.kleine-koenig@pengutronix.de):
> > 
> > - New patch to simplify drivers/pci/xen-pcifront.c, spotted and
> >   suggested by Boris Ostrovsky
> > - Fix a possible NULL pointer dereference I introduced in xen-pcifront.c
> > - A few whitespace improvements
> > - Add a commit log to patch #6 (formerly #5)
> > 
> > I also expanded the audience for patches #4 and #6 to allow affected
> > people to actually see the changes to their drivers.
> > 
> > Interdiff can be found below.
> > 
> > The idea is still the same: After a few cleanups (#1 - #3) a new macro
> > is introduced abstracting access to struct pci_dev->driver. All users
> > are then converted to use this and in the last patch the macro is
> > changed to make use of struct pci_dev::dev->driver to get rid of the
> > duplicated tracking.
> 
> I love the idea of this series!

\o/

> I looked at all the bus_type.probe() methods, it looks like pci_dev is
> not the only offender here.  At least the following also have a driver
> pointer in the device struct:
> 
>   parisc_device.driver
>   acpi_device.driver
>   dio_dev.driver
>   hid_device.driver
>   pci_dev.driver
>   pnp_dev.driver
>   rio_dev.driver
>   zorro_dev.driver

Right, when I converted zorro_dev it was pointed out that the code was
copied from pci and the latter has the same construct. :-)
See
https://lore.kernel.org/r/20210730191035.1455248-5-u.kleine-koenig@pengutronix.de
for the patch, I don't find where pci was pointed out, maybe it was on
irc only.

> Do you plan to do the same for all of them, or is there some reason
> why they need the pointer and PCI doesn't?

There is a list of cleanup stuff I intend to work on. Considering how
working on that list only made it longer in the recent past, maybe it
makes more sense to not work on it :-)

> In almost all cases, other buses define a "to_<bus>_driver()"
> interface.  In fact, PCI already has a to_pci_driver().
> 
> This series adds pci_driver_of_dev(), which basically just means we
> can do this:
> 
>   pdrv = pci_driver_of_dev(pdev);
> 
> instead of this:
> 
>   pdrv = to_pci_driver(pdev->dev.driver);
> 
> I don't see any other "<bus>_driver_of_dev()" interfaces, so I assume
> other buses just live with the latter style?  I'd rather not be
> different and have two ways to get the "struct pci_driver *" unless
> there's a good reason.

Among few the busses I already fixed in this regard pci was the first
that has a considerable amount of usage. So I considered it worth giving
it a name.
 
> Looking through the places that care about pci_dev.driver (the ones
> updated by patch 5/6), many of them are ... a little dubious to begin
> with.  A few need the "struct pci_error_handlers *err_handler"
> pointer, so that's probably legitimate.  But many just need a name,
> and should probably be using dev_driver_string() instead.

Yeah, I considered adding a function to get the driver name from a
pci_dev and a function to get the error handlers. Maybe it's an idea to
introduce these two and then use to_pci_driver(pdev->dev.driver) for the
few remaining users? Maybe doing that on top of my current series makes
sense to have a clean switch from pdev->driver to pdev->dev.driver?!

Best regards
Uwe

-- 
Pengutronix e.K.                           | Uwe Kleine-König            |
Industrial Linux Solutions                 | https://www.pengutronix.de/ |

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [RFC PATCH] powerpc/book3s64/radix: Upgrade va tlbie to PID tlbie if we cross PMD_SIZE
From: Puvichakravarthy Ramachandran @ 2021-08-06  5:22 UTC (permalink / raw)
  To: aneesh.kumar; +Cc: linuxppc-dev, npiggin

> With shared mapping, even though we are unmapping a large range, the 
kernel
> will force a TLB flush with ptl lock held to avoid the race mentioned in
> commit 1cf35d47712d ("mm: split 'tlb_flush_mmu()' into tlb flushing and 
memory freeing parts")
> This results in the kernel issuing a high number of TLB flushes even for 
a large
> range. This can be improved by making sure the kernel switch to pid 
based flush if the
> kernel is unmapping a 2M range.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
> index aefc100d79a7..21d0f098e43b 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1106,7 +1106,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
>   * invalidating a full PID, so it has a far lower threshold to change 
from
>   * individual page flushes to full-pid flushes.
>   */
> -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
> +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 32;
>  static unsigned long tlb_local_single_page_flush_ceiling __read_mostly 
= POWER9_TLB_SETS_RADIX * 2;
> 
>  static inline void __radix__flush_tlb_range(struct mm_struct *mm,
> @@ -1133,7 +1133,7 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
>       if (fullmm)
>               flush_pid = true;
>       else if (type == FLUSH_TYPE_GLOBAL)
> -             flush_pid = nr_pages > tlb_single_page_flush_ceiling;
> +             flush_pid = nr_pages >= tlb_single_page_flush_ceiling;
>       else
>               flush_pid = nr_pages > 
tlb_local_single_page_flush_ceiling;

I evaluated the patches from Aneesh with a micro benchmark which does 
shmat, shmdt of 256 MB segment.
Higher the rate of work, better the performance. With a value of 32, we 
match the performance of 
GTSE=off. This was evaluated on SLES15 SP3 kernel.


# cat /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 
32

# perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1
 Rate of work: = 311 
#           time             counts unit events
     1.013131404              50939      powerpc:tlbie   
     1.013131404              50658      r30058  
 Rate of work: = 318 
     2.026957019              51520      powerpc:tlbie   
     2.026957019              51481      r30058  
 Rate of work: = 318 
     3.038884431              51485      powerpc:tlbie   
     3.038884431              51461      r30058  
 Rate of work: = 318 
     4.051483926              51485      powerpc:tlbie   
     4.051483926              51520      r30058  
 Rate of work: = 318 
     5.063635713              48577      powerpc:tlbie   
     5.063635713              48347      r30058  
 
# echo 34 > /sys/kernel/debug/powerpc/tlb_single_page_flush_ceiling 

# perf stat -I 1000 -a -e powerpc:tlbie,r30058 ./tlbie -i 5 -c 1 t 1
 Rate of work: = 174 
#           time             counts unit events
     1.012672696             721471      powerpc:tlbie   
     1.012672696             726491      r30058  
 Rate of work: = 177 
     2.026348661             737460      powerpc:tlbie   
     2.026348661             736138      r30058  
 Rate of work: = 178 
     3.037932122             737460      powerpc:tlbie   
     3.037932122             737460      r30058  
 Rate of work: = 178 
     4.050198819             737044      powerpc:tlbie   
     4.050198819             737460      r30058  
 Rate of work: = 177 
     5.062400776             692832      powerpc:tlbie   
     5.062400776             688319      r30058          


Regards,
Puvichakravarthy Ramachandran




^ permalink raw reply

* Re: [PATCH v1 14/55] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting
From: Michael Ellerman @ 2021-08-06  7:34 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210726035036.739609-15-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:
> Revert the workaround added by commit 63279eeb7f93a ("KVM: PPC: Book3S
> HV: Always save guest pmu for guest capable of nesting").
>
> Nested capable guests running with the earlier commit ("KVM: PPC: Book3S
> HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU
> in-use status of their guests, which means the parent does not need to
> unconditionally save the PMU for nested capable guests.
>
> This will cause the PMU to break for nested guests when running older
> nested hypervisor guests under a kernel with this change. It's unclear
> there's an easy way to avoid that, so this could wait for a release or
> so for the fix to filter into stable kernels.

I'm not sure PMU inside nested guests is getting much use, but I don't
think we can break this quite so casually :)

Especially as the failure mode will be PMU counts that don't match
reality, which is hard to diagnose. It took nearly a year for us to find
the original bug.

I think we need to hold this back for a while.

We could put it under a CONFIG option, and then flip that option to off
at some point in the future.

cheers

> index e7f8cc04944b..ab89db561c85 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4003,8 +4003,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>  		vcpu->arch.vpa.dirty = 1;
>  		save_pmu = lp->pmcregs_in_use;
>  	}
> -	/* Must save pmu if this guest is capable of running nested guests */
> -	save_pmu |= nesting_enabled(vcpu->kvm);
>  
>  	kvmhv_save_guest_pmu(vcpu, save_pmu);
>  #ifdef CONFIG_PPC_PSERIES
> -- 
> 2.23.0

^ permalink raw reply

* Re: [PATCH v1 16/55] powerpc/64s: Implement PMU override command line option
From: Madhavan Srinivasan @ 2021-08-06  7:33 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev
In-Reply-To: <20210726035036.739609-17-npiggin@gmail.com>


On 7/26/21 9:19 AM, Nicholas Piggin wrote:
> It can be useful in simulators (with very constrained environments)
> to allow some PMCs to run from boot so they can be sampled directly
> by a test harness, rather than having to run perf.
>
> A previous change freezes counters at boot by default, so provide
> a boot time option to un-freeze (plus a bit more flexibility).
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   .../admin-guide/kernel-parameters.txt         |  7 ++++
>   arch/powerpc/perf/core-book3s.c               | 35 +++++++++++++++++++
>   2 files changed, 42 insertions(+)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index bdb22006f713..96b7d0ebaa40 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -4089,6 +4089,13 @@
>   			Override pmtimer IOPort with a hex value.
>   			e.g. pmtmr=0x508
>
> +	pmu=		[PPC] Manually enable the PMU.


This is bit confusing, IIUC, we are manually disabling the perf 
registration
with this option and not pmu. If this option is used, we will unfreeze the
MMCR0_FC (only in the HV_mode) and not register perf subsystem.
Since this option is valid only for HV_mode, canwe call it
kvm_disable_perf or kvm_dis_perf.


> +			Enable the PMU by setting MMCR0 to 0 (clear FC bit).
> +			This option is implemented for Book3S processors.
> +			If a number is given, then MMCR1 is set to that number,
> +			otherwise (e.g., 'pmu=on'), it is left 0. The perf
> +			subsystem is disabled if this option is used.
> +
>   	pm_debug_messages	[SUSPEND,KNL]
>   			Enable suspend/resume debug messages during boot up.
>
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 65795cadb475..e7cef4fe17d7 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -2428,8 +2428,24 @@ int register_power_pmu(struct power_pmu *pmu)
>   }
>
>   #ifdef CONFIG_PPC64
> +static bool pmu_override = false;
> +static unsigned long pmu_override_val;
> +static void do_pmu_override(void *data)
> +{
> +	ppc_set_pmu_inuse(1);
> +	if (pmu_override_val)
> +		mtspr(SPRN_MMCR1, pmu_override_val);
> +	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
> +}
> +
>   static int __init init_ppc64_pmu(void)
>   {
> +	if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
> +		printk(KERN_WARNING "perf: disabling perf due to pmu= command line option.\n");
> +		on_each_cpu(do_pmu_override, NULL, 1);
> +		return 0;
> +	}
> +
>   	/* run through all the pmu drivers one at a time */
>   	if (!init_power5_pmu())
>   		return 0;
> @@ -2451,4 +2467,23 @@ static int __init init_ppc64_pmu(void)
>   		return init_generic_compat_pmu();
>   }
>   early_initcall(init_ppc64_pmu);
> +
> +static int __init pmu_setup(char *str)
> +{
> +	unsigned long val;
> +
> +	if (!early_cpu_has_feature(CPU_FTR_HVMODE))
> +		return 0;
> +
> +	pmu_override = true;
> +
> +	if (kstrtoul(str, 0, &val))
> +		val = 0;
> +
> +	pmu_override_val = val;
> +
> +	return 1;
> +}
> +__setup("pmu=", pmu_setup);
> +
>   #endif

^ permalink raw reply

* Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
From: Christophe Leroy @ 2021-08-06  7:32 UTC (permalink / raw)
  To: Xiongwei Song
  Cc: ravi.bangoria, Xiongwei Song, oleg, npiggin,
	Linux Kernel Mailing List, efremov, Paul Mackerras, aneesh.kumar,
	peterx, PowerPC, akpm, sandipan
In-Reply-To: <CAEVVKH-rht+xpwTUL=ny6qENe2Fb0n=3e7DEmc5qzpSq2_1gTA@mail.gmail.com>



Le 06/08/2021 à 05:16, Xiongwei Song a écrit :
> On Thu, Aug 5, 2021 at 6:06 PM Christophe Leroy
> <christophe.leroy@csgroup.eu> wrote:
>>
>>
>>
>> Le 26/07/2021 à 16:30, sxwjean@me.com a écrit :
>>> From: Xiongwei Song <sxwjean@gmail.com>
>>>
>>> Create an anonymous union for dsisr and esr regsiters, we can reference
>>> esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y.
>>> Otherwise, reference dsisr. This makes code more clear.
>>
>> I'm not sure it is worth doing that.
> Why don't we use "esr" as reference manauls mentioned?
> 
>>
>> What is the point in doing the following when you know that regs->esr and regs->dsisr are exactly
>> the same:
>>
>>   > -    err = ___do_page_fault(regs, regs->dar, regs->dsisr);
>>   > +    if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
>>   > +            err = ___do_page_fault(regs, regs->dar, regs->esr);
>>   > +    else
>>   > +            err = ___do_page_fault(regs, regs->dar, regs->dsisr);
>>   > +
> Yes, we can drop this. But it's a bit vague.
> 
>> Or even
>>
>>   > -    int is_write = page_fault_is_write(regs->dsisr);
>>   > +    unsigned long err_reg;
>>   > +    int is_write;
>>   > +
>>   > +    if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
>>   > +            err_reg = regs->esr;
>>   > +    else
>>   > +            err_reg = regs->dsisr;
>>   > +
>>   > +    is_write = page_fault_is_write(err_reg);
>>
>>
>> Artificially growing the code for that makes no sense to me.
> 
> We can drop this too.
>>
>>
>> To avoid anbiguity, maybe the best would be to rename regs->dsisr to something like regs->sr , so
>> that we know it represents the status register, which is DSISR or ESR depending on the platform.
> 
> If so, this would make other people more confused. My consideration is
> to follow what the reference
> manuals represent.

Maybe then we could rename the fields as regs->dsisr_esr and regs->dar_dear

That would be more explicit for everyone.

The UAPI header however should remain as is because anonymous unions are not supported by old 
compilers as mentioned by Michael.

But nevertheless, there are also situations where was is stored in regs->dsisr is not what we have 
in DSISR register. For instance on an ISI exception, we store a subset of the content of SRR1 
register into regs->dsisr.

Christophe

^ permalink raw reply

* Re: [RFC PATCH 1/4] powerpc: Optimize register usage for esr register
From: Michael Ellerman @ 2021-08-06  6:53 UTC (permalink / raw)
  To: sxwjean, linuxppc-dev
  Cc: ravi.bangoria, Xiongwei Song, oleg, npiggin, linux-kernel,
	efremov, paulus, aneesh.kumar, peterx, akpm, sandipan
In-Reply-To: <20210726143053.532839-1-sxwjean@me.com>

sxwjean@me.com writes:
> From: Xiongwei Song <sxwjean@gmail.com>
>
> Create an anonymous union for dsisr and esr regsiters, we can reference
> esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y.
> Otherwise, reference dsisr. This makes code more clear.
>
> Signed-off-by: Xiongwei Song <sxwjean@gmail.com>
> ---
>  arch/powerpc/include/asm/ptrace.h          |  5 ++++-
>  arch/powerpc/include/uapi/asm/ptrace.h     |  5 ++++-
>  arch/powerpc/kernel/process.c              |  2 +-
>  arch/powerpc/kernel/ptrace/ptrace.c        |  2 ++
>  arch/powerpc/kernel/traps.c                |  2 +-
>  arch/powerpc/mm/fault.c                    | 16 ++++++++++++++--
>  arch/powerpc/platforms/44x/machine_check.c |  4 ++--
>  arch/powerpc/platforms/4xx/machine_check.c |  2 +-
>  8 files changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
> index 3e5d470a6155..c252d04b1206 100644
> --- a/arch/powerpc/include/asm/ptrace.h
> +++ b/arch/powerpc/include/asm/ptrace.h
> @@ -44,7 +44,10 @@ struct pt_regs
>  #endif
>  			unsigned long trap;
>  			unsigned long dar;
> -			unsigned long dsisr;
> +			union {
> +				unsigned long dsisr;
> +				unsigned long esr;
> +			};

I don't mind doing that.

>  			unsigned long result;
>  		};
>  	};
> diff --git a/arch/powerpc/include/uapi/asm/ptrace.h b/arch/powerpc/include/uapi/asm/ptrace.h
> index 7004cfea3f5f..e357288b5f34 100644
> --- a/arch/powerpc/include/uapi/asm/ptrace.h
> +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> @@ -53,7 +53,10 @@ struct pt_regs
>  	/* N.B. for critical exceptions on 4xx, the dar and dsisr
>  	   fields are overloaded to hold srr0 and srr1. */
>  	unsigned long dar;		/* Fault registers */
> -	unsigned long dsisr;		/* on 4xx/Book-E used for ESR */
> +	union {
> +		unsigned long dsisr;		/* on Book-S used for DSISR */
> +		unsigned long esr;		/* on 4xx/Book-E used for ESR */
> +	};
>  	unsigned long result;		/* Result of a system call */
>  };

But I'm not sure about the use of anonymous unions in UAPI headers. Old
compilers don't support them, so there's a risk of breakage.

I'd rather we didn't touch the uapi version.


> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 185beb290580..f74af8f9133c 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs)
>  	    trap == INTERRUPT_DATA_STORAGE ||
>  	    trap == INTERRUPT_ALIGNMENT) {
>  		if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> -			pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr);
> +			pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr);
>  		else
>  			pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr);
>  	}
> diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
> index 0a0a33eb0d28..00789ad2c4a3 100644
> --- a/arch/powerpc/kernel/ptrace/ptrace.c
> +++ b/arch/powerpc/kernel/ptrace/ptrace.c
> @@ -375,6 +375,8 @@ void __init pt_regs_check(void)
>  		     offsetof(struct user_pt_regs, dar));
>  	BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) !=
>  		     offsetof(struct user_pt_regs, dsisr));
> +	BUILD_BUG_ON(offsetof(struct pt_regs, esr) !=
> +		     offsetof(struct user_pt_regs, esr));
>  	BUILD_BUG_ON(offsetof(struct pt_regs, result) !=
>  		     offsetof(struct user_pt_regs, result));
>  
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index dfbce527c98e..2164f5705a0b 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs)
>  #ifdef CONFIG_PPC_ADV_DEBUG_REGS
>  /* On 4xx, the reason for the machine check or program exception
>     is in the ESR. */
> -#define get_reason(regs)	((regs)->dsisr)
> +#define get_reason(regs)	((regs)->esr)
>  #define REASON_FP		ESR_FP
>  #define REASON_ILLEGAL		(ESR_PIL | ESR_PUO)
>  #define REASON_PRIVILEGED	ESR_PPR
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index a8d0ce85d39a..62953d4e7c93 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -541,7 +541,11 @@ static __always_inline void __do_page_fault(struct pt_regs *regs)
>  {
>  	long err;
>  
> -	err = ___do_page_fault(regs, regs->dar, regs->dsisr);
> +	if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE))
> +		err = ___do_page_fault(regs, regs->dar, regs->esr);
> +	else
> +		err = ___do_page_fault(regs, regs->dar, regs->dsisr);

As Christophe said, I don't thinks this is an improvement.

It makes the code less readable. If anyone is confused about what is
passed to ___do_page_fault() they can either read the comment above it,
or look at the definition of pt_regs to see that esr and dsisr share
storage.

cheers

^ permalink raw reply

* Re: [PATCH v6 3/6] powerpc/pseries: Consolidate different NUMA distance update code paths
From: David Gibson @ 2021-08-06  6:37 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210727100311.310969-4-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 11361 bytes --]

On Tue, Jul 27, 2021 at 03:33:08PM +0530, Aneesh Kumar K.V wrote:
> The associativity details of the newly added resourced are collected from
> the hypervisor via "ibm,configure-connector" rtas call. Update the numa
> distance details of the newly added numa node after the above call.
> 
> Instead of updating NUMA distance every time we lookup a node id
> from the associativity property, add helpers that can be used
> during boot which does this only once. Also remove the distance
> update from node id lookup helpers.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/topology.h           |   2 +
>  arch/powerpc/mm/numa.c                        | 178 +++++++++++++-----
>  arch/powerpc/platforms/pseries/hotplug-cpu.c  |   2 +
>  .../platforms/pseries/hotplug-memory.c        |   2 +
>  4 files changed, 138 insertions(+), 46 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
> index e4db64c0e184..a6425a70c37b 100644
> --- a/arch/powerpc/include/asm/topology.h
> +++ b/arch/powerpc/include/asm/topology.h
> @@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu)
>  }
>  
>  int of_drconf_to_nid_single(struct drmem_lmb *lmb);
> +void update_numa_distance(struct device_node *node);
>  
>  #else
>  
> @@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  	return first_online_node;
>  }
>  
> +static inline void update_numa_distance(struct device_node *node) {}
>  #endif /* CONFIG_NUMA */
>  
>  #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 368719b14dcc..c695faf67d68 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -208,22 +208,6 @@ int __node_distance(int a, int b)
>  }
>  EXPORT_SYMBOL(__node_distance);
>  
> -static void initialize_distance_lookup_table(int nid,
> -		const __be32 *associativity)
> -{
> -	int i;
> -
> -	if (affinity_form != FORM1_AFFINITY)
> -		return;
> -
> -	for (i = 0; i < distance_ref_points_depth; i++) {
> -		const __be32 *entry;
> -
> -		entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> -		distance_lookup_table[nid][i] = of_read_number(entry, 1);
> -	}
> -}
> -
>  /*
>   * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
>   * info is found.
> @@ -241,15 +225,6 @@ static int associativity_to_nid(const __be32 *associativity)
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
>  		nid = NUMA_NO_NODE;
> -
> -	if (nid > 0 &&
> -		of_read_number(associativity, 1) >= distance_ref_points_depth) {
> -		/*
> -		 * Skip the length field and send start of associativity array
> -		 */
> -		initialize_distance_lookup_table(nid, associativity + 1);
> -	}
> -
>  out:
>  	return nid;
>  }
> @@ -287,6 +262,48 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)
> +{
> +	int i, nid;
> +
> +	if (affinity_form != FORM1_AFFINITY)
> +		return;
> +
> +	nid = associativity_to_nid(associativity);
> +	if (nid != NUMA_NO_NODE) {
> +		for (i = 0; i < distance_ref_points_depth; i++) {
> +			const __be32 *entry;
> +
> +			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			distance_lookup_table[nid][i] = of_read_number(entry, 1);

So, in the conversion from the old initialize_distance_lookup_table()
you change this from accepting a bare associativity list, to requiring
the length on the front.  [*]

> +		}
> +	}
> +}
> +
> +static void initialize_form1_numa_distance(struct device_node *node)
> +{
> +	const __be32 *associativity;
> +
> +	associativity = of_get_associativity(node);
> +	if (!associativity)
> +		return;
> +
> +	__initialize_form1_numa_distance(associativity);
> +}
> +
> +/*
> + * Used to update distance information w.r.t newly added node.
> + */
> +void update_numa_distance(struct device_node *node)
> +{
> +	if (affinity_form == FORM0_AFFINITY)
> +		return;
> +	else if (affinity_form == FORM1_AFFINITY) {
> +		initialize_form1_numa_distance(node);
> +		return;
> +	}
> +}
> +
>  static int __init find_primary_domain_index(void)
>  {
>  	int index;
> @@ -433,6 +450,48 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>  	return 0;
>  }
>  
> +static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
> +{
> +	struct assoc_arrays aa = { .arrays = NULL };
> +	int default_nid = NUMA_NO_NODE;

You never change default_nid, so it seems like it would be clearer to
get rid of this and just use NUMA_NO_NODE inline everywhere below.

> +	int nid = default_nid;
> +	int rc, index;
> +
> +	if ((primary_domain_index < 0) || !numa_enabled)
> +		return default_nid;
> +
> +	rc = of_get_assoc_arrays(&aa);
> +	if (rc)
> +		return default_nid;
> +
> +	if (primary_domain_index <= aa.array_sz &&
> +	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> +		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> +		nid = of_read_number(&aa.arrays[index], 1);
> +
> +		if (nid == 0xffff || nid >= nr_node_ids)
> +			nid = default_nid;
> +		if (nid > 0 && affinity_form == FORM1_AFFINITY) {

Is the nid > 0 test assuming something about what are and aren't valid
nids?  Should it be nid != NUMA_NO_NODE instead?

> +			int i;
> +			const __be32 *associativity;
> +
> +			index = lmb->aa_index * aa.array_sz;
> +			associativity = &aa.arrays[index];

Again, you can reuse form1_numa_distance() here if it weren't for the
fact that you broke it at [*] above to no longer accept bare
associativity lists.

> +			/*
> +			 * lookup array associativity entries have different format
> +			 * There is no length of the array as the first element.
> +			 */
> +			for (i = 0; i < distance_ref_points_depth; i++) {
> +				const __be32 *entry;
> +
> +				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> +				distance_lookup_table[nid][i] = of_read_number(entry, 1);
> +			}
> +		}
> +	}
> +	return nid;
> +}
> +
>  /*
>   * This is like of_node_to_nid_single() for memory represented in the
>   * ibm,dynamic-reconfiguration-memory node.
> @@ -458,21 +517,14 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  
>  		if (nid == 0xffff || nid >= nr_node_ids)
>  			nid = default_nid;
> -
> -		if (nid > 0) {
> -			index = lmb->aa_index * aa.array_sz;
> -			initialize_distance_lookup_table(nid,
> -							&aa.arrays[index]);
> -		}
>  	}
> -
>  	return nid;
>  }
>  
>  #ifdef CONFIG_PPC_SPLPAR
> -static int vphn_get_nid(long lcpu)
> +
> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
>  {
> -	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
>  	long rc, hwid;
>  
>  	/*
> @@ -492,12 +544,30 @@ static int vphn_get_nid(long lcpu)
>  
>  		rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
>  		if (rc == H_SUCCESS)
> -			return associativity_to_nid(associativity);
> +			return 0;
>  	}
>  
> +	return -1;
> +}
> +
> +static int vphn_get_nid(long lcpu)
> +{
> +	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
> +
> +
> +	if (!__vphn_get_associativity(lcpu, associativity))
> +		return associativity_to_nid(associativity);
> +
>  	return NUMA_NO_NODE;
> +
>  }
>  #else
> +
> +static int __vphn_get_associativity(long lcpu, __be32 *associativity)
> +{
> +	return -1;
> +}
> +
>  static int vphn_get_nid(long unused)
>  {
>  	return NUMA_NO_NODE;
> @@ -692,7 +762,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
>  			size = read_n_cells(n_mem_size_cells, usm);
>  		}
>  
> -		nid = of_drconf_to_nid_single(lmb);
> +		nid = get_nid_and_numa_distance(lmb);
>  		fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
>  					  &nid);
>  		node_set_online(nid);
> @@ -709,6 +779,7 @@ static int __init parse_numa_properties(void)
>  	struct device_node *memory;
>  	int default_nid = 0;
>  	unsigned long i;
> +	const __be32 *associativity;
>  
>  	if (numa_enabled == 0) {
>  		printk(KERN_WARNING "NUMA disabled by user\n");
> @@ -734,18 +805,30 @@ static int __init parse_numa_properties(void)
>  	 * each node to be onlined must have NODE_DATA etc backing it.
>  	 */
>  	for_each_present_cpu(i) {
> +		__be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
>  		struct device_node *cpu;
> -		int nid = vphn_get_nid(i);
> +		int nid = NUMA_NO_NODE;
>  
> -		/*
> -		 * Don't fall back to default_nid yet -- we will plug
> -		 * cpus into nodes once the memory scan has discovered
> -		 * the topology.
> -		 */
> -		if (nid == NUMA_NO_NODE) {
> +		memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
> +
> +		if (__vphn_get_associativity(i, vphn_assoc) == 0) {
> +			nid = associativity_to_nid(vphn_assoc);
> +			__initialize_form1_numa_distance(vphn_assoc);
> +		} else {
> +
> +			/*
> +			 * Don't fall back to default_nid yet -- we will plug
> +			 * cpus into nodes once the memory scan has discovered
> +			 * the topology.
> +			 */
>  			cpu = of_get_cpu_node(i, NULL);
>  			BUG_ON(!cpu);
> -			nid = of_node_to_nid_single(cpu);
> +
> +			associativity = of_get_associativity(cpu);
> +			if (associativity) {
> +				nid = associativity_to_nid(associativity);
> +				__initialize_form1_numa_distance(associativity);
> +			}
>  			of_node_put(cpu);
>  		}
>  
> @@ -781,8 +864,11 @@ static int __init parse_numa_properties(void)
>  		 * have associativity properties.  If none, then
>  		 * everything goes to default_nid.
>  		 */
> -		nid = of_node_to_nid_single(memory);
> -		if (nid < 0)
> +		associativity = of_get_associativity(memory);
> +		if (associativity) {
> +			nid = associativity_to_nid(associativity);
> +			__initialize_form1_numa_distance(associativity);
> +		} else
>  			nid = default_nid;
>  
>  		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..778b6ab35f0d 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -498,6 +498,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
>  		return saved_rc;
>  	}
>  
> +	update_numa_distance(dn);
> +
>  	rc = dlpar_online_cpu(dn);
>  	if (rc) {
>  		saved_rc = rc;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 377d852f5a9a..ee1d81d7e54a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb)
>  		return -ENODEV;
>  	}
>  
> +	update_numa_distance(lmb_node);
> +
>  	dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
>  	if (!dr_node) {
>  		dlpar_free_cc_nodes(lmb_node);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v6 6/6] powerpc/pseries: Consolidate form1 distance initialization into a helper
From: David Gibson @ 2021-08-06  6:47 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: Nathan Lynch, Daniel Henrique Barboza, linuxppc-dev
In-Reply-To: <20210727100311.310969-7-aneesh.kumar@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 6190 bytes --]

On Tue, Jul 27, 2021 at 03:33:11PM +0530, Aneesh Kumar K.V wrote:
> Currently, we duplicate parsing code for ibm,associativity and
> ibm,associativity-lookup-arrays in the kernel. The associativity array provided
> by these device tree properties are very similar and hence can use
> a helper to parse the node id and numa distance details.

Oh... sorry.. comments on the earlier patch were from before I read
and saw you adjusted things here.

> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/mm/numa.c | 83 ++++++++++++++++++++++++++----------------
>  1 file changed, 51 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index fffb3c40f595..7506251e17f2 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -171,19 +171,19 @@ static void unmap_cpu_from_node(unsigned long cpu)
>  }
>  #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
>  
> -/*
> - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> - * info is found.
> - */
> -static int associativity_to_nid(const __be32 *associativity)
> +static int __associativity_to_nid(const __be32 *associativity,
> +				  int max_array_sz)
>  {
>  	int nid = NUMA_NO_NODE;
> +	/*
> +	 * primary_domain_index is 1 based array index.
> +	 */
> +	int index = primary_domain_index  - 1;
>  
> -	if (!numa_enabled)
> +	if (!numa_enabled || index >= max_array_sz)
>  		goto out;

You don't need a goto, you can just return NUMA_NO_NODE.

>  
> -	if (of_read_number(associativity, 1) >= primary_domain_index)
> -		nid = of_read_number(&associativity[primary_domain_index], 1);
> +	nid = of_read_number(&associativity[index], 1);
>  
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= nr_node_ids)
> @@ -191,6 +191,17 @@ static int associativity_to_nid(const __be32 *associativity)
>  out:
>  	return nid;
>  }
> +/*
> + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
> + * info is found.
> + */
> +static int associativity_to_nid(const __be32 *associativity)
> +{
> +	int array_sz = of_read_number(associativity, 1);
> +
> +	/* Skip the first element in the associativity array */
> +	return __associativity_to_nid((associativity + 1), array_sz);
> +}
>  
>  static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
>  {
> @@ -295,24 +306,41 @@ int of_node_to_nid(struct device_node *device)
>  }
>  EXPORT_SYMBOL(of_node_to_nid);
>  
> -static void __initialize_form1_numa_distance(const __be32 *associativity)
> +static void ___initialize_form1_numa_distance(const __be32 *associativity,
> +					     int max_array_sz)
>  {
>  	int i, nid;
>  
>  	if (affinity_form != FORM1_AFFINITY)
>  		return;
>  
> -	nid = associativity_to_nid(associativity);
> +	nid = __associativity_to_nid(associativity, max_array_sz);
>  	if (nid != NUMA_NO_NODE) {
>  		for (i = 0; i < distance_ref_points_depth; i++) {
>  			const __be32 *entry;
> +			int index = be32_to_cpu(distance_ref_points[i]) - 1;
> +
> +			/*
> +			 * broken hierarchy, return with broken distance table

WARN_ON, maybe?

> +			 */
> +			if (index >= max_array_sz)
> +				return;
>  
> -			entry = &associativity[be32_to_cpu(distance_ref_points[i])];
> +			entry = &associativity[index];
>  			distance_lookup_table[nid][i] = of_read_number(entry, 1);
>  		}
>  	}
>  }
>  
> +static void __initialize_form1_numa_distance(const __be32 *associativity)

Do you actually use this in-between wrapper?

> +{
> +	int array_sz;
> +
> +	array_sz = of_read_number(associativity, 1);
> +	/* Skip the first element in the associativity array */
> +	___initialize_form1_numa_distance(associativity + 1, array_sz);
> +}
> +
>  static void initialize_form1_numa_distance(struct device_node *node)
>  {
>  	const __be32 *associativity;
> @@ -586,27 +614,18 @@ static int get_nid_and_numa_distance(struct drmem_lmb *lmb)
>  
>  	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> -		nid = of_read_number(&aa.arrays[index], 1);
> +		const __be32 *associativity;
>  
> -		if (nid == 0xffff || nid >= nr_node_ids)
> -			nid = default_nid;
> +		index = lmb->aa_index * aa.array_sz;
> +		associativity = &aa.arrays[index];
> +		nid = __associativity_to_nid(associativity, aa.array_sz);
>  		if (nid > 0 && affinity_form == FORM1_AFFINITY) {
> -			int i;
> -			const __be32 *associativity;
> -
> -			index = lmb->aa_index * aa.array_sz;
> -			associativity = &aa.arrays[index];
>  			/*
> -			 * lookup array associativity entries have different format
> -			 * There is no length of the array as the first element.
> +			 * lookup array associativity entries have
> +			 * no length of the array as the first element.
>  			 */
> -			for (i = 0; i < distance_ref_points_depth; i++) {
> -				const __be32 *entry;
> -
> -				entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
> -				distance_lookup_table[nid][i] = of_read_number(entry, 1);
> -			}
> +			___initialize_form1_numa_distance(associativity,
> +							  aa.array_sz);

Better, thanks.

>  		}
>  	}
>  	return nid;
> @@ -632,11 +651,11 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  
>  	if (primary_domain_index <= aa.array_sz &&
>  	    !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
> -		index = lmb->aa_index * aa.array_sz + primary_domain_index - 1;
> -		nid = of_read_number(&aa.arrays[index], 1);
> +		const __be32 *associativity;
>  
> -		if (nid == 0xffff || nid >= nr_node_ids)
> -			nid = default_nid;
> +		index = lmb->aa_index * aa.array_sz;
> +		associativity = &aa.arrays[index];
> +		nid = __associativity_to_nid(associativity, aa.array_sz);
>  	}
>  	return nid;
>  }

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: Debian SID kernel doesn't boot on PowerBook 3400c
From: Christophe Leroy @ 2021-08-06  6:09 UTC (permalink / raw)
  To: Finn Thain, Nicholas Piggin; +Cc: debian-powerpc, linuxppc-dev, Stan Johnson
In-Reply-To: <2619d78-e8f-334a-20c0-2a60c936a293@linux-m68k.org>

+nicholas piggin for the C interrupt stuff

Le 06/08/2021 à 03:06, Finn Thain a écrit :
> (Christophe, you've seen some of this before, however there are new
> results added at the end. I've Cc'd the mailing lists this time.)
> 
> On Wed, 4 Aug 2021, Stan Johnson wrote:
> 
>> On 8/4/21 8:41 PM, Finn Thain wrote:
>>
>>>
>>> $ curl https://lore.kernel.org/lkml/9b64dde3-6ebd-b446-41d9-61e8cb0d8c39@csgroup.eu/raw
>>> ../message.mbox
>> ok
>>
>> $ sha1 ../message.mbox
>> SHA1 (../message.mbox) = 436ce0adf893c46c84c54607f73c838897caeeea
>>
>>>
>>> On Wed, 4 Aug 2021, Christophe Leroy wrote:
>>>
>>>> Can you check if they happen at commit c16728835
>>>>
>>
>> $ git checkout c16728835eec
>> Checking out files: 100% (20728/20728), done.
>> Note: checking out 'c16728835eec'.
>>
>> You are in 'detached HEAD' state. You can look around, make experimental
>> changes and commit them, and you can discard any commits you make in this
>> state without impacting any branches by performing another checkout.
>>
>> If you want to create a new branch to retain commits you create, you may
>> do so (now or later) by using -b with the checkout command again. Example:
>>
>>    git checkout -b <new-branch-name>
>>
>> HEAD is now at c16728835eec powerpc/32: Manage KUAP in C
>> $ git am ../message.mbox
>> warning: Patch sent with format=flowed; space at the end of lines might be lost.
>> Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
>> $ cp ../dot-config-powermac-5.13 .config
>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux
>> $ strings vmlinux | fgrep 'Linux version'
>> Linux version 5.12.0-rc3-pmac-00078-geb51c431b81 (johnson@ThinkPad) (powerpc-linux-gnu-gcc (Debian 8.3.0-2) 8.3.0, GNU ld (GNU Binutils for Debian) 2.31.1) #1 SMP Wed Aug 4 21:50:47 MDT 2021
>>
>> 1) PB 3400c
>> Hangs at boot (Mac OS screen), no serial console output
>>
>> 2) Wallstreet
>> X fails, errors ("Kernel attempted to write user page", "BUG: Unable to
>> handle kernel instruction fetch"), see Wallstreet_console-1.txt.
>>
> 
> The log shows that the error "Kernel attempted to write user page
> (b3399774) - exploit attempt?" happens after commit c16728835eec
> ("powerpc/32: Manage KUAP in C").

I think I found a possible cause for this. After the above patch, locking KUAP on interrupt is done 
in interrupt_enter_prepare(). But in case of NMI interrupt, that function is not called. That means 
that when leaving interrupt through interrupt_exit_kernel_prepare(), the supposedly saved previous 
KUAP status is garbage.

An easy way to fix that is to add missing stuff in interrupt_nmi_enter_prepare(), I'll do that at 
least for testing, but at the end it is not so easy, because of booke32 and 40x.

The problem on booke32 and 40x is that the "critical interrupts" exit goes through interrupt_return 
when they happened in user mode and bypass interrupt_return when they happened in kernel mode. So it 
is not easy to manage.


> 
>>>>
>>>> Can you check if they DO NOT happen at preceding commit c16728835~
>>>>
>>
>> $ git checkout c16728835~
>> Previous HEAD position was c16728835eec powerpc/32: Manage KUAP in C
>> HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap save/restore/check helpers
>> $ git am ../message.mbox
>> warning: Patch sent with format=flowed; space at the end of lines might be lost.
>> Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
>> $ cp ../dot-config-powermac-5.13 .config
>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux
>>
>> Linux version 5.12.0-rc3-pmac-00077-gc9f6e8dd045
>>
>> 3) PB 3400c
>> Hangs at boot (Mac OS screen)
>>
>> 4) Wallstreet
>> X fails, errors in console log (different than test 2), see
>> Wallstreet_console-2.txt.
>>
> 
> This log shows that the errors "xfce4-session[1775]: bus error (7)" and
> "kernel BUG at arch/powerpc/kernel/interrupt.c:49!" happen prior to commit
> c16728835eec ("powerpc/32: Manage KUAP in C").

As mentionned by Nic, this is due to r11 being cloberred. For the time being the only r11 clobber 
identified is the one I have provided a fix for. I'm wondering whether it was applied for all 
further tests or not.

> 
>>
>> $ git checkout 0b45359aa2df
>> ...
>> HEAD is now at 0b45359aa2df powerpc/8xx: Create C version of kuap save/restore/check helpers
>> $ git am ../message.mbox
>> warning: Patch sent with format=flowed; space at the end of lines might be lost.
>> Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
>> $ cp ../dot-config-powermac-5.13 .config
>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux
>>
>> Linux version 5.12.0-rc3-pmac-00077-ge06b29ce146
>>
>> 5) PB 3400c
>> Hangs at boot (Mac OS screen)
>>
>> 6) Wallstreet
>> X failed (X login succeeded, but setting up desktop failed), errors in
>> console log, see Wallstreet_console-3.txt.
>>
> 
> (No need for those two tests: it's exactly the same code and almost the
> same failure modes: "kernel BUG at arch/powerpc/kernel/interrupt.c:50".)
> 
> On Thu, 5 Aug 2021, Stan Johnson wrote:
> 
>> On 8/5/21 12:47 AM, Finn Thain wrote:
>>
>>> On Wed, 4 Aug 2021, Christophe Leroy wrote:
>>>
>>>> Could you test without CONFIG_PPC_KUAP
>> ...
>>
>> $ git checkout c16728835eec
>> ...
>> HEAD is now at c16728835eec powerpc/32: Manage KUAP in C
>> $ git am ../message.mbox
>> warning: Patch sent with format=flowed; space at the end of lines might be lost.
>> Applying: powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE
>> $ cp ../dot-config-powermac-5.13 .config
>> $ scripts/config -d CONFIG_PPC_KUAP
>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux
>> $ grep CONFIG_PPC_KUAP .config
>> # CONFIG_PPC_KUAP is not set
>>
>> Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752
>>
>> 7) PB 3400c
>> Hangs at boot (Mac OS screen)
>>
>> 8) Wallstreet
>> Everything works, no errors (see Wallstreet_console-4.txt).
>>
> 
> That would seem to implicate CONFIG_PPC_KUAP itself. (Note that all builds
> up until this one have CONFIG_PPC_KUAP=y.)

Yes I believe so, see at the begining of this mail.


> 
>>
>>>
>>>> Could you test with CONFIG_PPC_KUAP and CONFIG_PPC_KUAP_DEBUG
>> ...
>>
>> $scripts/config -e CONFIG_PPC_KUAP
>> $ scripts/config -e CONFIG_PPC_KUAP_DEBUG
>> $ make ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- -j4 clean olddefconfig vmlinux
>> $ grep CONFIG_PPC_KUAP .config
>> CONFIG_PPC_KUAP=y
>> CONFIG_PPC_KUAP_DEBUG=y
>>
>> Linux version 5.12.0-rc3-pmac-00078-g5cac2bc3752
>>
>> 9) PB 3400c
>> Hangs at boot (Mac OS screen)
>>
>> 10) Wallstreet
>> X failed at first login, worked at second login, one error in console
>> log ("BUG: Unable to handle kernel instruction fetch"), see
>> Wallstreet_console-5.txt.
>>
> 
> One might expect to see "Kernel attempted to write user page (b3399774) -
> exploit attempt?" again here (see c16728835eec build above) but instead
> this log says "Oops: Kernel access of bad area, sig: 11".

Maybe the test should be done a second time. As r11 is garbage it may or may not be a user address. 
If it is a user address the we get "Kernel attempted to write user page". If it is a random kernel 
address, we likely get "Kernel access of bad area" instead.


> 
> BTW, this procedure could be made simpler and easier if I pushed git
> branches to a public repo for Stan to build, which included Christophe's
> fix plus hard-wired Kconfig changes. That way, the .config file could be
> held constant and the commit hash in the serial console log would be more
> meaningful.
> 

I like the idea, I think I'm going to provide testing fixes through a git repo, that will for sure 
make things easier.

Thanks
Christophe

^ permalink raw reply

* Re: [PATCHv2 2/3] powerpc/cacheinfo: Remove the redundant get_shared_cpu_map()
From: Srikar Dronamraju @ 2021-08-06  5:44 UTC (permalink / raw)
  To: Parth Shah; +Cc: ego, mikey, parths1229, svaidy, linuxppc-dev
In-Reply-To: <20210728175607.591679-3-parth@linux.ibm.com>

* Parth Shah <parth@linux.ibm.com> [2021-07-28 23:26:06]:

> From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
> 
> The helper function get_shared_cpu_map() was added in
> 
> 'commit 500fe5f550ec ("powerpc/cacheinfo: Report the correct
> shared_cpu_map on big-cores")'
> 
> and subsequently expanded upon in
> 
> 'commit 0be47634db0b ("powerpc/cacheinfo: Print correct cache-sibling
> map/list for L2 cache")'
> 
> in order to help report the correct groups of threads sharing these caches
> on big-core systems where groups of threads within a core can share
> different sets of caches.
> 
> Now that powerpc/cacheinfo is aware of "ibm,thread-groups" property,
> cache->shared_cpu_map contains the correct set of thread-siblings
> sharing the cache. Hence we no longer need the functions
> get_shared_cpu_map(). This patch removes this function. We also remove
> the helper function index_dir_to_cpu() which was only called by
> get_shared_cpu_map().
> 
> With these functions removed, we can still see the correct
> cache-sibling map/list for L1 and L2 caches on systems with L1 and L2
> caches distributed among groups of threads in a core.
> 
> With this patch, on a SMT8 POWER10 system where the L1 and L2 caches
> are split between the two groups of threads in a core, for CPUs 8,9,
> the L1-Data, L1-Instruction, L2, L3 cache CPU sibling list is as
> follows:
> 
> $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
> /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14
> /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14
> /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14
> /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15
> /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15
> /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15
> /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15
> /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15
> 
> $ ppc64_cpu --smt=4
> $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
> /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10
> /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10
> /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10
> /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11
> /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11
> /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11
> /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11
> /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11
> 
> $ ppc64_cpu --smt=2
> $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
> /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9
> /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9
> /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9
> /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9
> /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9
> 
> $ ppc64_cpu --smt=1
> $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list
> /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8
> /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8
> 
> Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>

Looks good to me.

Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>

> ---
>  arch/powerpc/kernel/cacheinfo.c | 41 +--------------------------------
>  1 file changed, 1 insertion(+), 40 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
> index 5a6925d87424..20d91693eac1 100644
> --- a/arch/powerpc/kernel/cacheinfo.c
> +++ b/arch/powerpc/kernel/cacheinfo.c
> @@ -675,45 +675,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char *
>  static struct kobj_attribute cache_level_attr =
>  	__ATTR(level, 0444, level_show, NULL);
> 
> -static unsigned int index_dir_to_cpu(struct cache_index_dir *index)
> -{
> -	struct kobject *index_dir_kobj = &index->kobj;
> -	struct kobject *cache_dir_kobj = index_dir_kobj->parent;
> -	struct kobject *cpu_dev_kobj = cache_dir_kobj->parent;
> -	struct device *dev = kobj_to_dev(cpu_dev_kobj);
> -
> -	return dev->id;
> -}
> -
> -/*
> - * On big-core systems, each core has two groups of CPUs each of which
> - * has its own L1-cache. The thread-siblings which share l1-cache with
> - * @cpu can be obtained via cpu_smallcore_mask().
> - *
> - * On some big-core systems, the L2 cache is shared only between some
> - * groups of siblings. This is already parsed and encoded in
> - * cpu_l2_cache_mask().
> - *
> - * TODO: cache_lookup_or_instantiate() needs to be made aware of the
> - *       "ibm,thread-groups" property so that cache->shared_cpu_map
> - *       reflects the correct siblings on platforms that have this
> - *       device-tree property. This helper function is only a stop-gap
> - *       solution so that we report the correct siblings to the
> - *       userspace via sysfs.
> - */
> -static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache)
> -{
> -	if (has_big_cores) {
> -		int cpu = index_dir_to_cpu(index);
> -		if (cache->level == 1)
> -			return cpu_smallcore_mask(cpu);
> -		if (cache->level == 2 && thread_group_shares_l2)
> -			return cpu_l2_cache_mask(cpu);
> -	}
> -
> -	return &cache->shared_cpu_map;
> -}
> -
>  static ssize_t
>  show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list)
>  {
> @@ -724,7 +685,7 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo
>  	index = kobj_to_cache_index_dir(k);
>  	cache = index->cache;
> 
> -	mask = get_shared_cpu_map(index, cache);
> +	mask = &cache->shared_cpu_map;
> 
>  	return cpumap_print_to_pagebuf(list, buf, mask);
>  }
> -- 
> 2.26.3
> 

-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox