LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v6 06/10] powerpc/signal64: Replace setup_sigcontext() w/ unsafe_setup_sigcontext()
From: Christophe Leroy @ 2021-02-23 17:12 UTC (permalink / raw)
  To: Christopher M. Riedl, linuxppc-dev
In-Reply-To: <20210221012401.22328-7-cmr@codefail.de>



Le 21/02/2021 à 02:23, Christopher M. Riedl a écrit :
> Previously setup_sigcontext() performed a costly KUAP switch on every
> uaccess operation. These repeated uaccess switches cause a significant
> drop in signal handling performance.
> 
> Rewrite setup_sigcontext() to assume that a userspace write access window
> is open by replacing all uaccess functions with their 'unsafe' versions.
> Modify the callers to first open, call unsafe_setup_sigcontext() and
> then close the uaccess window.

Do you plan to also convert setup_tm_sigcontexts() ?
It would allow to then remove copy_fpr_to_user() and copy_ckfpr_to_user() and maybe other functions too.

Christophe

> 
> Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
> ---
>   arch/powerpc/kernel/signal_64.c | 71 ++++++++++++++++++++-------------
>   1 file changed, 44 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> index bd8d210c9115..3faaa736ed62 100644
> --- a/arch/powerpc/kernel/signal_64.c
> +++ b/arch/powerpc/kernel/signal_64.c
> @@ -101,9 +101,13 @@ static void prepare_setup_sigcontext(struct task_struct *tsk)
>    * Set up the sigcontext for the signal frame.
>    */
>   
> -static long setup_sigcontext(struct sigcontext __user *sc,
> -		struct task_struct *tsk, int signr, sigset_t *set,
> -		unsigned long handler, int ctx_has_vsx_region)
> +#define unsafe_setup_sigcontext(sc, tsk, signr, set, handler,		\
> +				ctx_has_vsx_region, e)			\
> +	unsafe_op_wrap(__unsafe_setup_sigcontext(sc, tsk, signr, set,	\
> +			handler, ctx_has_vsx_region), e)
> +static long notrace __unsafe_setup_sigcontext(struct sigcontext __user *sc,
> +					struct task_struct *tsk, int signr, sigset_t *set,
> +					unsigned long handler, int ctx_has_vsx_region)
>   {
>   	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
>   	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
> @@ -118,20 +122,19 @@ static long setup_sigcontext(struct sigcontext __user *sc,
>   #endif
>   	struct pt_regs *regs = tsk->thread.regs;
>   	unsigned long msr = regs->msr;
> -	long err = 0;
>   	/* Force usr to alway see softe as 1 (interrupts enabled) */
>   	unsigned long softe = 0x1;
>   
>   	BUG_ON(tsk != current);
>   
>   #ifdef CONFIG_ALTIVEC
> -	err |= __put_user(v_regs, &sc->v_regs);
> +	unsafe_put_user(v_regs, &sc->v_regs, efault_out);
>   
>   	/* save altivec registers */
>   	if (tsk->thread.used_vr) {
>   		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
> -		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
> -				      33 * sizeof(vector128));
> +		unsafe_copy_to_user(v_regs, &tsk->thread.vr_state,
> +				    33 * sizeof(vector128), efault_out);
>   		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
>   		 * contains valid data.
>   		 */
> @@ -140,12 +143,12 @@ static long setup_sigcontext(struct sigcontext __user *sc,
>   	/* We always copy to/from vrsave, it's 0 if we don't have or don't
>   	 * use altivec.
>   	 */
> -	err |= __put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
> +	unsafe_put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out);
>   #else /* CONFIG_ALTIVEC */
> -	err |= __put_user(0, &sc->v_regs);
> +	unsafe_put_user(0, &sc->v_regs, efault_out);
>   #endif /* CONFIG_ALTIVEC */
>   	/* copy fpr regs and fpscr */
> -	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
> +	unsafe_copy_fpr_to_user(&sc->fp_regs, tsk, efault_out);
>   
>   	/*
>   	 * Clear the MSR VSX bit to indicate there is no valid state attached
> @@ -160,24 +163,27 @@ static long setup_sigcontext(struct sigcontext __user *sc,
>   	 */
>   	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
>   		v_regs += ELF_NVRREG;
> -		err |= copy_vsx_to_user(v_regs, tsk);
> +		unsafe_copy_vsx_to_user(v_regs, tsk, efault_out);
>   		/* set MSR_VSX in the MSR value in the frame to
>   		 * indicate that sc->vs_reg) contains valid data.
>   		 */
>   		msr |= MSR_VSX;
>   	}
>   #endif /* CONFIG_VSX */
> -	err |= __put_user(&sc->gp_regs, &sc->regs);
> +	unsafe_put_user(&sc->gp_regs, &sc->regs, efault_out);
>   	WARN_ON(!FULL_REGS(regs));
> -	err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
> -	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
> -	err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]);
> -	err |= __put_user(signr, &sc->signal);
> -	err |= __put_user(handler, &sc->handler);
> +	unsafe_copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE, efault_out);
> +	unsafe_put_user(msr, &sc->gp_regs[PT_MSR], efault_out);
> +	unsafe_put_user(softe, &sc->gp_regs[PT_SOFTE], efault_out);
> +	unsafe_put_user(signr, &sc->signal, efault_out);
> +	unsafe_put_user(handler, &sc->handler, efault_out);
>   	if (set != NULL)
> -		err |=  __put_user(set->sig[0], &sc->oldmask);
> +		unsafe_put_user(set->sig[0], &sc->oldmask, efault_out);
>   
> -	return err;
> +	return 0;
> +
> +efault_out:
> +	return -EFAULT;
>   }
>   
>   #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> @@ -670,12 +676,15 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>   
>   	if (old_ctx != NULL) {
>   		prepare_setup_sigcontext(current);
> -		if (!access_ok(old_ctx, ctx_size)
> -		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
> -					ctx_has_vsx_region)
> -		    || __copy_to_user(&old_ctx->uc_sigmask,
> -				      &current->blocked, sizeof(sigset_t)))
> +		if (!user_write_access_begin(old_ctx, ctx_size))
>   			return -EFAULT;
> +
> +		unsafe_setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL,
> +					0, ctx_has_vsx_region, efault_out);
> +		unsafe_copy_to_user(&old_ctx->uc_sigmask, &current->blocked,
> +				    sizeof(sigset_t), efault_out);
> +
> +		user_write_access_end();
>   	}
>   	if (new_ctx == NULL)
>   		return 0;
> @@ -704,6 +713,10 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
>   	/* This returns like rt_sigreturn */
>   	set_thread_flag(TIF_RESTOREALL);
>   	return 0;
> +
> +efault_out:
> +	user_write_access_end();
> +	return -EFAULT;
>   }
>   
>   
> @@ -854,9 +867,13 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
>   	} else {
>   		err |= __put_user(0, &frame->uc.uc_link);
>   		prepare_setup_sigcontext(tsk);
> -		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
> -					NULL, (unsigned long)ksig->ka.sa.sa_handler,
> -					1);
> +		if (!user_write_access_begin(&frame->uc.uc_mcontext,
> +					     sizeof(frame->uc.uc_mcontext)))
> +			return -EFAULT;
> +		err |= __unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk,
> +						ksig->sig, NULL,
> +						(unsigned long)ksig->ka.sa.sa_handler, 1);
> +		user_write_access_end();
>   	}
>   	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
>   	if (err)
> 

^ permalink raw reply

* Re: [PATCH for 5.10] powerpc/32: Preserve cr1 in exception prolog stack check to fix build error
From: Christophe Leroy @ 2021-02-23 14:39 UTC (permalink / raw)
  To: Greg KH; +Cc: linuxppc-dev, fedora.dm0, stable, linux-kernel
In-Reply-To: <YCqFn/4YuT+445xW@kroah.com>



Le 15/02/2021 à 15:30, Greg KH a écrit :
> On Fri, Feb 12, 2021 at 08:57:14AM +0000, Christophe Leroy wrote:
>> This is backport of 3642eb21256a ("powerpc/32: Preserve cr1 in
>> exception prolog stack check to fix build error") for kernel 5.10
>>
>> It fixes the build failure on v5.10 reported by kernel test robot
>> and by David Michael.
>>
>> This fix is not in Linux tree yet, it is in next branch in powerpc tree.
> 
> Then there's nothing I can do about it until that happens :(
> 

It now is in Linus' tree, see 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=3642eb21256a317ac14e9ed560242c6d20cf06d9

Thanks
Christophe

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Fix handling of privilege level checks in perf interrupt context
From: Michael Ellerman @ 2021-02-23 12:54 UTC (permalink / raw)
  To: Peter Zijlstra, Athira Rajeev
  Cc: maddy, omosnace, acme, jolsa, linuxppc-dev, kan.liang
In-Reply-To: <YDTg99xsVolE+sv+@hirez.programming.kicks-ass.net>

Peter Zijlstra <peterz@infradead.org> writes:
> On Tue, Feb 23, 2021 at 01:31:49AM -0500, Athira Rajeev wrote:
>> Running "perf mem record" in powerpc platforms with selinux enabled
>> resulted in soft lockup's. Below call-trace was seen in the logs:
...
>> 
>> Since the purpose of this security hook is to control access to
>> perf_event_open, it is not right to call this in interrupt context.
>> But in case of powerpc PMU, we need the privilege checks for specific
>> samples from branch history ring buffer and sampling register values.
>
> I'm confused... why would you need those checks at event time? Either
> the event has perf_event_attr::exclude_kernel and it then isn't allowed
> to expose kernel addresses, or it doesn't and it is.

Well one of us is confused that's for sure ^_^

I missed/forgot that we had that logic in open.

I think the reason we got here is that in the past we didn't have the
event in the low-level routines where we want to check,
power_pmu_bhrb_read() and perf_get_data_addr(), so we hacked in a
perf_paranoid_kernel() check. Which was wrong.

Then Joel's patch plumbed the event through and switched those paranoid
checks to perf_allow_kernel().

Anyway, we'll just switch those to exclude_kernel checks.

> There should never be an event-time question of permission like this. If
> you allow creation of an event, you're allowing the data it generates.

Ack.

cheers

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Fix handling of privilege level checks in perf interrupt context
From: Peter Zijlstra @ 2021-02-23 11:03 UTC (permalink / raw)
  To: Athira Rajeev; +Cc: maddy, omosnace, acme, jolsa, linuxppc-dev, kan.liang
In-Reply-To: <1614061909-1734-1-git-send-email-atrajeev@linux.vnet.ibm.com>

On Tue, Feb 23, 2021 at 01:31:49AM -0500, Athira Rajeev wrote:
> Running "perf mem record" in powerpc platforms with selinux enabled
> resulted in soft lockup's. Below call-trace was seen in the logs:
> 
> CPU: 58 PID: 3751 Comm: sssd_nss Not tainted 5.11.0-rc7+ #2
> NIP:  c000000000dff3d4 LR: c000000000dff3d0 CTR: 0000000000000000
> REGS: c000007fffab7d60 TRAP: 0100   Not tainted  (5.11.0-rc7+)
> <<>>
> NIP [c000000000dff3d4] _raw_spin_lock_irqsave+0x94/0x120
> LR [c000000000dff3d0] _raw_spin_lock_irqsave+0x90/0x120
> Call Trace:
> [c00000000fd471a0] [c00000000fd47260] 0xc00000000fd47260 (unreliable)
> [c00000000fd471e0] [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
> [c00000000fd47220] [c000000000296edc] audit_log_end+0x6c/0x180
> [c00000000fd47260] [c0000000006a3f20] common_lsm_audit+0xb0/0xe0
> [c00000000fd472a0] [c00000000066c664] slow_avc_audit+0xa4/0x110
> [c00000000fd47320] [c00000000066cff4] avc_has_perm+0x1c4/0x260
> [c00000000fd47430] [c00000000066e064] selinux_perf_event_open+0x74/0xd0
> [c00000000fd47450] [c000000000669888] security_perf_event_open+0x68/0xc0
> [c00000000fd47490] [c00000000013d788] record_and_restart+0x6e8/0x7f0
> [c00000000fd476c0] [c00000000013dabc] perf_event_interrupt+0x22c/0x560
> [c00000000fd477d0] [c00000000002d0fc] performance_monitor_exception+0x4c/0x60
> [c00000000fd477f0] [c00000000000b378] performance_monitor_common_virt+0x1c8/0x1d0
> interrupt: f00 at _raw_spin_lock_irqsave+0x38/0x120
> NIP:  c000000000dff378 LR: c000000000b5fbbc CTR: c0000000007d47f0
> REGS: c00000000fd47860 TRAP: 0f00   Not tainted  (5.11.0-rc7+)
> <<>>
> NIP [c000000000dff378] _raw_spin_lock_irqsave+0x38/0x120
> LR [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
> interrupt: f00
> [c00000000fd47b00] [0000000000000038] 0x38 (unreliable)
> [c00000000fd47b40] [c00000000aae6200] 0xc00000000aae6200
> [c00000000fd47b80] [c000000000296edc] audit_log_end+0x6c/0x180
> [c00000000fd47bc0] [c00000000029f494] audit_log_exit+0x344/0xf80
> [c00000000fd47d10] [c0000000002a2b00] __audit_syscall_exit+0x2c0/0x320
> [c00000000fd47d60] [c000000000032878] do_syscall_trace_leave+0x148/0x200
> [c00000000fd47da0] [c00000000003d5b4] syscall_exit_prepare+0x324/0x390
> [c00000000fd47e10] [c00000000000d76c] system_call_common+0xfc/0x27c
> 
> The above trace shows that while the CPU was handling a performance
> monitor exception, there was a call to "security_perf_event_open"
> function. In powerpc core-book3s, this function is called from
> 'perf_allow_kernel' check during recording of data address in the sample
> via perf_get_data_addr().
> 
> Commit da97e18458fb ("perf_event: Add support for LSM and SELinux checks")
> introduced security enhancements to perf. As part of this commit, the new
> security hook for perf_event_open was added in all places where perf
> paranoid check was previously used. In powerpc core-book3s code, originally
> had paranoid checks in 'perf_get_data_addr' and 'power_pmu_bhrb_read'. So
> 'perf_paranoid_kernel' checks were replaced with 'perf_allow_kernel' in
> these pmu helper functions as well.
> 
> The intention of paranoid checks in core-book3s is to verify privilege
> access before capturing some of the sample data. Along with paranoid
> checks, 'perf_allow_kernel' also does a 'security_perf_event_open'. Since
> these functions are accessed while recording sample, we end up in calling
> selinux_perf_event_open in PMI context. Some of the security functions
> use spinlock like sidtab_sid2str_put(). If a perf interrupt hits under
> a spin lock and if we end up in calling selinux hook functions in PMI
> handler, this could cause a dead lock.
> 
> Since the purpose of this security hook is to control access to
> perf_event_open, it is not right to call this in interrupt context.
> But in case of powerpc PMU, we need the privilege checks for specific
> samples from branch history ring buffer and sampling register values.

I'm confused... why would you need those checks at event time? Either
the event has perf_event_attr::exclude_kernel and it then isn't allowed
to expose kernel addresses, or it doesn't and it is.

There should never be an event-time question of permission like this. If
you allow creation of an event, you're allowing the data it generates.

^ permalink raw reply

* Re: [PATCH] powerpc/perf: Fix handling of privilege level checks in perf interrupt context
From: Ondrej Mosnacek @ 2021-02-23 10:36 UTC (permalink / raw)
  To: atrajeev
  Cc: maddy, Peter Zijlstra, Arnaldo Carvalho de Melo, SElinux list,
	Linux Security Module list, Jiri Olsa, linuxppc-dev, kan.liang
In-Reply-To: <1614061909-1734-1-git-send-email-atrajeev@linux.vnet.ibm.com>

(CC'ing LSM and SELinux lists; the initial message can be found here:
https://lore.kernel.org/linuxppc-dev/1614061909-1734-1-git-send-email-atrajeev@linux.vnet.ibm.com/T/)

On Tue, Feb 23, 2021 at 7:32 AM Athira Rajeev
<atrajeev@linux.vnet.ibm.com> wrote:
>
> Running "perf mem record" in powerpc platforms with selinux enabled
> resulted in soft lockup's. Below call-trace was seen in the logs:
>
> CPU: 58 PID: 3751 Comm: sssd_nss Not tainted 5.11.0-rc7+ #2
> NIP:  c000000000dff3d4 LR: c000000000dff3d0 CTR: 0000000000000000
> REGS: c000007fffab7d60 TRAP: 0100   Not tainted  (5.11.0-rc7+)
> <<>>
> NIP [c000000000dff3d4] _raw_spin_lock_irqsave+0x94/0x120
> LR [c000000000dff3d0] _raw_spin_lock_irqsave+0x90/0x120
> Call Trace:
> [c00000000fd471a0] [c00000000fd47260] 0xc00000000fd47260 (unreliable)
> [c00000000fd471e0] [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
> [c00000000fd47220] [c000000000296edc] audit_log_end+0x6c/0x180
> [c00000000fd47260] [c0000000006a3f20] common_lsm_audit+0xb0/0xe0
> [c00000000fd472a0] [c00000000066c664] slow_avc_audit+0xa4/0x110
> [c00000000fd47320] [c00000000066cff4] avc_has_perm+0x1c4/0x260
> [c00000000fd47430] [c00000000066e064] selinux_perf_event_open+0x74/0xd0
> [c00000000fd47450] [c000000000669888] security_perf_event_open+0x68/0xc0
> [c00000000fd47490] [c00000000013d788] record_and_restart+0x6e8/0x7f0
> [c00000000fd476c0] [c00000000013dabc] perf_event_interrupt+0x22c/0x560
> [c00000000fd477d0] [c00000000002d0fc] performance_monitor_exception+0x4c/0x60
> [c00000000fd477f0] [c00000000000b378] performance_monitor_common_virt+0x1c8/0x1d0
> interrupt: f00 at _raw_spin_lock_irqsave+0x38/0x120
> NIP:  c000000000dff378 LR: c000000000b5fbbc CTR: c0000000007d47f0
> REGS: c00000000fd47860 TRAP: 0f00   Not tainted  (5.11.0-rc7+)
> <<>>
> NIP [c000000000dff378] _raw_spin_lock_irqsave+0x38/0x120
> LR [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
> interrupt: f00
> [c00000000fd47b00] [0000000000000038] 0x38 (unreliable)
> [c00000000fd47b40] [c00000000aae6200] 0xc00000000aae6200
> [c00000000fd47b80] [c000000000296edc] audit_log_end+0x6c/0x180
> [c00000000fd47bc0] [c00000000029f494] audit_log_exit+0x344/0xf80
> [c00000000fd47d10] [c0000000002a2b00] __audit_syscall_exit+0x2c0/0x320
> [c00000000fd47d60] [c000000000032878] do_syscall_trace_leave+0x148/0x200
> [c00000000fd47da0] [c00000000003d5b4] syscall_exit_prepare+0x324/0x390
> [c00000000fd47e10] [c00000000000d76c] system_call_common+0xfc/0x27c
>
> The above trace shows that while the CPU was handling a performance
> monitor exception, there was a call to "security_perf_event_open"
> function. In powerpc core-book3s, this function is called from
> 'perf_allow_kernel' check during recording of data address in the sample
> via perf_get_data_addr().
>
> Commit da97e18458fb ("perf_event: Add support for LSM and SELinux checks")
> introduced security enhancements to perf. As part of this commit, the new
> security hook for perf_event_open was added in all places where perf
> paranoid check was previously used. In powerpc core-book3s code, originally
> had paranoid checks in 'perf_get_data_addr' and 'power_pmu_bhrb_read'. So
> 'perf_paranoid_kernel' checks were replaced with 'perf_allow_kernel' in
> these pmu helper functions as well.
>
> The intention of paranoid checks in core-book3s is to verify privilege
> access before capturing some of the sample data. Along with paranoid
> checks, 'perf_allow_kernel' also does a 'security_perf_event_open'. Since
> these functions are accessed while recording sample, we end up in calling
> selinux_perf_event_open in PMI context. Some of the security functions
> use spinlock like sidtab_sid2str_put(). If a perf interrupt hits under
> a spin lock and if we end up in calling selinux hook functions in PMI
> handler, this could cause a dead lock.
>
> Since the purpose of this security hook is to control access to
> perf_event_open, it is not right to call this in interrupt context.
> But in case of powerpc PMU, we need the privilege checks for specific
> samples from branch history ring buffer and sampling register values.
> Reference commits:
> Commit cd1231d7035f ("powerpc/perf: Prevent kernel address leak via
> perf_get_data_addr()")
> Commit bb19af816025 ("powerpc/perf: Prevent kernel address leak to
> userspace via BHRB buffer")
>
> As a fix, patch caches 'perf_allow_kernel' value in event_init in
> 'pmu_private' field of perf_event. The cached value is used in the
> PMI code path.
>
> Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
> Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
> ---
>  arch/powerpc/perf/core-book3s.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
> index 4b4319d8..9e9f67f 100644
> --- a/arch/powerpc/perf/core-book3s.c
> +++ b/arch/powerpc/perf/core-book3s.c
> @@ -189,6 +189,11 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
>         return 0;
>  }
>
> +static bool event_allow_kernel(struct perf_event *event)
> +{
> +       return (bool)event->pmu_private;
> +}
> +
>  /*
>   * The user wants a data address recorded.
>   * If we're not doing instruction sampling, give them the SDAR
> @@ -222,7 +227,7 @@ static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *
>         if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
>                 *addrp = mfspr(SPRN_SDAR);
>
> -       if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0)
> +       if (is_kernel_addr(mfspr(SPRN_SDAR)) && !event_allow_kernel(event))
>                 *addrp = 0;
>  }
>
> @@ -507,7 +512,7 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *
>                          * addresses, hence include a check before filtering code
>                          */
>                         if (!(ppmu->flags & PPMU_ARCH_31) &&
> -                               is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0)
> +                           is_kernel_addr(addr) && !event_allow_kernel(event))
>                                 continue;
>
>                         /* Branches are read most recent first (ie. mfbhrb 0 is
> @@ -2049,6 +2054,13 @@ static int power_pmu_event_init(struct perf_event *event)
>         if (err)
>                 return -EINVAL;
>
> +       /*
> +        * We (ab)use pmu_private to cache the result of perf_allow_kernel(). We
> +        * need access to that result at interrupt time, but can't call
> +        * perf_allow_kernel() directly from interrupt context.
> +        */
> +       event->pmu_private = (void *)(long)(perf_allow_kernel(&event->attr) == 0);

I don't think you need this. Unless I'm missing something, you can
simply use "event->attr.exclude_kernel" in place of
"!event_allow_kernel(event)". If it is set, then there must have been
a successful perf_allow_kernel() check in perf_event_open(2) before
the event was created. power_pmu_event_init() would be called shortly
after via perf_event_alloc() -> perf_init_event(), so I don't think
this additional check would add much value.

> +
>         event->hw.config = events[n];
>         event->hw.event_base = cflags[n];
>         event->hw.last_period = event->hw.sample_period;
> --
> 1.8.3.1
>

-- 
Ondrej Mosnacek
Software Engineer, Linux Security - SELinux kernel
Red Hat, Inc.


^ permalink raw reply

* [PATCH] powerpc/chrp: Make hydra_init() static
From: Geert Uytterhoeven @ 2021-02-23  9:53 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras,
	Oliver O'Halloran
  Cc: Geert Uytterhoeven, linuxppc-dev, linux-kernel

Commit 407d418f2fd4c20a ("powerpc/chrp: Move PHB discovery") moved the
sole call to hydra_init() to the source file where it is defined, so it
can be made static.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
Compile-tested only.  My LongTrail died in 2004.
---
 arch/powerpc/include/asm/hydra.h  | 2 --
 arch/powerpc/platforms/chrp/pci.c | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/hydra.h b/arch/powerpc/include/asm/hydra.h
index ae02eb53d6ef8221..d024447283a0cf3c 100644
--- a/arch/powerpc/include/asm/hydra.h
+++ b/arch/powerpc/include/asm/hydra.h
@@ -94,8 +94,6 @@ extern volatile struct Hydra __iomem *Hydra;
 #define HYDRA_INT_EXT7		18	/* Power Off Request */
 #define HYDRA_INT_SPARE		19
 
-extern int hydra_init(void);
-
 #endif /* __KERNEL__ */
 
 #endif /* _ASMPPC_HYDRA_H */
diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c
index 8c421dc78b28334b..76e6256cb0a788f4 100644
--- a/arch/powerpc/platforms/chrp/pci.c
+++ b/arch/powerpc/platforms/chrp/pci.c
@@ -131,8 +131,7 @@ static struct pci_ops rtas_pci_ops =
 
 volatile struct Hydra __iomem *Hydra = NULL;
 
-int __init
-hydra_init(void)
+static int __init hydra_init(void)
 {
 	struct device_node *np;
 	struct resource r;
-- 
2.25.1


^ permalink raw reply related

* [PATCH v2] selftests/powerpc: Fix L1D flushing tests for Power10
From: Russell Currey @ 2021-02-23  7:02 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Russell Currey, dja

The rfi_flush and entry_flush selftests work by using the PM_LD_MISS_L1
perf event to count L1D misses.  The value of this event has changed
over time:

- Power7 uses 0x400f0
- Power8 and Power9 use both 0x400f0 and 0x3e054
- Power10 uses only 0x3e054

Rather than relying on raw values, configure perf to count L1D read
misses in the most explicit way available.

This fixes the selftests to work on systems without 0x400f0 as
PM_LD_MISS_L1, and should change no behaviour for systems that the tests
already worked on.

The only potential downside is that referring to a specific perf event
requires PMU support implemented in the kernel for that platform.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
v2: Move away from raw events as suggested by mpe

 tools/testing/selftests/powerpc/security/entry_flush.c | 2 +-
 tools/testing/selftests/powerpc/security/flush_utils.h | 4 ++++
 tools/testing/selftests/powerpc/security/rfi_flush.c   | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/powerpc/security/entry_flush.c b/tools/testing/selftests/powerpc/security/entry_flush.c
index 78cf914fa321..68ce377b205e 100644
--- a/tools/testing/selftests/powerpc/security/entry_flush.c
+++ b/tools/testing/selftests/powerpc/security/entry_flush.c
@@ -53,7 +53,7 @@ int entry_flush_test(void)
 
 	entry_flush = entry_flush_orig;
 
-	fd = perf_event_open_counter(PERF_TYPE_RAW, /* L1d miss */ 0x400f0, -1);
+	fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1);
 	FAIL_IF(fd < 0);
 
 	p = (char *)memalign(zero_size, CACHELINE_SIZE);
diff --git a/tools/testing/selftests/powerpc/security/flush_utils.h b/tools/testing/selftests/powerpc/security/flush_utils.h
index 07a5eb301466..7a3d60292916 100644
--- a/tools/testing/selftests/powerpc/security/flush_utils.h
+++ b/tools/testing/selftests/powerpc/security/flush_utils.h
@@ -9,6 +9,10 @@
 
 #define CACHELINE_SIZE 128
 
+#define PERF_L1D_READ_MISS_CONFIG	((PERF_COUNT_HW_CACHE_L1D) | 		\
+					(PERF_COUNT_HW_CACHE_OP_READ << 8) |	\
+					(PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
+
 void syscall_loop(char *p, unsigned long iterations,
 		  unsigned long zero_size);
 
diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c
index 7565fd786640..f73484a6470f 100644
--- a/tools/testing/selftests/powerpc/security/rfi_flush.c
+++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
@@ -54,7 +54,7 @@ int rfi_flush_test(void)
 
 	rfi_flush = rfi_flush_orig;
 
-	fd = perf_event_open_counter(PERF_TYPE_RAW, /* L1d miss */ 0x400f0, -1);
+	fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1);
 	FAIL_IF(fd < 0);
 
 	p = (char *)memalign(zero_size, CACHELINE_SIZE);
-- 
2.30.1


^ permalink raw reply related

* [PATCH] powerpc/perf: Fix handling of privilege level checks in perf interrupt context
From: Athira Rajeev @ 2021-02-23  6:31 UTC (permalink / raw)
  To: mpe; +Cc: maddy, peterz, omosnace, acme, jolsa, linuxppc-dev, kan.liang

Running "perf mem record" in powerpc platforms with selinux enabled
resulted in soft lockup's. Below call-trace was seen in the logs:

CPU: 58 PID: 3751 Comm: sssd_nss Not tainted 5.11.0-rc7+ #2
NIP:  c000000000dff3d4 LR: c000000000dff3d0 CTR: 0000000000000000
REGS: c000007fffab7d60 TRAP: 0100   Not tainted  (5.11.0-rc7+)
<<>>
NIP [c000000000dff3d4] _raw_spin_lock_irqsave+0x94/0x120
LR [c000000000dff3d0] _raw_spin_lock_irqsave+0x90/0x120
Call Trace:
[c00000000fd471a0] [c00000000fd47260] 0xc00000000fd47260 (unreliable)
[c00000000fd471e0] [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
[c00000000fd47220] [c000000000296edc] audit_log_end+0x6c/0x180
[c00000000fd47260] [c0000000006a3f20] common_lsm_audit+0xb0/0xe0
[c00000000fd472a0] [c00000000066c664] slow_avc_audit+0xa4/0x110
[c00000000fd47320] [c00000000066cff4] avc_has_perm+0x1c4/0x260
[c00000000fd47430] [c00000000066e064] selinux_perf_event_open+0x74/0xd0
[c00000000fd47450] [c000000000669888] security_perf_event_open+0x68/0xc0
[c00000000fd47490] [c00000000013d788] record_and_restart+0x6e8/0x7f0
[c00000000fd476c0] [c00000000013dabc] perf_event_interrupt+0x22c/0x560
[c00000000fd477d0] [c00000000002d0fc] performance_monitor_exception+0x4c/0x60
[c00000000fd477f0] [c00000000000b378] performance_monitor_common_virt+0x1c8/0x1d0
interrupt: f00 at _raw_spin_lock_irqsave+0x38/0x120
NIP:  c000000000dff378 LR: c000000000b5fbbc CTR: c0000000007d47f0
REGS: c00000000fd47860 TRAP: 0f00   Not tainted  (5.11.0-rc7+)
<<>>
NIP [c000000000dff378] _raw_spin_lock_irqsave+0x38/0x120
LR [c000000000b5fbbc] skb_queue_tail+0x3c/0x90
interrupt: f00
[c00000000fd47b00] [0000000000000038] 0x38 (unreliable)
[c00000000fd47b40] [c00000000aae6200] 0xc00000000aae6200
[c00000000fd47b80] [c000000000296edc] audit_log_end+0x6c/0x180
[c00000000fd47bc0] [c00000000029f494] audit_log_exit+0x344/0xf80
[c00000000fd47d10] [c0000000002a2b00] __audit_syscall_exit+0x2c0/0x320
[c00000000fd47d60] [c000000000032878] do_syscall_trace_leave+0x148/0x200
[c00000000fd47da0] [c00000000003d5b4] syscall_exit_prepare+0x324/0x390
[c00000000fd47e10] [c00000000000d76c] system_call_common+0xfc/0x27c

The above trace shows that while the CPU was handling a performance
monitor exception, there was a call to "security_perf_event_open"
function. In powerpc core-book3s, this function is called from
'perf_allow_kernel' check during recording of data address in the sample
via perf_get_data_addr().

Commit da97e18458fb ("perf_event: Add support for LSM and SELinux checks")
introduced security enhancements to perf. As part of this commit, the new
security hook for perf_event_open was added in all places where perf
paranoid check was previously used. In powerpc core-book3s code, originally
had paranoid checks in 'perf_get_data_addr' and 'power_pmu_bhrb_read'. So
'perf_paranoid_kernel' checks were replaced with 'perf_allow_kernel' in
these pmu helper functions as well.

The intention of paranoid checks in core-book3s is to verify privilege
access before capturing some of the sample data. Along with paranoid
checks, 'perf_allow_kernel' also does a 'security_perf_event_open'. Since
these functions are accessed while recording sample, we end up in calling
selinux_perf_event_open in PMI context. Some of the security functions
use spinlock like sidtab_sid2str_put(). If a perf interrupt hits under
a spin lock and if we end up in calling selinux hook functions in PMI
handler, this could cause a dead lock.

Since the purpose of this security hook is to control access to
perf_event_open, it is not right to call this in interrupt context.
But in case of powerpc PMU, we need the privilege checks for specific
samples from branch history ring buffer and sampling register values.
Reference commits:
Commit cd1231d7035f ("powerpc/perf: Prevent kernel address leak via
perf_get_data_addr()")
Commit bb19af816025 ("powerpc/perf: Prevent kernel address leak to
userspace via BHRB buffer")

As a fix, patch caches 'perf_allow_kernel' value in event_init in
'pmu_private' field of perf_event. The cached value is used in the
PMI code path.

Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
---
 arch/powerpc/perf/core-book3s.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 4b4319d8..9e9f67f 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -189,6 +189,11 @@ static inline unsigned long perf_ip_adjust(struct pt_regs *regs)
 	return 0;
 }
 
+static bool event_allow_kernel(struct perf_event *event)
+{
+	return (bool)event->pmu_private;
+}
+
 /*
  * The user wants a data address recorded.
  * If we're not doing instruction sampling, give them the SDAR
@@ -222,7 +227,7 @@ static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs *
 	if (!(mmcra & MMCRA_SAMPLE_ENABLE) || sdar_valid)
 		*addrp = mfspr(SPRN_SDAR);
 
-	if (is_kernel_addr(mfspr(SPRN_SDAR)) && perf_allow_kernel(&event->attr) != 0)
+	if (is_kernel_addr(mfspr(SPRN_SDAR)) && !event_allow_kernel(event))
 		*addrp = 0;
 }
 
@@ -507,7 +512,7 @@ static void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *
 			 * addresses, hence include a check before filtering code
 			 */
 			if (!(ppmu->flags & PPMU_ARCH_31) &&
-				is_kernel_addr(addr) && perf_allow_kernel(&event->attr) != 0)
+			    is_kernel_addr(addr) && !event_allow_kernel(event))
 				continue;
 
 			/* Branches are read most recent first (ie. mfbhrb 0 is
@@ -2049,6 +2054,13 @@ static int power_pmu_event_init(struct perf_event *event)
 	if (err)
 		return -EINVAL;
 
+	/*
+	 * We (ab)use pmu_private to cache the result of perf_allow_kernel(). We
+	 * need access to that result at interrupt time, but can't call
+	 * perf_allow_kernel() directly from interrupt context.
+	 */
+	event->pmu_private = (void *)(long)(perf_allow_kernel(&event->attr) == 0);
+
 	event->hw.config = events[n];
 	event->hw.event_base = cflags[n];
 	event->hw.last_period = event->hw.sample_period;
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH kernel] powerpc/iommu: Annotate nested lock for lockdep
From: Alexey Kardashevskiy @ 2021-02-23  5:13 UTC (permalink / raw)
  To: Frederic Barrat, linuxppc-dev; +Cc: kvm-ppc
In-Reply-To: <49b1f5cb-107c-296f-c339-13e627a73d6d@linux.ibm.com>



On 18/02/2021 23:59, Frederic Barrat wrote:
> 
> 
> On 16/02/2021 04:20, Alexey Kardashevskiy wrote:
>> The IOMMU table is divided into pools for concurrent mappings and each
>> pool has a separate spinlock. When taking the ownership of an IOMMU group
>> to pass through a device to a VM, we lock these spinlocks which triggers
>> a false negative warning in lockdep (below).
>>
>> This fixes it by annotating the large pool's spinlock as a nest lock.
>>
>> ===
>> WARNING: possible recursive locking detected
>> 5.11.0-le_syzkaller_a+fstn1 #100 Not tainted
>> --------------------------------------------
>> qemu-system-ppc/4129 is trying to acquire lock:
>> c0000000119bddb0 (&(p->lock)/1){....}-{2:2}, at: 
>> iommu_take_ownership+0xac/0x1e0
>>
>> but task is already holding lock:
>> c0000000119bdd30 (&(p->lock)/1){....}-{2:2}, at: 
>> iommu_take_ownership+0xac/0x1e0
>>
>> other info that might help us debug this:
>>   Possible unsafe locking scenario:
>>
>>         CPU0
>>         ----
>>    lock(&(p->lock)/1);
>>    lock(&(p->lock)/1);
>> ===
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/kernel/iommu.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 557a09dd5b2f..2ee642a6731a 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -1089,7 +1089,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
>>       spin_lock_irqsave(&tbl->large_pool.lock, flags);
>>       for (i = 0; i < tbl->nr_pools; i++)
>> -        spin_lock(&tbl->pools[i].lock);
>> +        spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
> 
> 
> We have the same pattern and therefore should have the same problem in 
> iommu_release_ownership().
> 
> But as I understand, we're hacking our way around lockdep here, since 
> conceptually, those locks are independent. I was wondering why it seems 
> to fix it by worrying only about the large pool lock.

This is the other way around - we telling the lockdep not to worry about 
small pool locks if the nest lock (==large pool lock) is locked. The 
warning is printed when a nested lock is detected and the lockdep checks 
if there is a nest for this nested lock at check_deadlock().


> That loop can take 
> many locks (up to 4 with current config). However, if the dma window is 
> less than 1GB, we would only have one, so it would make sense for 
> lockdep to stop complaining.

Why would it stop if the large pool is always there?

> Is it what happened? In which case, this 
> patch doesn't really fix it. Or I'm missing something :-)

I tried with 1 or 2 small pools, no difference at all. I might also be 
missing something here too :)


> 
>    Fred
> 
> 
> 
>>       iommu_table_release_pages(tbl);
>>

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v4 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE
From: David Gibson @ 2021-02-22 23:26 UTC (permalink / raw)
  To: Bharata B Rao; +Cc: farosas, aneesh.kumar, npiggin, kvm-ppc, linuxppc-dev
In-Reply-To: <20210222064608.GB3672042@in.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 9276 bytes --]

On Mon, Feb 22, 2021 at 12:16:08PM +0530, Bharata B Rao wrote:
> On Wed, Feb 17, 2021 at 11:38:07AM +1100, David Gibson wrote:
> > On Mon, Feb 15, 2021 at 12:05:41PM +0530, Bharata B Rao wrote:
> > > Implement H_RPT_INVALIDATE hcall and add KVM capability
> > > KVM_CAP_PPC_RPT_INVALIDATE to indicate the support for the same.
> > > 
> > > This hcall does two types of TLB invalidations:
> > > 
> > > 1. Process-scoped invalidations for guests with LPCR[GTSE]=0.
> > >    This is currently not used in KVM as GTSE is not usually
> > >    disabled in KVM.
> > > 2. Partition-scoped invalidations that an L1 hypervisor does on
> > >    behalf of an L2 guest. This replaces the uses of the existing
> > >    hcall H_TLB_INVALIDATE.
> > > 
> > > In order to handle process scoped invalidations of L2, we
> > > intercept the nested exit handling code in L0 only to handle
> > > H_TLB_INVALIDATE hcall.
> > > 
> > > Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> > > ---
> > >  Documentation/virt/kvm/api.rst         | 17 +++++
> > >  arch/powerpc/include/asm/kvm_book3s.h  |  3 +
> > >  arch/powerpc/include/asm/mmu_context.h | 11 +++
> > >  arch/powerpc/kvm/book3s_hv.c           | 91 ++++++++++++++++++++++++
> > >  arch/powerpc/kvm/book3s_hv_nested.c    | 96 ++++++++++++++++++++++++++
> > >  arch/powerpc/kvm/powerpc.c             |  3 +
> > >  arch/powerpc/mm/book3s64/radix_tlb.c   | 25 +++++++
> > >  include/uapi/linux/kvm.h               |  1 +
> > >  8 files changed, 247 insertions(+)
> > > 
> > > diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > > index 99ceb978c8b0..416c36aa35d4 100644
> > > --- a/Documentation/virt/kvm/api.rst
> > > +++ b/Documentation/virt/kvm/api.rst
> > > @@ -6038,6 +6038,23 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
> > >  can then handle to implement model specific MSR handling and/or user notifications
> > >  to inform a user that an MSR was not handled.
> > >  
> > > +7.22 KVM_CAP_PPC_RPT_INVALIDATE
> > > +------------------------------
> > > +
> > > +:Capability: KVM_CAP_PPC_RPT_INVALIDATE
> > > +:Architectures: ppc
> > > +:Type: vm
> > > +
> > > +This capability indicates that the kernel is capable of handling
> > > +H_RPT_INVALIDATE hcall.
> > > +
> > > +In order to enable the use of H_RPT_INVALIDATE in the guest,
> > > +user space might have to advertise it for the guest. For example,
> > > +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
> > > +present in the "ibm,hypertas-functions" device-tree property.
> > > +
> > > +This capability is always enabled.
> > 
> > I guess that means it's always enabled when it's available - I'm
> > pretty sure it won't be enabled on POWER8 or on PR KVM.
> 
> Correct, will reword this and restrict this to POWER9, radix etc
> 
> > 
> > > +
> > >  8. Other capabilities.
> > >  ======================
> > >  
> > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> > > index d32ec9ae73bd..0f1c5fa6e8ce 100644
> > > --- a/arch/powerpc/include/asm/kvm_book3s.h
> > > +++ b/arch/powerpc/include/asm/kvm_book3s.h
> > > @@ -298,6 +298,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
> > >  void kvmhv_release_all_nested(struct kvm *kvm);
> > >  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> > >  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> > > +long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
> > > +			 unsigned long type, unsigned long pg_sizes,
> > > +			 unsigned long start, unsigned long end);
> > >  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
> > >  			  u64 time_limit, unsigned long lpcr);
> > >  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
> > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> > > index d5821834dba9..fbf3b5b45fe9 100644
> > > --- a/arch/powerpc/include/asm/mmu_context.h
> > > +++ b/arch/powerpc/include/asm/mmu_context.h
> > > @@ -124,8 +124,19 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
> > >  
> > >  #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
> > >  extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
> > > +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> > > +			 unsigned long type, unsigned long page_size,
> > > +			 unsigned long psize, unsigned long start,
> > > +			 unsigned long end);
> > >  #else
> > >  static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
> > > +static inline void do_h_rpt_invalidate(unsigned long pid,
> > > +				       unsigned long lpid,
> > > +				       unsigned long type,
> > > +				       unsigned long page_size,
> > > +				       unsigned long psize,
> > > +				       unsigned long start,
> > > +				       unsigned long end) { }
> > >  #endif
> > >  
> > >  extern void switch_cop(struct mm_struct *next);
> > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > > index 6f612d240392..802cb77c39cc 100644
> > > --- a/arch/powerpc/kvm/book3s_hv.c
> > > +++ b/arch/powerpc/kvm/book3s_hv.c
> > > @@ -904,6 +904,64 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
> > >  	return yield_count;
> > >  }
> > >  
> > > +static void do_h_rpt_invalidate_prs(unsigned long pid, unsigned long lpid,
> > > +				    unsigned long type, unsigned long pg_sizes,
> > > +				    unsigned long start, unsigned long end)
> > > +{
> > > +	unsigned long psize;
> > > +
> > > +	if (pg_sizes & H_RPTI_PAGE_64K) {
> > > +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_64K);
> > > +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 16), psize,
> > > +				    start, end);
> > > +	}
> > > +
> > > +	if (pg_sizes & H_RPTI_PAGE_2M) {
> > > +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_2M);
> > > +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 21), psize,
> > > +				    start, end);
> > > +	}
> > > +
> > > +	if (pg_sizes & H_RPTI_PAGE_1G) {
> > > +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
> > > +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 30), psize,
> > > +				    start, end);
> > > +	}
> > 
> > Hrm.  Here you're stepping through the hcall defined pagesizes, then
> > mapping each one to the Linux internal page size defs.
> > 
> > It might be more elegant to step through mmu_psize_defs table, and
> > conditionally performan an invalidate on that pagesize if the
> > corresponding bit in pg_sizes is set (as noted earlier you could
> > easily add the H_RPTI_PAGE bit to the table).  That way it's a direct
> > table lookup rather than a bunch of ifs or switches.
> 
> Yes, let me give this a try.
> 
> > 
> > > +}
> > > +
> > > +static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
> > > +				    unsigned long pid, unsigned long target,
> > > +				    unsigned long type, unsigned long pg_sizes,
> > > +				    unsigned long start, unsigned long end)
> > > +{
> > > +	if (!kvm_is_radix(vcpu->kvm))
> > > +		return H_UNSUPPORTED;
> > > +
> > > +	if (kvmhv_on_pseries())
> > > +		return H_UNSUPPORTED;
> > 
> > This doesn't seem quite right.  If you have multiply nested guests,
> > won't the L2 be issueing H_RPT_INVALIDATE hcalls to the L1 on behalf
> > of the L3?  The L1 would have to implement them by calling the L0, but
> > the L1 can't just reject them, no?
> > 
> > Likewise for the !H_RPTI_TYPE_NESTED case, but on what happens to be a
> > nested guest in any case, couldn't this case legitimately arise and
> > need to be handled?
> 
> The approach is to handle this hcall on behalf of all the nested
> guests in L0 only. I am intercepting the nested exit path precisely
> for this as shown in the below hunk.

Ah, I see.  Might be worth commenting that, since it's not necessarily obvious.

> 
> > > @@ -1573,6 +1640,30 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
> > >  		if (!xics_on_xive())
> > >  			kvmppc_xics_rm_complete(vcpu, 0);
> > >  		break;
> > > +	case BOOK3S_INTERRUPT_SYSCALL:
> > > +	{
> > > +		unsigned long req = kvmppc_get_gpr(vcpu, 3);
> > > +
> > > +		if (req != H_RPT_INVALIDATE) {
> > > +			r = RESUME_HOST;
> > > +			break;
> > > +		}
> > > +
> > > +		/*
> > > +		 * The H_RPT_INVALIDATE hcalls issued by nested
> > > +		 * guest for process scoped invalidations when
> > > +		 * GTSE=0 are handled here.
> > > +		 */
> > > +		do_h_rpt_invalidate_prs(kvmppc_get_gpr(vcpu, 4),
> > > +					vcpu->arch.nested->shadow_lpid,
> > > +					kvmppc_get_gpr(vcpu, 5),
> > > +					kvmppc_get_gpr(vcpu, 6),
> > > +					kvmppc_get_gpr(vcpu, 7),
> > > +					kvmppc_get_gpr(vcpu, 8));
> > > +		kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
> > > +		r = RESUME_GUEST;
> > > +		break;
> > > +	}
> > >  	default:
> > >  		r = RESUME_HOST;
> > >  		break;
> 
> Thanks for your review.
> 
> Regards,
> Bharata.
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH RFC v1 5/6] xen-swiotlb: convert variables to arrays
From: Stefano Stabellini @ 2021-02-23  1:22 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: ulf.hansson, airlied, joonas.lahtinen, dri-devel, linux-kernel,
	paulus, hpa, Christoph Hellwig, m.szyprowski, sstabellini,
	adrian.hunter, Dongli Zhang, x86, joe.jin, mingo, peterz, mingo,
	bskeggs, linux-pci, xen-devel, matthew.auld, thomas.lendacky,
	intel-gfx, jani.nikula, bp, rodrigo.vivi, bhelgaas,
	Boris Ostrovsky, chris, jgross, tsbogend, nouveau, robin.murphy,
	linux-mmc, linux-mips, iommu, tglx, bauerman, daniel, akpm,
	linuxppc-dev, rppt
In-Reply-To: <YDAgT2ZIdncNwNlf@Konrads-MacBook-Pro.local>

On Fri, 19 Feb 2021, Konrad Rzeszutek Wilk wrote:
> On Sun, Feb 07, 2021 at 04:56:01PM +0100, Christoph Hellwig wrote:
> > On Thu, Feb 04, 2021 at 09:40:23AM +0100, Christoph Hellwig wrote:
> > > So one thing that has been on my mind for a while:  I'd really like
> > > to kill the separate dma ops in Xen swiotlb.  If we compare xen-swiotlb
> > > to swiotlb the main difference seems to be:
> > > 
> > >  - additional reasons to bounce I/O vs the plain DMA capable
> > >  - the possibility to do a hypercall on arm/arm64
> > >  - an extra translation layer before doing the phys_to_dma and vice
> > >    versa
> > >  - an special memory allocator
> > > 
> > > I wonder if inbetween a few jump labels or other no overhead enablement
> > > options and possibly better use of the dma_range_map we could kill
> > > off most of swiotlb-xen instead of maintaining all this code duplication?
> > 
> > So I looked at this a bit more.
> > 
> > For x86 with XENFEAT_auto_translated_physmap (how common is that?)
> 
> Juergen, Boris please correct me if I am wrong, but that XENFEAT_auto_translated_physmap
> only works for PVH guests?

ARM is always XENFEAT_auto_translated_physmap


> > pfn_to_gfn is a nop, so plain phys_to_dma/dma_to_phys do work as-is.
> > 
> > xen_arch_need_swiotlb always returns true for x86, and
> > range_straddles_page_boundary should never be true for the
> > XENFEAT_auto_translated_physmap case.
> 
> Correct. The kernel should have no clue of what the real MFNs are
> for PFNs.

On ARM, Linux knows the MFNs because for local pages MFN == PFN and for
foreign pages it keeps track in arch/arm/xen/p2m.c. More on this below.

xen_arch_need_swiotlb only returns true on ARM in rare situations where
bouncing on swiotlb buffers is required. Today it only happens on old
versions of Xen that don't support the cache flushing hypercall but
there could be more cases in the future.


> > 
> > So as far as I can tell the mapping fast path for the
> > XENFEAT_auto_translated_physmap can be trivially reused from swiotlb.
> > 
> > That leaves us with the next more complicated case, x86 or fully cache
> > coherent arm{,64} without XENFEAT_auto_translated_physmap.  In that case
> > we need to patch in a phys_to_dma/dma_to_phys that performs the MFN
> > lookup, which could be done using alternatives or jump labels.
> > I think if that is done right we should also be able to let that cover
> > the foreign pages in is_xen_swiotlb_buffer/is_swiotlb_buffer, but
> > in that worst case that would need another alternative / jump label.
> > 
> > For non-coherent arm{,64} we'd also need to use alternatives or jump
> > labels to for the cache maintainance ops, but that isn't a hard problem
> > either.

With the caveat that ARM is always XENFEAT_auto_translated_physmap, what
you wrote looks correct. I am writing down a brief explanation on how
swiotlb-xen is used on ARM.


pfn: address as seen by the guest, pseudo-physical address in ARM terminology
mfn (or bfn): real address, physical address in ARM terminology


On ARM dom0 is auto_translated (so Xen sets up the stage2 translation
in the MMU) and the translation is 1:1. So pfn == mfn for Dom0.

However, when another domain shares a page with Dom0, that page is not
1:1. Swiotlb-xen is used to retrieve the mfn for the foreign page at
xen_swiotlb_map_page. It does that with xen_phys_to_bus -> pfn_to_bfn.
It is implemented with a rbtree in arch/arm/xen/p2m.c.

In addition, swiotlb-xen is also used to cache-flush the page via
hypercall at xen_swiotlb_unmap_page. That is done because dev_addr is
really the mfn at unmap_page and we don't know the pfn for it. We can do
pfn-to-mfn but we cannot do mfn-to-pfn (there are good reasons for it
unfortunately). The only way to cache-flush by mfn is by issuing a
hypercall. The hypercall is implemented in arch/arm/xen/mm.c.

The pfn != bfn and pfn_valid() checks are used to detect if the page is
local (of dom0) or foreign; they work thanks to the fact that Dom0 is
1:1 mapped.


Getting back to what you wrote, yes if we had a way to do MFN lookups in
phys_to_dma, and a way to call the hypercall at unmap_page if the page
is foreign (e.g. if it fails a pfn_valid check) then I think we would be
good from an ARM perspective. The only exception is when
xen_arch_need_swiotlb returns true, in which case we need to actually
bounce on swiotlb buffers.

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: Michael Ellerman @ 2021-02-23  1:13 UTC (permalink / raw)
  To: Rob Herring
  Cc: ananth, Alexey Kardashevskiy, kernelfans, cmr, zhengyongjun3,
	Oliver O'Halloran, sandipan, cy.fan, SF Markus Elfring,
	jiapeng.chong, skirmisher, Florian Fainelli, Bhaskar Chowdhury,
	eerykitty, Aneesh Kumar K.V, Haren Myneni, ganeshgr,
	Michal Suchanek, nathanl, kjain, Nicholas Piggin, Mark Brown,
	Qian Cai, Cédric Le Goater, Hari Bathini, atrajeev,
	Linus Torvalds, Randy Dunlap, linux-kernel@vger.kernel.org,
	fbarrat, po-hsu.lin, linuxppc-dev
In-Reply-To: <CAL_Jsq+5u82rS+izM2Ds0jdsQKc9C_MCFLmmRvrxhD_6ofNiJQ@mail.gmail.com>

Rob Herring <robh@kernel.org> writes:
> On Mon, Feb 22, 2021 at 6:05 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>>
>> -----BEGIN PGP SIGNED MESSAGE-----
>> Hash: SHA256
>>
>> Hi Linus,
>>
>> Please pull powerpc updates for 5.12.
>>
>> There will be a conflict with the devicetree tree. It's OK to just take their
>> side of the conflict, we'll fix up the minor behaviour change that causes in a
>> follow-up patch.
>
> The issues turned out to be worse than just this, so I've dropped the
> conflicting change for 5.12.

OK, no worries.

cheers

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: Michael Ellerman @ 2021-02-23  0:53 UTC (permalink / raw)
  To: Oliver O'Halloran, Linus Torvalds
  Cc: Ananth N Mavinakayanahalli, Alexey Kardashevskiy, Pingfan Liu,
	cmr, Zheng Yongjun, Po-Hsu Lin, sandipan, cy.fan, Markus Elfring,
	jiapeng.chong, skirmisher, Florian Fainelli, Bhaskar Chowdhury,
	eerykitty, Aneesh Kumar K.V, Haren Myneni, ganeshgr,
	Michal Suchanek, Nathan Lynch, Rob Herring, kjain, Nick Piggin,
	Mark Brown, Qian Cai, Cédric Le Goater, Hari Bathini,
	Athira Jajeev, Randy Dunlap, Linux Kernel Mailing List,
	Frederic Barrat, linuxppc-dev
In-Reply-To: <CAOSf1CH67Htam33UvYhaypD7HW7q1xU4tUW0soshao2FKa+Czw@mail.gmail.com>

"Oliver O'Halloran" <oohall@gmail.com> writes:

> On Tue, Feb 23, 2021 at 9:44 AM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
>>
>> On Mon, Feb 22, 2021 at 4:06 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>> >
>> > Please pull powerpc updates for 5.12.
>>
>> Pulled. However:
>>
>> >  mode change 100755 => 100644 tools/testing/selftests/powerpc/eeh/eeh-functions.sh
>> >  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
>> >  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh
>>
>> Somebody is being confused.
>>
>> Why create two new shell scripts with the proper executable bit, and
>> then remove the executable bit from an existing one?
>>
>> That just seems very inconsistent.
>
> eeh-function.sh just provides some helper functions for the other
> scripts and doesn't do anything when executed directly. I thought
> making it non-executable made sense.

Yeah I think it does make sense. It just looks a bit odd in the diffstat
like this. Maybe if we called it lib.sh it would be more obvious?

cheers

^ permalink raw reply

* Re: linux-next: manual merge of the spi tree with the powerpc tree
From: Stephen Rothwell @ 2021-02-22 23:57 UTC (permalink / raw)
  To: Michael Ellerman, PowerPC
  Cc: Linux Next Mailing List, Mark Brown, Linux Kernel Mailing List
In-Reply-To: <20210212152755.5c82563a@canb.auug.org.au>

[-- Attachment #1: Type: text/plain, Size: 1624 bytes --]

Hi Stephen,

On Fri, 12 Feb 2021 15:31:42 +1100 Stephen Rothwell <sfr@canb.auug.org.au> wrote:
>
> Hi all,
> 
> Today's linux-next merge of the spi tree got a conflict in:
> 
>   drivers/spi/spi-mpc52xx.c
> 
> between commit:
> 
>   e10656114d32 ("spi: mpc52xx: Avoid using get_tbl()")
> 
> from the powerpc tree and commit:
> 
>   258ea99fe25a ("spi: spi-mpc52xx: Use new structure for SPI transfer delays")
> 
> from the spi tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
> 
> diff --cc drivers/spi/spi-mpc52xx.c
> index e6a30f232370,36f941500676..000000000000
> --- a/drivers/spi/spi-mpc52xx.c
> +++ b/drivers/spi/spi-mpc52xx.c
> @@@ -247,8 -247,10 +247,10 @@@ static int mpc52xx_spi_fsmstate_transfe
>   	/* Is the transfer complete? */
>   	ms->len--;
>   	if (ms->len == 0) {
>  -		ms->timestamp = get_tbl();
>  +		ms->timestamp = mftb();
> - 		ms->timestamp += ms->transfer->delay_usecs * tb_ticks_per_usec;
> + 		if (ms->transfer->delay.unit == SPI_DELAY_UNIT_USECS)
> + 			ms->timestamp += ms->transfer->delay.value *
> + 					 tb_ticks_per_usec;
>   		ms->state = mpc52xx_spi_fsmstate_wait;
>   		return FSM_CONTINUE;
>   	}

This is now a conflict between the powerpc tree and Linus' tree.

-- 
Cheers,
Stephen Rothwell

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [powerpc:fixes-test] BUILD SUCCESS a5c2f7d40511976f30de38b4374b8da2b39a073c
From: kernel test robot @ 2021-02-22 23:21 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git fixes-test
branch HEAD: a5c2f7d40511976f30de38b4374b8da2b39a073c  powerpc/4xx: Fix build errors from mfdcr()

elapsed time: 724m

configs tested: 101
configs skipped: 88

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
mips                           mtx1_defconfig
mips                malta_qemu_32r6_defconfig
powerpc                        fsp2_defconfig
h8300                            alldefconfig
powerpc                        icon_defconfig
sh                           se7343_defconfig
ia64                            zx1_defconfig
arm                         shannon_defconfig
arm                        mvebu_v5_defconfig
powerpc                     kmeter1_defconfig
sh                         ecovec24_defconfig
sh                          polaris_defconfig
powerpc                          g5_defconfig
mips                          malta_defconfig
arc                        nsim_700_defconfig
powerpc                     akebono_defconfig
powerpc                     ppa8548_defconfig
powerpc                     mpc5200_defconfig
arm                     davinci_all_defconfig
mips                         tb0219_defconfig
arm                        keystone_defconfig
sh                  sh7785lcr_32bit_defconfig
powerpc                      makalu_defconfig
arm                        realview_defconfig
powerpc                     taishan_defconfig
arm                          pxa168_defconfig
arm                          simpad_defconfig
mips                           ci20_defconfig
powerpc                 mpc8560_ads_defconfig
powerpc                   lite5200b_defconfig
powerpc                      ppc6xx_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                             allmodconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                               tinyconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a001-20210222
x86_64               randconfig-a002-20210222
x86_64               randconfig-a003-20210222
x86_64               randconfig-a005-20210222
x86_64               randconfig-a006-20210222
x86_64               randconfig-a004-20210222
i386                 randconfig-a013-20210222
i386                 randconfig-a012-20210222
i386                 randconfig-a011-20210222
i386                 randconfig-a014-20210222
i386                 randconfig-a016-20210222
i386                 randconfig-a015-20210222
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a015-20210222
x86_64               randconfig-a011-20210222
x86_64               randconfig-a012-20210222
x86_64               randconfig-a016-20210222
x86_64               randconfig-a014-20210222
x86_64               randconfig-a013-20210222

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:merge] BUILD SUCCESS b267c8c58643460da9159ee69f46b3945cfd9de6
From: kernel test robot @ 2021-02-22 23:21 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git merge
branch HEAD: b267c8c58643460da9159ee69f46b3945cfd9de6  Automatic merge of 'master' into merge (2021-02-22 21:30)

elapsed time: 723m

configs tested: 123
configs skipped: 2

The following configs have been built successfully.
More configs may be tested in the coming days.

gcc tested configs:
arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
mips                           mtx1_defconfig
mips                malta_qemu_32r6_defconfig
powerpc                        fsp2_defconfig
h8300                            alldefconfig
powerpc                        icon_defconfig
sh                           se7343_defconfig
ia64                            zx1_defconfig
arm                         shannon_defconfig
arm                        mvebu_v5_defconfig
powerpc                     kmeter1_defconfig
sh                         ecovec24_defconfig
powerpc                     redwood_defconfig
sh                         apsh4a3a_defconfig
powerpc                      ppc44x_defconfig
sh                          polaris_defconfig
powerpc                          g5_defconfig
mips                          malta_defconfig
arc                        nsim_700_defconfig
sh                           se7705_defconfig
powerpc                      chrp32_defconfig
ia64                        generic_defconfig
powerpc                      bamboo_defconfig
arm                          simpad_defconfig
arc                     nsimosci_hs_defconfig
arm                       netwinder_defconfig
arm                           spitz_defconfig
m68k                           sun3_defconfig
powerpc                 mpc85xx_cds_defconfig
powerpc                     akebono_defconfig
mips                            ar7_defconfig
sparc                       sparc32_defconfig
ia64                      gensparse_defconfig
powerpc                     ppa8548_defconfig
powerpc                     mpc5200_defconfig
arm                     davinci_all_defconfig
mips                         tb0219_defconfig
arm                        keystone_defconfig
sh                  sh7785lcr_32bit_defconfig
powerpc                      makalu_defconfig
powerpc                      walnut_defconfig
s390                             allyesconfig
mips                        jmr3927_defconfig
mips                      maltasmvp_defconfig
m68k                          multi_defconfig
arm                        realview_defconfig
powerpc                     taishan_defconfig
arm                          pxa168_defconfig
mips                           ci20_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                             allmodconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allmodconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                               tinyconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a001-20210222
x86_64               randconfig-a002-20210222
x86_64               randconfig-a003-20210222
x86_64               randconfig-a005-20210222
x86_64               randconfig-a006-20210222
x86_64               randconfig-a004-20210222
i386                 randconfig-a005-20210222
i386                 randconfig-a006-20210222
i386                 randconfig-a004-20210222
i386                 randconfig-a003-20210222
i386                 randconfig-a001-20210222
i386                 randconfig-a002-20210222
i386                 randconfig-a013-20210222
i386                 randconfig-a012-20210222
i386                 randconfig-a011-20210222
i386                 randconfig-a014-20210222
i386                 randconfig-a016-20210222
i386                 randconfig-a015-20210222
riscv                    nommu_k210_defconfig
riscv                            allyesconfig
riscv                    nommu_virt_defconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                          rv32_defconfig
riscv                            allmodconfig
x86_64                           allyesconfig
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                      rhel-8.3-kbuiltin
x86_64                                  kexec

clang tested configs:
x86_64               randconfig-a015-20210222
x86_64               randconfig-a011-20210222
x86_64               randconfig-a012-20210222
x86_64               randconfig-a016-20210222
x86_64               randconfig-a014-20210222
x86_64               randconfig-a013-20210222

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: Oliver O'Halloran @ 2021-02-22 23:00 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ananth N Mavinakayanahalli, Alexey Kardashevskiy, Pingfan Liu,
	cmr, Zheng Yongjun, Po-Hsu Lin, sandipan, cy.fan, Markus Elfring,
	jiapeng.chong, skirmisher, Florian Fainelli, Bhaskar Chowdhury,
	eerykitty, Haren Myneni, ganeshgr, Michal Suchanek, Nathan Lynch,
	Rob Herring, kjain, Nick Piggin, Mark Brown, Qian Cai,
	Cédric Le Goater, Hari Bathini, Athira Jajeev, Randy Dunlap,
	Linux Kernel Mailing List, Aneesh Kumar K.V, Frederic Barrat,
	linuxppc-dev
In-Reply-To: <CAHk-=wj9nZYEZnTYMpHwVT6B6P+zFXW_P-PWH_bRR5bp-cWbOQ@mail.gmail.com>

On Tue, Feb 23, 2021 at 9:44 AM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Mon, Feb 22, 2021 at 4:06 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
> >
> > Please pull powerpc updates for 5.12.
>
> Pulled. However:
>
> >  mode change 100755 => 100644 tools/testing/selftests/powerpc/eeh/eeh-functions.sh
> >  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
> >  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh
>
> Somebody is being confused.
>
> Why create two new shell scripts with the proper executable bit, and
> then remove the executable bit from an existing one?
>
> That just seems very inconsistent.

eeh-function.sh just provides some helper functions for the other
scripts and doesn't do anything when executed directly. I thought
making it non-executable made sense.

>
>              Linus

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: Linus Torvalds @ 2021-02-22 22:37 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: ananth, aik, kernelfans, cmr, Zheng Yongjun,
	Oliver O'Halloran, sandipan, cy.fan, Markus Elfring,
	jiapeng.chong, skirmisher, Florian Fainelli, Bhaskar Chowdhury,
	eerykitty, Aneesh Kumar K.V, haren, ganeshgr, msuchanek, nathanl,
	Rob Herring, kjain, Nick Piggin, Mark Brown, Qian Cai, clg,
	hbathini, Athira Jajeev, Randy Dunlap, Linux Kernel Mailing List,
	fbarrat, Po-Hsu Lin, linuxppc-dev
In-Reply-To: <87o8gctii6.fsf@mpe.ellerman.id.au>

On Mon, Feb 22, 2021 at 4:06 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> Please pull powerpc updates for 5.12.

Pulled. However:

>  mode change 100755 => 100644 tools/testing/selftests/powerpc/eeh/eeh-functions.sh
>  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-aware.sh
>  create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-vf-unaware.sh

Somebody is being confused.

Why create two new shell scripts with the proper executable bit, and
then remove the executable bit from an existing one?

That just seems very inconsistent.

             Linus

^ permalink raw reply

* Re: [PATCH 06/13] KVM: PPC: Book3S 64: Move GUEST_MODE_SKIP test into KVM
From: Fabiano Rosas @ 2021-02-22 22:40 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-7-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> Move the GUEST_MODE_SKIP logic into KVM code. This is quite a KVM
> internal detail that has no real need to be in common handlers.
>
> Also add a comment explaining why this this thing exists.

this this

>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>

> ---
>  arch/powerpc/kernel/exceptions-64s.S | 60 --------------------------
>  arch/powerpc/kvm/book3s_64_entry.S   | 64 ++++++++++++++++++++++++----
>  2 files changed, 56 insertions(+), 68 deletions(-)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index a1640d6ea65d..96f22c582213 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -133,7 +133,6 @@ name:
>  #define IBRANCH_TO_COMMON	.L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
>  #define IREALMODE_COMMON	.L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
>  #define IMASK		.L_IMASK_\name\()	/* IRQ soft-mask bit */
> -#define IKVM_SKIP	.L_IKVM_SKIP_\name\()	/* Generate KVM skip handler */
>  #define IKVM_REAL	.L_IKVM_REAL_\name\()	/* Real entry tests KVM */
>  #define __IKVM_REAL(name)	.L_IKVM_REAL_ ## name
>  #define IKVM_VIRT	.L_IKVM_VIRT_\name\()	/* Virt entry tests KVM */
> @@ -191,9 +190,6 @@ do_define_int n
>  	.ifndef IMASK
>  		IMASK=0
>  	.endif
> -	.ifndef IKVM_SKIP
> -		IKVM_SKIP=0
> -	.endif
>  	.ifndef IKVM_REAL
>  		IKVM_REAL=0
>  	.endif
> @@ -254,15 +250,10 @@ do_define_int n
>  	.balign IFETCH_ALIGN_BYTES
>  \name\()_kvm:
>
> -	.if IKVM_SKIP
> -	cmpwi	r10,KVM_GUEST_MODE_SKIP
> -	beq	89f
> -	.else
>  BEGIN_FTR_SECTION
>  	ld	r10,IAREA+EX_CFAR(r13)
>  	std	r10,HSTATE_CFAR(r13)
>  END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
> -	.endif
>
>  	ld	r10,IAREA+EX_CTR(r13)
>  	mtctr	r10
> @@ -289,27 +280,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  	ori	r12,r12,(IVEC)
>  	.endif
>  	b	kvmppc_interrupt
> -
> -	.if IKVM_SKIP
> -89:	mtocrf	0x80,r9
> -	ld	r10,IAREA+EX_CTR(r13)
> -	mtctr	r10
> -	ld	r9,IAREA+EX_R9(r13)
> -	ld	r10,IAREA+EX_R10(r13)
> -	ld	r11,IAREA+EX_R11(r13)
> -	ld	r12,IAREA+EX_R12(r13)
> -	.if IHSRR_IF_HVMODE
> -	BEGIN_FTR_SECTION
> -	b	kvmppc_skip_Hinterrupt
> -	FTR_SECTION_ELSE
> -	b	kvmppc_skip_interrupt
> -	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
> -	.elseif IHSRR
> -	b	kvmppc_skip_Hinterrupt
> -	.else
> -	b	kvmppc_skip_interrupt
> -	.endif
> -	.endif
>  .endm
>
>  #else
> @@ -1128,7 +1098,6 @@ INT_DEFINE_BEGIN(machine_check)
>  	ISET_RI=0
>  	IDAR=1
>  	IDSISR=1
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  INT_DEFINE_END(machine_check)
>
> @@ -1419,7 +1388,6 @@ INT_DEFINE_BEGIN(data_access)
>  	IVEC=0x300
>  	IDAR=1
>  	IDSISR=1
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  INT_DEFINE_END(data_access)
>
> @@ -1465,7 +1433,6 @@ INT_DEFINE_BEGIN(data_access_slb)
>  	IVEC=0x380
>  	IRECONCILE=0
>  	IDAR=1
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  INT_DEFINE_END(data_access_slb)
>
> @@ -2111,7 +2078,6 @@ INT_DEFINE_BEGIN(h_data_storage)
>  	IHSRR=1
>  	IDAR=1
>  	IDSISR=1
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  	IKVM_VIRT=1
>  INT_DEFINE_END(h_data_storage)
> @@ -3088,32 +3054,6 @@ EXPORT_SYMBOL(do_uaccess_flush)
>  MASKED_INTERRUPT
>  MASKED_INTERRUPT hsrr=1
>
> -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> -kvmppc_skip_interrupt:
> -	/*
> -	 * Here all GPRs are unchanged from when the interrupt happened
> -	 * except for r13, which is saved in SPRG_SCRATCH0.
> -	 */
> -	mfspr	r13, SPRN_SRR0
> -	addi	r13, r13, 4
> -	mtspr	SPRN_SRR0, r13
> -	GET_SCRATCH0(r13)
> -	RFI_TO_KERNEL
> -	b	.
> -
> -kvmppc_skip_Hinterrupt:
> -	/*
> -	 * Here all GPRs are unchanged from when the interrupt happened
> -	 * except for r13, which is saved in SPRG_SCRATCH0.
> -	 */
> -	mfspr	r13, SPRN_HSRR0
> -	addi	r13, r13, 4
> -	mtspr	SPRN_HSRR0, r13
> -	GET_SCRATCH0(r13)
> -	HRFI_TO_KERNEL
> -	b	.
> -#endif
> -
>  	/*
>  	 * Relocation-on interrupts: A subset of the interrupts can be delivered
>  	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
> diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
> index 147ebf1c3c1f..820d103e5f50 100644
> --- a/arch/powerpc/kvm/book3s_64_entry.S
> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -1,9 +1,10 @@
> +#include <asm/asm-offsets.h>
>  #include <asm/cache.h>
> -#include <asm/ppc_asm.h>
> +#include <asm/exception-64s.h>
>  #include <asm/kvm_asm.h>
> -#include <asm/reg.h>
> -#include <asm/asm-offsets.h>
>  #include <asm/kvm_book3s_asm.h>
> +#include <asm/ppc_asm.h>
> +#include <asm/reg.h>
>
>  /*
>   * This is branched to from interrupt handlers in exception-64s.S which set
> @@ -19,17 +20,64 @@ kvmppc_interrupt:
>  	 * guest R12 saved in shadow VCPU SCRATCH0
>  	 * guest R13 saved in SPRN_SCRATCH0
>  	 */
> +	std	r9,HSTATE_SCRATCH2(r13)
> +	lbz	r9,HSTATE_IN_GUEST(r13)
> +	cmpwi	r9,KVM_GUEST_MODE_SKIP
> +	beq-	.Lmaybe_skip
> +.Lno_skip:
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -	std	r9, HSTATE_SCRATCH2(r13)
> -	lbz	r9, HSTATE_IN_GUEST(r13)
> -	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
> +	cmpwi	r9,KVM_GUEST_MODE_HOST_HV
>  	beq	kvmppc_bad_host_intr
>  #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> -	cmpwi	r9, KVM_GUEST_MODE_GUEST
> -	ld	r9, HSTATE_SCRATCH2(r13)
> +	cmpwi	r9,KVM_GUEST_MODE_GUEST
> +	ld	r9,HSTATE_SCRATCH2(r13)
>  	beq	kvmppc_interrupt_pr
>  #endif
>  	b	kvmppc_interrupt_hv
>  #else
>  	b	kvmppc_interrupt_pr
>  #endif
> +
> +/*
> + * KVM uses a trick where it is running in MSR[HV]=1 mode in real-mode with the
> + * guest MMU context loaded, and it sets KVM_GUEST_MODE_SKIP and enables
> + * MSR[DR]=1 while leaving MSR[IR]=0, so it continues to fetch HV instructions
> + * but loads and stores will access the guest context. This is used to load
> + * the faulting instruction without walking page tables.
> + *
> + * However the guest context may not be able to translate, or it may cause a
> + * machine check or other issue, which will result in a fault in the host
> + * (even with KVM-HV).
> + *
> + * These faults are caught here and if the fault was (or was likely) due to
> + * that load, then we just return with the PC advanced +4 and skip the load,
> + * which then goes via the slow path.
> + */
> +.Lmaybe_skip:
> +	cmpwi	r12,BOOK3S_INTERRUPT_MACHINE_CHECK
> +	beq	1f
> +	cmpwi	r12,BOOK3S_INTERRUPT_DATA_STORAGE
> +	beq	1f
> +	cmpwi	r12,BOOK3S_INTERRUPT_DATA_SEGMENT
> +	beq	1f
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE | 0x2
> +	beq	2f
> +#endif
> +	b	.Lno_skip
> +1:	mfspr	r9,SPRN_SRR0
> +	addi	r9,r9,4
> +	mtspr	SPRN_SRR0,r9
> +	ld	r12,HSTATE_SCRATCH0(r13)
> +	ld	r9,HSTATE_SCRATCH2(r13)
> +	GET_SCRATCH0(r13)
> +	RFI_TO_KERNEL
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +2:	mfspr	r9,SPRN_HSRR0
> +	addi	r9,r9,4
> +	mtspr	SPRN_HSRR0,r9
> +	ld	r12,HSTATE_SCRATCH0(r13)
> +	ld	r9,HSTATE_SCRATCH2(r13)
> +	GET_SCRATCH0(r13)
> +	HRFI_TO_KERNEL
> +#endif

^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: pr-tracker-bot @ 2021-02-22 22:39 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: ananth, aik, skirmisher, cmr, zhengyongjun3, oohall, sandipan,
	cy.fan, elfring, jiapeng.chong, f.fainelli, unixbhaskar,
	eerykitty, aneesh.kumar, haren, ganeshgr, msuchanek, nathanl,
	robh, kjain, npiggin, broonie, cai, kernelfans, hbathini,
	atrajeev, linuxppc-dev, rdunlap, linux-kernel, fbarrat,
	po-hsu.lin, Linus Torvalds, clg
In-Reply-To: <87o8gctii6.fsf@mpe.ellerman.id.au>

The pull request you sent on Mon, 22 Feb 2021 23:05:37 +1100:

> https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git tags/powerpc-5.12-1

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/b12b47249688915e987a9a2a393b522f86f6b7ab

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

^ permalink raw reply

* Re: [PATCH 03/13] KVM: PPC: Book3S HV: Ensure MSR[ME] is always set in guest MSR
From: Fabiano Rosas @ 2021-02-22 22:23 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-4-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> Rather than add the ME bit to the MSR when the guest is entered, make
> it clear that the hypervisor does not allow the guest to clear the bit.
>
> The ME addition is kept in the code for now, but a future patch will
> warn if it's not present.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>

> ---
>  arch/powerpc/kvm/book3s_hv_builtin.c | 7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
> index dad118760a4e..ae8f291c5c48 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -661,6 +661,13 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
>
>  void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
>  {
> +	/*
> +	 * Guest must always run with machine check interrupt
> +	 * enabled.
> +	 */
> +	if (!(msr & MSR_ME))
> +		msr |= MSR_ME;
> +
>  	/*
>  	 * Check for illegal transactional state bit combination
>  	 * and if we find it, force the TS field to a safe state.

^ permalink raw reply

* Re: [PATCH 02/13] powerpc/64s: remove KVM SKIP test from instruction breakpoint handler
From: Fabiano Rosas @ 2021-02-22 22:22 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-3-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> The code being executed in KVM_GUEST_MODE_SKIP is hypervisor code with
> MSR[IR]=0, so the faults of concern are the d-side ones caused by access
> to guest context by the hypervisor.
>
> Instruction breakpoint interrupts are not a concern here. It's unlikely
> any good would come of causing breaks in this code, but skipping the
> instruction that caused it won't help matters (e.g., skip the mtmsr that
> sets MSR[DR]=0 or clears KVM_GUEST_MODE_SKIP).
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>

> ---
>  arch/powerpc/kernel/exceptions-64s.S | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 5d0ad3b38e90..5bc689a546ae 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -2597,7 +2597,6 @@ EXC_VIRT_NONE(0x5200, 0x100)
>  INT_DEFINE_BEGIN(instruction_breakpoint)
>  	IVEC=0x1300
>  #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  #endif
>  INT_DEFINE_END(instruction_breakpoint)

^ permalink raw reply

* Re: [PATCH 01/13] powerpc/64s: Remove KVM handler support from CBE_RAS interrupts
From: Fabiano Rosas @ 2021-02-22 22:22 UTC (permalink / raw)
  To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210219063542.1425130-2-npiggin@gmail.com>

Nicholas Piggin <npiggin@gmail.com> writes:

> Cell does not support KVM.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>

> ---
>  arch/powerpc/kernel/exceptions-64s.S | 6 ------
>  1 file changed, 6 deletions(-)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 39cbea495154..5d0ad3b38e90 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -2574,8 +2574,6 @@ EXC_VIRT_NONE(0x5100, 0x100)
>  INT_DEFINE_BEGIN(cbe_system_error)
>  	IVEC=0x1200
>  	IHSRR=1
> -	IKVM_SKIP=1
> -	IKVM_REAL=1
>  INT_DEFINE_END(cbe_system_error)
>
>  EXC_REAL_BEGIN(cbe_system_error, 0x1200, 0x100)
> @@ -2745,8 +2743,6 @@ EXC_COMMON_BEGIN(denorm_exception_common)
>  INT_DEFINE_BEGIN(cbe_maintenance)
>  	IVEC=0x1600
>  	IHSRR=1
> -	IKVM_SKIP=1
> -	IKVM_REAL=1
>  INT_DEFINE_END(cbe_maintenance)
>
>  EXC_REAL_BEGIN(cbe_maintenance, 0x1600, 0x100)
> @@ -2798,8 +2794,6 @@ EXC_COMMON_BEGIN(altivec_assist_common)
>  INT_DEFINE_BEGIN(cbe_thermal)
>  	IVEC=0x1800
>  	IHSRR=1
> -	IKVM_SKIP=1
> -	IKVM_REAL=1
>  INT_DEFINE_END(cbe_thermal)
>
>  EXC_REAL_BEGIN(cbe_thermal, 0x1800, 0x100)

^ permalink raw reply

* Re: [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation
From: Leonardo Bras @ 2021-02-22 18:39 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: kvm-ppc, David Gibson
In-Reply-To: <0ae74efe-8b9a-566a-d984-8e5b662ff330@ozlabs.ru>

On Mon, 2021-02-22 at 16:24 +1100, Alexey Kardashevskiy wrote:
> 
> On 18/02/2021 06:32, Leonardo Bras wrote:
> > On Tue, 2021-02-16 at 14:33 +1100, Alexey Kardashevskiy wrote:
> > > Most platforms allocate IOMMU table structures (specifically it_map)
> > > at the boot time and when this fails - it is a valid reason for panic().
> > > 
> > > However the powernv platform allocates it_map after a device is returned
> > > to the host OS after being passed through and this happens long after
> > > the host OS booted. It is quite possible to trigger the it_map allocation
> > > panic() and kill the host even though it is not necessary - the host OS
> > > can still use the DMA bypass mode (requires a tiny fraction of it_map's
> > > memory) and even if that fails, the host OS is runnnable as it was without
> > > the device for which allocating it_map causes the panic.
> > > 
> > > Instead of immediately crashing in a powernv/ioda2 system, this prints
> > > an error and continues. All other platforms still call panic().
> > > 
> > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> > 
> > Hello Alexey,
> > 
> > This looks like a good change, that passes panic() decision to platform
> > code. Everything looks pretty straightforward, but I have a question
> > regarding this:
> > 
> > > @@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
> > >   		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
> > >   		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
> > >   	}
> > > -	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
> > > -	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> > > 
> > > +	if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
> > > +		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> > > +	else
> > > +		rc = -ENOMEM;
> > >   	if (rc) {
> > > -		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
> > > -				rc);
> > > +		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
> > >   		iommu_tce_table_put(tbl);
> > > -		return rc;
> > > +		tbl = NULL; /* This clears iommu_table_base below */
> > >   	}
> > > -
> > >   	if (!pnv_iommu_bypass_disabled)
> > >   		pnv_pci_ioda2_set_bypass(pe, true);
> > >   
> > > 
> > > 
> > > 
> > > 
> > 
> > If I could understand correctly, previously if iommu_init_table() did
> > not panic(), and pnv_pci_ioda2_set_window() returned something other
> > than 0, it would return rc in the if (rc) clause, but now it does not
> > happen anymore, going through if (!pnv_iommu_bypass_disabled) onwards.
> > 
> > Is that desired?
> 
> 
> Yes. A PE (==device, pretty much) has 2 DMA windows:
> - the default one which requires some RAM to operate
> - a bypass mode which tells the hardware that PCI addresses are 
> statically mapped to RAM 1:1.
> 
> This bypass mode does not require extra memory to work and is used in 
> the most cases on the bare metal as long as the device supports 64bit 
> DMA which is everything except GPUs. Since it is cheap to enable and 
> this what we prefer anyway, no urge to fail.
> 
> 
> > As far as I could see, returning rc there seems a good procedure after
> > iommu_init_table returning -ENOMEM.
> 
> This change is intentional and yes it could be done by a separate patch 
> but I figured there is no that much value in splitting.

Ok then, thanks for clarifying.
FWIW:

Reviewed-by: Leonardo Bras <leobras.c@gmail.com>



^ permalink raw reply

* Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.12-1 tag
From: Rob Herring @ 2021-02-22 17:41 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: ananth, Alexey Kardashevskiy, kernelfans, cmr, zhengyongjun3,
	Oliver O'Halloran, sandipan, cy.fan, SF Markus Elfring,
	jiapeng.chong, skirmisher, Florian Fainelli, Bhaskar Chowdhury,
	eerykitty, Aneesh Kumar K.V, Haren Myneni, ganeshgr,
	Michal Suchanek, nathanl, kjain, Nicholas Piggin, Mark Brown,
	Qian Cai, Cédric Le Goater, Hari Bathini, atrajeev,
	Linus Torvalds, Randy Dunlap, linux-kernel@vger.kernel.org,
	fbarrat, po-hsu.lin, linuxppc-dev
In-Reply-To: <87o8gctii6.fsf@mpe.ellerman.id.au>

On Mon, Feb 22, 2021 at 6:05 AM Michael Ellerman <mpe@ellerman.id.au> wrote:
>
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA256
>
> Hi Linus,
>
> Please pull powerpc updates for 5.12.
>
> There will be a conflict with the devicetree tree. It's OK to just take their
> side of the conflict, we'll fix up the minor behaviour change that causes in a
> follow-up patch.

The issues turned out to be worse than just this, so I've dropped the
conflicting change for 5.12.

Rob

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox