LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v5 07/10] powerpc/signal64: Replace restore_sigcontext() w/ unsafe_restore_sigcontext()
From: Christopher M. Riedl @ 2021-02-17 19:32 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev
In-Reply-To: <87y2ftc7el.fsf@dja-thinkpad.axtens.net>

On Fri Feb 12, 2021 at 3:17 PM CST, Daniel Axtens wrote:
> Hi Chris,
>
> > Previously restore_sigcontext() performed a costly KUAP switch on every
> > uaccess operation. These repeated uaccess switches cause a significant
> > drop in signal handling performance.
> >
> > Rewrite restore_sigcontext() to assume that a userspace read access
> > window is open. Replace all uaccess functions with their 'unsafe'
> > versions which avoid the repeated uaccess switches.
> >
>
> Much of the same comments apply here as to the last patch:
> - the commit message might be improved by adding that you are also
> changing the calling functions to open the uaccess window before
> calling into the new unsafe functions
>
> - I have checked that the safe to unsafe conversions look right.
>
> - I think you're opening too wide a window in user_read_access_begin,
> it seems to me that it could be reduced to just the
> uc_mcontext. (Again, not that it makes a difference with the current
> HW.)

Ok, I'll fix these in the next version as well.

>
> Kind regards,
> Daniel
>
> > Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
> > ---
> >  arch/powerpc/kernel/signal_64.c | 68 ++++++++++++++++++++-------------
> >  1 file changed, 41 insertions(+), 27 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> > index 4248e4489ff1..d668f8af18fe 100644
> > --- a/arch/powerpc/kernel/signal_64.c
> > +++ b/arch/powerpc/kernel/signal_64.c
> > @@ -326,14 +326,14 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
> >  /*
> >   * Restore the sigcontext from the signal frame.
> >   */
> > -
> > -static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
> > -			      struct sigcontext __user *sc)
> > +#define unsafe_restore_sigcontext(tsk, set, sig, sc, e) \
> > +	unsafe_op_wrap(__unsafe_restore_sigcontext(tsk, set, sig, sc), e)
> > +static long notrace __unsafe_restore_sigcontext(struct task_struct *tsk, sigset_t *set,
> > +						int sig, struct sigcontext __user *sc)
> >  {
> >  #ifdef CONFIG_ALTIVEC
> >  	elf_vrreg_t __user *v_regs;
> >  #endif
> > -	unsigned long err = 0;
> >  	unsigned long save_r13 = 0;
> >  	unsigned long msr;
> >  	struct pt_regs *regs = tsk->thread.regs;
> > @@ -348,27 +348,28 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
> >  		save_r13 = regs->gpr[13];
> >  
> >  	/* copy the GPRs */
> > -	err |= __copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr));
> > -	err |= __get_user(regs->nip, &sc->gp_regs[PT_NIP]);
> > +	unsafe_copy_from_user(regs->gpr, sc->gp_regs, sizeof(regs->gpr),
> > +			      efault_out);
> > +	unsafe_get_user(regs->nip, &sc->gp_regs[PT_NIP], efault_out);
> >  	/* get MSR separately, transfer the LE bit if doing signal return */
> > -	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
> > +	unsafe_get_user(msr, &sc->gp_regs[PT_MSR], efault_out);
> >  	if (sig)
> >  		regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
> > -	err |= __get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3]);
> > -	err |= __get_user(regs->ctr, &sc->gp_regs[PT_CTR]);
> > -	err |= __get_user(regs->link, &sc->gp_regs[PT_LNK]);
> > -	err |= __get_user(regs->xer, &sc->gp_regs[PT_XER]);
> > -	err |= __get_user(regs->ccr, &sc->gp_regs[PT_CCR]);
> > +	unsafe_get_user(regs->orig_gpr3, &sc->gp_regs[PT_ORIG_R3], efault_out);
> > +	unsafe_get_user(regs->ctr, &sc->gp_regs[PT_CTR], efault_out);
> > +	unsafe_get_user(regs->link, &sc->gp_regs[PT_LNK], efault_out);
> > +	unsafe_get_user(regs->xer, &sc->gp_regs[PT_XER], efault_out);
> > +	unsafe_get_user(regs->ccr, &sc->gp_regs[PT_CCR], efault_out);
> >  	/* Don't allow userspace to set SOFTE */
> >  	set_trap_norestart(regs);
> > -	err |= __get_user(regs->dar, &sc->gp_regs[PT_DAR]);
> > -	err |= __get_user(regs->dsisr, &sc->gp_regs[PT_DSISR]);
> > -	err |= __get_user(regs->result, &sc->gp_regs[PT_RESULT]);
> > +	unsafe_get_user(regs->dar, &sc->gp_regs[PT_DAR], efault_out);
> > +	unsafe_get_user(regs->dsisr, &sc->gp_regs[PT_DSISR], efault_out);
> > +	unsafe_get_user(regs->result, &sc->gp_regs[PT_RESULT], efault_out);
> >  
> >  	if (!sig)
> >  		regs->gpr[13] = save_r13;
> >  	if (set != NULL)
> > -		err |=  __get_user(set->sig[0], &sc->oldmask);
> > +		unsafe_get_user(set->sig[0], &sc->oldmask, efault_out);
> >  
> >  	/*
> >  	 * Force reload of FP/VEC.
> > @@ -378,29 +379,28 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
> >  	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
> >  
> >  #ifdef CONFIG_ALTIVEC
> > -	err |= __get_user(v_regs, &sc->v_regs);
> > -	if (err)
> > -		return err;
> > +	unsafe_get_user(v_regs, &sc->v_regs, efault_out);
> >  	if (v_regs && !access_ok(v_regs, 34 * sizeof(vector128)))
> >  		return -EFAULT;
> >  	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
> >  	if (v_regs != NULL && (msr & MSR_VEC) != 0) {
> > -		err |= __copy_from_user(&tsk->thread.vr_state, v_regs,
> > -					33 * sizeof(vector128));
> > +		unsafe_copy_from_user(&tsk->thread.vr_state, v_regs,
> > +				      33 * sizeof(vector128), efault_out);
> >  		tsk->thread.used_vr = true;
> >  	} else if (tsk->thread.used_vr) {
> >  		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
> >  	}
> >  	/* Always get VRSAVE back */
> >  	if (v_regs != NULL)
> > -		err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
> > +		unsafe_get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33],
> > +				efault_out);
> >  	else
> >  		tsk->thread.vrsave = 0;
> >  	if (cpu_has_feature(CPU_FTR_ALTIVEC))
> >  		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
> >  #endif /* CONFIG_ALTIVEC */
> >  	/* restore floating point */
> > -	err |= copy_fpr_from_user(tsk, &sc->fp_regs);
> > +	unsafe_copy_fpr_from_user(tsk, &sc->fp_regs, efault_out);
> >  #ifdef CONFIG_VSX
> >  	/*
> >  	 * Get additional VSX data. Update v_regs to point after the
> > @@ -409,14 +409,17 @@ static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
> >  	 */
> >  	v_regs += ELF_NVRREG;
> >  	if ((msr & MSR_VSX) != 0) {
> > -		err |= copy_vsx_from_user(tsk, v_regs);
> > +		unsafe_copy_vsx_from_user(tsk, v_regs, efault_out);
> >  		tsk->thread.used_vsr = true;
> >  	} else {
> >  		for (i = 0; i < 32 ; i++)
> >  			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
> >  	}
> >  #endif
> > -	return err;
> > +	return 0;
> > +
> > +efault_out:
> > +	return -EFAULT;
> >  }
> >  
> >  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> > @@ -701,8 +704,14 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
> >  	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
> >  		do_exit(SIGSEGV);
> >  	set_current_blocked(&set);
> > -	if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext))
> > +
> > +	if (!user_read_access_begin(new_ctx, ctx_size))
> > +		return -EFAULT;
> > +	if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) {
> > +		user_read_access_end();
> >  		do_exit(SIGSEGV);
> > +	}
> > +	user_read_access_end();
> >  
> >  	/* This returns like rt_sigreturn */
> >  	set_thread_flag(TIF_RESTOREALL);
> > @@ -807,8 +816,13 @@ SYSCALL_DEFINE0(rt_sigreturn)
> >  		 * causing a TM bad thing.
> >  		 */
> >  		current->thread.regs->msr &= ~MSR_TS_MASK;
> > -		if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
> > +		if (!user_read_access_begin(uc, sizeof(*uc)))
> > +			return -EFAULT;
> > +		if (__unsafe_restore_sigcontext(current, NULL, 1, &uc->uc_mcontext)) {
> > +			user_read_access_end();
> >  			goto badframe;
> > +		}
> > +		user_read_access_end();
> >  	}
> >  
> >  	if (restore_altstack(&uc->uc_stack))
> > -- 
> > 2.26.1


^ permalink raw reply

* Re: [PATCH v5 06/10] powerpc/signal64: Replace setup_sigcontext() w/ unsafe_setup_sigcontext()
From: Christopher M. Riedl @ 2021-02-17 19:31 UTC (permalink / raw)
  To: Daniel Axtens, linuxppc-dev
In-Reply-To: <871rdletbo.fsf@dja-thinkpad.axtens.net>

On Thu Feb 11, 2021 at 11:41 PM CST, Daniel Axtens wrote:
> Hi Chris,
>
> > Previously setup_sigcontext() performed a costly KUAP switch on every
> > uaccess operation. These repeated uaccess switches cause a significant
> > drop in signal handling performance.
> >
> > Rewrite setup_sigcontext() to assume that a userspace write access window
> > is open. Replace all uaccess functions with their 'unsafe' versions
> > which avoid the repeated uaccess switches.
>
> Just to clarify the commit message a bit: you're also changing the
> callers of the old safe versions to first open the window, then call the
> unsafe variants, then close the window again.

Noted!

>
> > Signed-off-by: Christopher M. Riedl <cmr@codefail.de>
> > ---
> >  arch/powerpc/kernel/signal_64.c | 70 ++++++++++++++++++++-------------
> >  1 file changed, 43 insertions(+), 27 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
> > index 8e1d804ce552..4248e4489ff1 100644
> > --- a/arch/powerpc/kernel/signal_64.c
> > +++ b/arch/powerpc/kernel/signal_64.c
> > @@ -101,9 +101,13 @@ static void prepare_setup_sigcontext(struct task_struct *tsk, int ctx_has_vsx_re
> >   * Set up the sigcontext for the signal frame.
> >   */
> >  
> > -static long setup_sigcontext(struct sigcontext __user *sc,
> > -		struct task_struct *tsk, int signr, sigset_t *set,
> > -		unsigned long handler, int ctx_has_vsx_region)
> > +#define unsafe_setup_sigcontext(sc, tsk, signr, set, handler,		\
> > +				ctx_has_vsx_region, e)			\
> > +	unsafe_op_wrap(__unsafe_setup_sigcontext(sc, tsk, signr, set,	\
> > +			handler, ctx_has_vsx_region), e)
> > +static long notrace __unsafe_setup_sigcontext(struct sigcontext __user *sc,
> > +					struct task_struct *tsk, int signr, sigset_t *set,
> > +					unsigned long handler, int ctx_has_vsx_region)
> >  {
> >  	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
> >  	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
> > @@ -118,20 +122,19 @@ static long setup_sigcontext(struct sigcontext __user *sc,
> >  #endif
> >  	struct pt_regs *regs = tsk->thread.regs;
> >  	unsigned long msr = regs->msr;
> > -	long err = 0;
> >  	/* Force usr to alway see softe as 1 (interrupts enabled) */
> >  	unsigned long softe = 0x1;
> >  
> >  	BUG_ON(tsk != current);
> >  
> >  #ifdef CONFIG_ALTIVEC
> > -	err |= __put_user(v_regs, &sc->v_regs);
> > +	unsafe_put_user(v_regs, &sc->v_regs, efault_out);
> >  
> >  	/* save altivec registers */
> >  	if (tsk->thread.used_vr) {
> >  		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
> > -		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
> > -				      33 * sizeof(vector128));
> > +		unsafe_copy_to_user(v_regs, &tsk->thread.vr_state,
> > +				    33 * sizeof(vector128), efault_out);
> >  		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
> >  		 * contains valid data.
> >  		 */
> > @@ -140,12 +143,12 @@ static long setup_sigcontext(struct sigcontext __user *sc,
> >  	/* We always copy to/from vrsave, it's 0 if we don't have or don't
> >  	 * use altivec.
> >  	 */
> > -	err |= __put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
> > +	unsafe_put_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33], efault_out);
> >  #else /* CONFIG_ALTIVEC */
> > -	err |= __put_user(0, &sc->v_regs);
> > +	unsafe_put_user(0, &sc->v_regs, efault_out);
> >  #endif /* CONFIG_ALTIVEC */
> >  	/* copy fpr regs and fpscr */
> > -	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
> > +	unsafe_copy_fpr_to_user(&sc->fp_regs, tsk, efault_out);
> >  
> >  	/*
> >  	 * Clear the MSR VSX bit to indicate there is no valid state attached
> > @@ -160,24 +163,27 @@ static long setup_sigcontext(struct sigcontext __user *sc,
> >  	 */
> >  	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
> >  		v_regs += ELF_NVRREG;
> > -		err |= copy_vsx_to_user(v_regs, tsk);
> > +		unsafe_copy_vsx_to_user(v_regs, tsk, efault_out);
> >  		/* set MSR_VSX in the MSR value in the frame to
> >  		 * indicate that sc->vs_reg) contains valid data.
> >  		 */
> >  		msr |= MSR_VSX;
> >  	}
> >  #endif /* CONFIG_VSX */
> > -	err |= __put_user(&sc->gp_regs, &sc->regs);
> > +	unsafe_put_user(&sc->gp_regs, &sc->regs, efault_out);
> >  	WARN_ON(!FULL_REGS(regs));
> > -	err |= __copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE);
> > -	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
> > -	err |= __put_user(softe, &sc->gp_regs[PT_SOFTE]);
> > -	err |= __put_user(signr, &sc->signal);
> > -	err |= __put_user(handler, &sc->handler);
> > +	unsafe_copy_to_user(&sc->gp_regs, regs, GP_REGS_SIZE, efault_out);
> > +	unsafe_put_user(msr, &sc->gp_regs[PT_MSR], efault_out);
> > +	unsafe_put_user(softe, &sc->gp_regs[PT_SOFTE], efault_out);
> > +	unsafe_put_user(signr, &sc->signal, efault_out);
> > +	unsafe_put_user(handler, &sc->handler, efault_out);
> >  	if (set != NULL)
> > -		err |=  __put_user(set->sig[0], &sc->oldmask);
> > +		unsafe_put_user(set->sig[0], &sc->oldmask, efault_out);
> >  
> > -	return err;
> > +	return 0;
> > +
> > +efault_out:
> > +	return -EFAULT;
> >  }
> >  
> >  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> > @@ -664,12 +670,15 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
> >  
> >  	if (old_ctx != NULL) {
> >  		prepare_setup_sigcontext(current, ctx_has_vsx_region);
> > -		if (!access_ok(old_ctx, ctx_size)
> > -		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
> > -					ctx_has_vsx_region)
> > -		    || __copy_to_user(&old_ctx->uc_sigmask,
> > -				      &current->blocked, sizeof(sigset_t)))
> > +		if (!user_write_access_begin(old_ctx, ctx_size))
> >  			return -EFAULT;
>
> I notice we get rid of access_ok, but that's fine because
> user_write_access_begin includes access_ok since Linus decided that was
> a good idea.
>
> > +
> > +		unsafe_setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL,
> > +					0, ctx_has_vsx_region, efault_out);
> > +		unsafe_copy_to_user(&old_ctx->uc_sigmask, &current->blocked,
> > +				    sizeof(sigset_t), efault_out);
> > +
> > +		user_write_access_end();
> >  	}
> >  	if (new_ctx == NULL)
> >  		return 0;
> > @@ -698,6 +707,10 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
> >  	/* This returns like rt_sigreturn */
> >  	set_thread_flag(TIF_RESTOREALL);
> >  	return 0;
> > +
> > +efault_out:
> > +	user_write_access_end();
> > +	return -EFAULT;
> >  }
> >  
> >  
> > @@ -850,9 +863,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
> >  	} else {
> >  		err |= __put_user(0, &frame->uc.uc_link);
> >  		prepare_setup_sigcontext(tsk, 1);
> > -		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
> > -					NULL, (unsigned long)ksig->ka.sa.sa_handler,
> > -					1);
> > +		if (!user_write_access_begin(frame, sizeof(struct rt_sigframe)))
> > +			return -EFAULT;
>
> Here you're opening a window for all of `frame`...
>
> > +		err |= __unsafe_setup_sigcontext(&frame->uc.uc_mcontext, tsk,
> ... but here you're only passing in frame->uc.uc_mcontext for writing.
>
> ISTR that the underlying AMR switch is fully on / fully off so I don't
> think it really matters, but in this case should you be calling
> user_write_access_begin() with &frame->uc.uc_mcontext and the size of
> that?

You are correct that it doesn't _really_ matter w/ the current
implementation of KUAP/AMR. It's still inconsistent and could cause
problems in the future so I will fix this to pass the proper size.

>
> > +						ksig->sig, NULL,
> > +						(unsigned long)ksig->ka.sa.sa_handler, 1);
> > +		user_write_access_end();
> >  	}
> >  	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
> >  	if (err)
>
>
> Apart from the size thing, everything looks good to me. I checked that
> all the things you've changed from safe to unsafe pass the same
> parameters, and they all looked good to me.

Thanks!

>
> With those caveats,
> Reviewed-by: Daniel Axtens <dja@axtens.net>
>
> Kind regards,
> Daniel


^ permalink raw reply

* Re: [PATCH kernel 1/2] powerpc/iommu: Allocate it_map by vmalloc
From: Leonardo Bras @ 2021-02-17 19:11 UTC (permalink / raw)
  To: Alexey Kardashevskiy, linuxppc-dev; +Cc: kvm-ppc, David Gibson
In-Reply-To: <20210216033307.69863-2-aik@ozlabs.ru>

On Tue, 2021-02-16 at 14:33 +1100, Alexey Kardashevskiy wrote:
> The IOMMU table uses the it_map bitmap to keep track of allocated DMA
> pages. This has always been a contiguous array allocated at either
> the boot time or when a passed through device is returned to the host OS.
> The it_map memory is allocated by alloc_pages() which allocates
> contiguous physical memory.
> 
> Such allocation method occasionally creates a problem when there is
> no big chunk of memory available (no free memory or too fragmented).
> On powernv/ioda2 the default DMA window requires 16MB for it_map.
> 
> This replaces alloc_pages_node() with vzalloc_node() which allocates
> contiguous block but in virtual memory. This should reduce changes of
> failure but should not cause other behavioral changes as it_map is only
> used by the kernel's DMA hooks/api when MMU is on.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

It looks a very good change, and also makes code much simpler to read.

FWIW:

Reviewed-by: Leonardo Bras <leobras.c@gmail,com>

> ---
>  arch/powerpc/kernel/iommu.c | 15 +++------------
>  1 file changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index c00214a4355c..8eb6eb0afa97 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -719,7 +719,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  {
>  	unsigned long sz;
>  	static int welcomed = 0;
> -	struct page *page;
>  	unsigned int i;
>  	struct iommu_pool *p;
>  
> 
> 
> 
> @@ -728,11 +727,9 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  	/* number of bytes needed for the bitmap */
>  	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
> 
> 
> 
> -	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
> -	if (!page)
> +	tbl->it_map = vzalloc_node(sz, nid);
> +	if (!tbl->it_map)
>  		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> -	tbl->it_map = page_address(page);
> -	memset(tbl->it_map, 0, sz);
>  
> 
> 
> 
>  	iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> 
> 
> 
> @@ -774,8 +771,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  
> 
> 
> 
>  static void iommu_table_free(struct kref *kref)
>  {
> -	unsigned long bitmap_sz;
> -	unsigned int order;
>  	struct iommu_table *tbl;
>  
> 
> 
> 
>  	tbl = container_of(kref, struct iommu_table, it_kref);
> @@ -796,12 +791,8 @@ static void iommu_table_free(struct kref *kref)
>  	if (!bitmap_empty(tbl->it_map, tbl->it_size))
>  		pr_warn("%s: Unexpected TCEs\n", __func__);
>  
> 
> 
> 
> -	/* calculate bitmap size in bytes */
> -	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
> -
>  	/* free bitmap */
> -	order = get_order(bitmap_sz);
> -	free_pages((unsigned long) tbl->it_map, order);
> +	vfree(tbl->it_map);
>  
> 
> 
> 
>  	/* free table */
>  	kfree(tbl);



^ permalink raw reply

* Re: [PATCH 2/2] powerpc/uaccess: Move might_fault() into user_access_begin()
From: Christophe Leroy @ 2021-02-17 18:29 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: aik, linuxppc-dev
In-Reply-To: <87czwzv4io.fsf@mpe.ellerman.id.au>

Michael Ellerman <mpe@ellerman.id.au> a écrit :

> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> Le 08/02/2021 à 14:57, Michael Ellerman a écrit :
>>> We have a might_fault() check in __unsafe_put_user_goto(), but that is
>>> dangerous as it potentially calls lots of code while user access is
>>> enabled.
>>>
>>> It also triggers the check Alexey added to the irq restore path to
>>> catch cases like that:
>>>
>>>    WARNING: CPU: 30 PID: 1 at  
>>> arch/powerpc/include/asm/book3s/64/kup.h:324  
>>> arch_local_irq_restore+0x160/0x190
>>>    NIP arch_local_irq_restore+0x160/0x190
>>>    LR  lock_is_held_type+0x140/0x200
>>>    Call Trace:
>>>      0xc00000007f392ff8 (unreliable)
>>>      ___might_sleep+0x180/0x320
>>>      __might_fault+0x50/0xe0
>>>      filldir64+0x2d0/0x5d0
>>>      call_filldir+0xc8/0x180
>>>      ext4_readdir+0x948/0xb40
>>>      iterate_dir+0x1ec/0x240
>>>      sys_getdents64+0x80/0x290
>>>      system_call_exception+0x160/0x280
>>>      system_call_common+0xf0/0x27c
>>>
>>> So remove the might fault check from unsafe_put_user().
>>>
>>> Any call to unsafe_put_user() must be inside a region that's had user
>>> access enabled with user_access_begin(), so move the might_fault() in
>>> there. That also allows us to drop the is_kernel_addr() test, because
>>> there should be no code using user_access_begin() in order to access a
>>> kernel address.
>>
>> x86 and mips only have might_fault() on get_user() and put_user(),
>> neither on __get_user() nor on __put_user() nor on the unsafe
>> alternative.
>
> Yeah, that's their choice, or perhaps it's by accident.
>
> arm64 on the other hand has might_fault() in all variants.
>
> A __get_user() can fault just as much as a get_user(), so there's no
> reason the check should be omitted from __get_user(), other than perhaps
> some historical argument about __get_user() being the "fast" case.
>
>> When have might_fault() in __get_user_nocheck() that is used by
>> __get_user() and __get_user_allowed() ie by unsafe_get_user().
>>
>> Shoudln't those be dropped as well ?
>
> That was handled by Alexey's patch, which I ended up merging with this
> one:
>
>   https://git.kernel.org/torvalds/c/7d506ca97b665b95e698a53697dad99fae813c1a
>
>
> ie. we still have might_fault() in __get_user_nocheck(), but it's
> guarded by a check of do_allow, so we won't call it for
> __get_user_allowed().
>
> So I think the code (in my next branch) is correct, we don't have any
> might_fault() calls in unsafe regions.
>
> But I'd still be happier if we could simplify our uaccess.h more, it's a
> bit of a rats nest. We could start by making __get/put_user() ==
> get/put_user() the same way arm64 did.

I agree there are several easy simplifications to do there. I'll look  
at that in the coming weeks.

I'm not sure it is good to make __get/put_user equal get/put_user as  
it would mean calling access_ok() everytime. But we could most likely  
make something simpler with get_user() calling access_ok() then  
__get_user().

I think we should also audit our use of the _inatomic variants.  
might_fault() voids when pagefault is disabled so I think the inatomic  
variants should be needed. As far as I can see, powerpc is the only  
arch having that.

Need to also check why we still need that is_kernel_addr() check  
because since the removal of set_fs(), __get/put_user() helpers  
shouldn't be used anymore for kernel addresses

Christophe




^ permalink raw reply

* Re: [PATCH 2/2] powerpc/uaccess: Move might_fault() into user_access_begin()
From: Michael Ellerman @ 2021-02-17  1:58 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev; +Cc: aik
In-Reply-To: <b1d3af99-2d38-0794-0694-95a735fccbe3@csgroup.eu>

Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> Le 08/02/2021 à 14:57, Michael Ellerman a écrit :
>> We have a might_fault() check in __unsafe_put_user_goto(), but that is
>> dangerous as it potentially calls lots of code while user access is
>> enabled.
>> 
>> It also triggers the check Alexey added to the irq restore path to
>> catch cases like that:
>> 
>>    WARNING: CPU: 30 PID: 1 at arch/powerpc/include/asm/book3s/64/kup.h:324 arch_local_irq_restore+0x160/0x190
>>    NIP arch_local_irq_restore+0x160/0x190
>>    LR  lock_is_held_type+0x140/0x200
>>    Call Trace:
>>      0xc00000007f392ff8 (unreliable)
>>      ___might_sleep+0x180/0x320
>>      __might_fault+0x50/0xe0
>>      filldir64+0x2d0/0x5d0
>>      call_filldir+0xc8/0x180
>>      ext4_readdir+0x948/0xb40
>>      iterate_dir+0x1ec/0x240
>>      sys_getdents64+0x80/0x290
>>      system_call_exception+0x160/0x280
>>      system_call_common+0xf0/0x27c
>> 
>> So remove the might fault check from unsafe_put_user().
>> 
>> Any call to unsafe_put_user() must be inside a region that's had user
>> access enabled with user_access_begin(), so move the might_fault() in
>> there. That also allows us to drop the is_kernel_addr() test, because
>> there should be no code using user_access_begin() in order to access a
>> kernel address.
>
> x86 and mips only have might_fault() on get_user() and put_user(),
> neither on __get_user() nor on __put_user() nor on the unsafe
> alternative.

Yeah, that's their choice, or perhaps it's by accident.

arm64 on the other hand has might_fault() in all variants.

A __get_user() can fault just as much as a get_user(), so there's no
reason the check should be omitted from __get_user(), other than perhaps
some historical argument about __get_user() being the "fast" case.

> When have might_fault() in __get_user_nocheck() that is used by
> __get_user() and __get_user_allowed() ie by unsafe_get_user().
>
> Shoudln't those be dropped as well ?

That was handled by Alexey's patch, which I ended up merging with this
one:

  https://git.kernel.org/torvalds/c/7d506ca97b665b95e698a53697dad99fae813c1a


ie. we still have might_fault() in __get_user_nocheck(), but it's
guarded by a check of do_allow, so we won't call it for
__get_user_allowed().

So I think the code (in my next branch) is correct, we don't have any
might_fault() calls in unsafe regions.

But I'd still be happier if we could simplify our uaccess.h more, it's a
bit of a rats nest. We could start by making __get/put_user() ==
get/put_user() the same way arm64 did.

cheers

^ permalink raw reply

* Re: [PATCH v4 1/3] powerpc/book3s64/radix/tlb: tlbie primitives for process-scoped invalidations from guests
From: David Gibson @ 2021-02-17  0:24 UTC (permalink / raw)
  To: Bharata B Rao; +Cc: farosas, aneesh.kumar, npiggin, kvm-ppc, linuxppc-dev
In-Reply-To: <20210215063542.3642366-2-bharata@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 8651 bytes --]

On Mon, Feb 15, 2021 at 12:05:40PM +0530, Bharata B Rao wrote:
> H_RPT_INVALIDATE hcall needs to perform process scoped tlbie
> invalidations of L1 and nested guests from L0. This needs RS register
> for TLBIE instruction to contain both PID and LPID. Introduce
> primitives that execute tlbie instruction with both PID
> and LPID set in prepartion for H_RPT_INVALIDATE hcall.
> 
> While we are here, move RIC_FLUSH definitions to header file
> and introduce helper rpti_pgsize_to_psize() that will be needed
> by the upcoming hcall.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  .../include/asm/book3s/64/tlbflush-radix.h    |  18 +++
>  arch/powerpc/mm/book3s64/radix_tlb.c          | 122 +++++++++++++++++-
>  2 files changed, 136 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index 94439e0cefc9..aace7e9b2397 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -4,6 +4,10 @@
>  
>  #include <asm/hvcall.h>
>  
> +#define RIC_FLUSH_TLB 0
> +#define RIC_FLUSH_PWC 1
> +#define RIC_FLUSH_ALL 2
> +
>  struct vm_area_struct;
>  struct mm_struct;
>  struct mmu_gather;
> @@ -21,6 +25,20 @@ static inline u64 psize_to_rpti_pgsize(unsigned long psize)
>  	return H_RPTI_PAGE_ALL;
>  }
>  
> +static inline int rpti_pgsize_to_psize(unsigned long page_size)
> +{
> +	if (page_size == H_RPTI_PAGE_4K)
> +		return MMU_PAGE_4K;
> +	if (page_size == H_RPTI_PAGE_64K)
> +		return MMU_PAGE_64K;
> +	if (page_size == H_RPTI_PAGE_2M)
> +		return MMU_PAGE_2M;
> +	if (page_size == H_RPTI_PAGE_1G)
> +		return MMU_PAGE_1G;
> +	else
> +		return MMU_PAGE_64K; /* Default */
> +}

Would it make sense to put the H_RPT_PAGE_ tags into the
mmu_psize_defs table and scan that here, rather than open coding the
conversion?

> +
>  static inline int mmu_get_ap(int psize)
>  {
>  	return mmu_psize_defs[psize].ap;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index fb66d154b26c..097402435303 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -18,10 +18,6 @@
>  #include <asm/cputhreads.h>
>  #include <asm/plpar_wrappers.h>
>  
> -#define RIC_FLUSH_TLB 0
> -#define RIC_FLUSH_PWC 1
> -#define RIC_FLUSH_ALL 2
> -
>  /*
>   * tlbiel instruction for radix, set invalidation
>   * i.e., r=1 and is=01 or is=10 or is=11
> @@ -128,6 +124,21 @@ static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
>  	trace_tlbie(0, 0, rb, rs, ric, prs, r);
>  }
>  
> +static __always_inline void __tlbie_pid_lpid(unsigned long pid,
> +					     unsigned long lpid,
> +					     unsigned long ric)
> +{
> +	unsigned long rb, rs, prs, r;
> +
> +	rb = PPC_BIT(53); /* IS = 1 */
> +	rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
> +	prs = 1; /* process scoped */
> +	r = 1;   /* radix format */
> +
> +	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
> +		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
> +	trace_tlbie(0, 0, rb, rs, ric, prs, r);
> +}
>  static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
>  {
>  	unsigned long rb,rs,prs,r;
> @@ -188,6 +199,23 @@ static __always_inline void __tlbie_va(unsigned long va, unsigned long pid,
>  	trace_tlbie(0, 0, rb, rs, ric, prs, r);
>  }
>  
> +static __always_inline void __tlbie_va_lpid(unsigned long va, unsigned long pid,
> +					    unsigned long lpid,
> +					    unsigned long ap, unsigned long ric)
> +{
> +	unsigned long rb, rs, prs, r;
> +
> +	rb = va & ~(PPC_BITMASK(52, 63));
> +	rb |= ap << PPC_BITLSHIFT(58);
> +	rs = (pid << PPC_BITLSHIFT(31)) | (lpid & ~(PPC_BITMASK(0, 31)));
> +	prs = 1; /* process scoped */
> +	r = 1;   /* radix format */
> +
> +	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
> +		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
> +	trace_tlbie(0, 0, rb, rs, ric, prs, r);
> +}
> +
>  static __always_inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  					    unsigned long ap, unsigned long ric)
>  {
> @@ -233,6 +261,22 @@ static inline void fixup_tlbie_va_range(unsigned long va, unsigned long pid,
>  	}
>  }
>  
> +static inline void fixup_tlbie_va_range_lpid(unsigned long va,
> +					     unsigned long pid,
> +					     unsigned long lpid,
> +					     unsigned long ap)
> +{
> +	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
> +		asm volatile("ptesync" : : : "memory");
> +		__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
> +	}
> +
> +	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
> +		asm volatile("ptesync" : : : "memory");
> +		__tlbie_va_lpid(va, pid, lpid, ap, RIC_FLUSH_TLB);
> +	}
> +}
> +
>  static inline void fixup_tlbie_pid(unsigned long pid)
>  {
>  	/*
> @@ -252,6 +296,25 @@ static inline void fixup_tlbie_pid(unsigned long pid)
>  	}
>  }
>  
> +static inline void fixup_tlbie_pid_lpid(unsigned long pid, unsigned long lpid)
> +{
> +	/*
> +	 * We can use any address for the invalidation, pick one which is
> +	 * probably unused as an optimisation.
> +	 */
> +	unsigned long va = ((1UL << 52) - 1);
> +
> +	if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
> +		asm volatile("ptesync" : : : "memory");
> +		__tlbie_pid_lpid(0, lpid, RIC_FLUSH_TLB);
> +	}
> +
> +	if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
> +		asm volatile("ptesync" : : : "memory");
> +		__tlbie_va_lpid(va, pid, lpid, mmu_get_ap(MMU_PAGE_64K),
> +				RIC_FLUSH_TLB);
> +	}
> +}
>  
>  static inline void fixup_tlbie_lpid_va(unsigned long va, unsigned long lpid,
>  				       unsigned long ap)
> @@ -342,6 +405,31 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_pid_lpid(unsigned long pid, unsigned long lpid,
> +				   unsigned long ric)
> +{
> +	asm volatile("ptesync" : : : "memory");
> +
> +	/*
> +	 * Workaround the fact that the "ric" argument to __tlbie_pid
> +	 * must be a compile-time contraint to match the "i" constraint
> +	 * in the asm statement.
> +	 */
> +	switch (ric) {
> +	case RIC_FLUSH_TLB:
> +		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> +		fixup_tlbie_pid_lpid(pid, lpid);
> +		break;
> +	case RIC_FLUSH_PWC:
> +		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +		break;
> +	case RIC_FLUSH_ALL:
> +	default:
> +		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> +		fixup_tlbie_pid_lpid(pid, lpid);
> +	}
> +	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
>  struct tlbiel_pid {
>  	unsigned long pid;
>  	unsigned long ric;
> @@ -467,6 +555,20 @@ static inline void __tlbie_va_range(unsigned long start, unsigned long end,
>  	fixup_tlbie_va_range(addr - page_size, pid, ap);
>  }
>  
> +static inline void __tlbie_va_range_lpid(unsigned long start, unsigned long end,
> +					 unsigned long pid, unsigned long lpid,
> +					 unsigned long page_size,
> +					 unsigned long psize)
> +{
> +	unsigned long addr;
> +	unsigned long ap = mmu_get_ap(psize);
> +
> +	for (addr = start; addr < end; addr += page_size)
> +		__tlbie_va_lpid(addr, pid, lpid, ap, RIC_FLUSH_TLB);
> +
> +	fixup_tlbie_va_range_lpid(addr - page_size, pid, lpid, ap);
> +}
> +
>  static __always_inline void _tlbie_va(unsigned long va, unsigned long pid,
>  				      unsigned long psize, unsigned long ric)
>  {
> @@ -547,6 +649,18 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
>  	asm volatile("eieio; tlbsync; ptesync": : :"memory");
>  }
>  
> +static inline void _tlbie_va_range_lpid(unsigned long start, unsigned long end,
> +					unsigned long pid, unsigned long lpid,
> +					unsigned long page_size,
> +					unsigned long psize, bool also_pwc)
> +{
> +	asm volatile("ptesync" : : : "memory");
> +	if (also_pwc)
> +		__tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +	__tlbie_va_range_lpid(start, end, pid, lpid, page_size, psize);
> +	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
> +}
> +
>  static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
>  				unsigned long start, unsigned long end,
>  				unsigned long pid, unsigned long page_size,

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation
From: David Gibson @ 2021-02-17  0:16 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20210216033307.69863-3-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 7551 bytes --]

On Tue, Feb 16, 2021 at 02:33:07PM +1100, Alexey Kardashevskiy wrote:
> Most platforms allocate IOMMU table structures (specifically it_map)
> at the boot time and when this fails - it is a valid reason for panic().
> 
> However the powernv platform allocates it_map after a device is returned
> to the host OS after being passed through and this happens long after
> the host OS booted. It is quite possible to trigger the it_map allocation
> panic() and kill the host even though it is not necessary - the host OS
> can still use the DMA bypass mode (requires a tiny fraction of it_map's
> memory) and even if that fails, the host OS is runnnable as it was without
> the device for which allocating it_map causes the panic.
> 
> Instead of immediately crashing in a powernv/ioda2 system, this prints
> an error and continues. All other platforms still call panic().
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/kernel/iommu.c               |  6 ++++--
>  arch/powerpc/platforms/cell/iommu.c       |  3 ++-
>  arch/powerpc/platforms/pasemi/iommu.c     |  4 +++-
>  arch/powerpc/platforms/powernv/pci-ioda.c | 15 ++++++++-------
>  arch/powerpc/platforms/pseries/iommu.c    | 10 +++++++---
>  arch/powerpc/sysdev/dart_iommu.c          |  3 ++-
>  6 files changed, 26 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 8eb6eb0afa97..c1a5c366a664 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -728,8 +728,10 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
>  	tbl->it_map = vzalloc_node(sz, nid);
> -	if (!tbl->it_map)
> -		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> +	if (!tbl->it_map) {
> +		pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
> +		return NULL;
> +	}
>  
>  	iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
> index 2124831cf57c..fa08699aedeb 100644
> --- a/arch/powerpc/platforms/cell/iommu.c
> +++ b/arch/powerpc/platforms/cell/iommu.c
> @@ -486,7 +486,8 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
>  	window->table.it_size = size >> window->table.it_page_shift;
>  	window->table.it_ops = &cell_iommu_ops;
>  
> -	iommu_init_table(&window->table, iommu->nid, 0, 0);
> +	if (!iommu_init_table(&window->table, iommu->nid, 0, 0))
> +		panic("Failed to initialize iommu table");
>  
>  	pr_debug("\tioid      %d\n", window->ioid);
>  	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
> diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
> index b500a6e47e6b..5be7242fbd86 100644
> --- a/arch/powerpc/platforms/pasemi/iommu.c
> +++ b/arch/powerpc/platforms/pasemi/iommu.c
> @@ -146,7 +146,9 @@ static void iommu_table_iobmap_setup(void)
>  	 */
>  	iommu_table_iobmap.it_blocksize = 4;
>  	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
> -	iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
> +	if (!iommu_init_table(&iommu_table_iobmap, 0, 0, 0))
> +		panic("Failed to initialize iommu table");
> +
>  	pr_debug(" <- %s\n", __func__);
>  }
>  
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index f0f901683a2f..66c3c3337334 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -1762,7 +1762,8 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
>  	tbl->it_ops = &pnv_ioda1_iommu_ops;
>  	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
>  	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
> -	iommu_init_table(tbl, phb->hose->node, 0, 0);
> +	if (!iommu_init_table(tbl, phb->hose->node, 0, 0))
> +		panic("Failed to initialize iommu table");
>  
>  	pe->dma_setup_done = true;
>  	return;
> @@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
>  		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
>  		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
>  	}
> -	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
>  
> -	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> +	if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
> +		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
> +	else
> +		rc = -ENOMEM;
>  	if (rc) {
> -		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
> -				rc);
> +		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
>  		iommu_tce_table_put(tbl);
> -		return rc;
> +		tbl = NULL; /* This clears iommu_table_base below */
>  	}
> -
>  	if (!pnv_iommu_bypass_disabled)
>  		pnv_pci_ioda2_set_bypass(pe, true);
>  
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 9fc5217f0c8e..4d9ac1f181c2 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -638,7 +638,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
>  
>  	iommu_table_setparms(pci->phb, dn, tbl);
>  	tbl->it_ops = &iommu_table_pseries_ops;
> -	iommu_init_table(tbl, pci->phb->node, 0, 0);
> +	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
> +		panic("Failed to initialize iommu table");
>  
>  	/* Divide the rest (1.75GB) among the children */
>  	pci->phb->dma_window_size = 0x80000000ul;
> @@ -720,7 +721,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
>  		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
>  				ppci->table_group, dma_window);
>  		tbl->it_ops = &iommu_table_lpar_multi_ops;
> -		iommu_init_table(tbl, ppci->phb->node, 0, 0);
> +		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
> +			panic("Failed to initialize iommu table");
>  		iommu_register_group(ppci->table_group,
>  				pci_domain_nr(bus), 0);
>  		pr_debug("  created table: %p\n", ppci->table_group);
> @@ -749,7 +751,9 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>  		tbl = PCI_DN(dn)->table_group->tables[0];
>  		iommu_table_setparms(phb, dn, tbl);
>  		tbl->it_ops = &iommu_table_pseries_ops;
> -		iommu_init_table(tbl, phb->node, 0, 0);
> +		if (!iommu_init_table(tbl, phb->node, 0, 0))
> +			panic("Failed to initialize iommu table");
> +
>  		set_iommu_table_base(&dev->dev, tbl);
>  		return;
>  	}
> diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
> index 6b4a34b36d98..1d33b7a5ea83 100644
> --- a/arch/powerpc/sysdev/dart_iommu.c
> +++ b/arch/powerpc/sysdev/dart_iommu.c
> @@ -344,7 +344,8 @@ static void iommu_table_dart_setup(void)
>  	iommu_table_dart.it_index = 0;
>  	iommu_table_dart.it_blocksize = 1;
>  	iommu_table_dart.it_ops = &iommu_dart_ops;
> -	iommu_init_table(&iommu_table_dart, -1, 0, 0);
> +	if (!iommu_init_table(&iommu_table_dart, -1, 0, 0))
> +		panic("Failed to initialize iommu table");
>  
>  	/* Reserve the last page of the DART to avoid possible prefetch
>  	 * past the DART mapped area

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v4 2/3] KVM: PPC: Book3S HV: Add support for H_RPT_INVALIDATE
From: David Gibson @ 2021-02-17  0:38 UTC (permalink / raw)
  To: Bharata B Rao; +Cc: farosas, aneesh.kumar, npiggin, kvm-ppc, linuxppc-dev
In-Reply-To: <20210215063542.3642366-3-bharata@linux.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 14794 bytes --]

On Mon, Feb 15, 2021 at 12:05:41PM +0530, Bharata B Rao wrote:
> Implement H_RPT_INVALIDATE hcall and add KVM capability
> KVM_CAP_PPC_RPT_INVALIDATE to indicate the support for the same.
> 
> This hcall does two types of TLB invalidations:
> 
> 1. Process-scoped invalidations for guests with LPCR[GTSE]=0.
>    This is currently not used in KVM as GTSE is not usually
>    disabled in KVM.
> 2. Partition-scoped invalidations that an L1 hypervisor does on
>    behalf of an L2 guest. This replaces the uses of the existing
>    hcall H_TLB_INVALIDATE.
> 
> In order to handle process scoped invalidations of L2, we
> intercept the nested exit handling code in L0 only to handle
> H_TLB_INVALIDATE hcall.
> 
> Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
> ---
>  Documentation/virt/kvm/api.rst         | 17 +++++
>  arch/powerpc/include/asm/kvm_book3s.h  |  3 +
>  arch/powerpc/include/asm/mmu_context.h | 11 +++
>  arch/powerpc/kvm/book3s_hv.c           | 91 ++++++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv_nested.c    | 96 ++++++++++++++++++++++++++
>  arch/powerpc/kvm/powerpc.c             |  3 +
>  arch/powerpc/mm/book3s64/radix_tlb.c   | 25 +++++++
>  include/uapi/linux/kvm.h               |  1 +
>  8 files changed, 247 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 99ceb978c8b0..416c36aa35d4 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6038,6 +6038,23 @@ KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR exit notifications which user space
>  can then handle to implement model specific MSR handling and/or user notifications
>  to inform a user that an MSR was not handled.
>  
> +7.22 KVM_CAP_PPC_RPT_INVALIDATE
> +------------------------------
> +
> +:Capability: KVM_CAP_PPC_RPT_INVALIDATE
> +:Architectures: ppc
> +:Type: vm
> +
> +This capability indicates that the kernel is capable of handling
> +H_RPT_INVALIDATE hcall.
> +
> +In order to enable the use of H_RPT_INVALIDATE in the guest,
> +user space might have to advertise it for the guest. For example,
> +IBM pSeries (sPAPR) guest starts using it if "hcall-rpt-invalidate" is
> +present in the "ibm,hypertas-functions" device-tree property.
> +
> +This capability is always enabled.

I guess that means it's always enabled when it's available - I'm
pretty sure it won't be enabled on POWER8 or on PR KVM.

> +
>  8. Other capabilities.
>  ======================
>  
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index d32ec9ae73bd..0f1c5fa6e8ce 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -298,6 +298,9 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
>  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
>  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> +long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
> +			 unsigned long type, unsigned long pg_sizes,
> +			 unsigned long start, unsigned long end);
>  int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
>  			  u64 time_limit, unsigned long lpcr);
>  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index d5821834dba9..fbf3b5b45fe9 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -124,8 +124,19 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea)
>  
>  #if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU)
>  extern void radix_kvm_prefetch_workaround(struct mm_struct *mm);
> +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> +			 unsigned long type, unsigned long page_size,
> +			 unsigned long psize, unsigned long start,
> +			 unsigned long end);
>  #else
>  static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { }
> +static inline void do_h_rpt_invalidate(unsigned long pid,
> +				       unsigned long lpid,
> +				       unsigned long type,
> +				       unsigned long page_size,
> +				       unsigned long psize,
> +				       unsigned long start,
> +				       unsigned long end) { }
>  #endif
>  
>  extern void switch_cop(struct mm_struct *next);
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6f612d240392..802cb77c39cc 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -904,6 +904,64 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
>  	return yield_count;
>  }
>  
> +static void do_h_rpt_invalidate_prs(unsigned long pid, unsigned long lpid,
> +				    unsigned long type, unsigned long pg_sizes,
> +				    unsigned long start, unsigned long end)
> +{
> +	unsigned long psize;
> +
> +	if (pg_sizes & H_RPTI_PAGE_64K) {
> +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_64K);
> +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 16), psize,
> +				    start, end);
> +	}
> +
> +	if (pg_sizes & H_RPTI_PAGE_2M) {
> +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_2M);
> +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 21), psize,
> +				    start, end);
> +	}
> +
> +	if (pg_sizes & H_RPTI_PAGE_1G) {
> +		psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
> +		do_h_rpt_invalidate(pid, lpid, type, (1UL << 30), psize,
> +				    start, end);
> +	}

Hrm.  Here you're stepping through the hcall defined pagesizes, then
mapping each one to the Linux internal page size defs.

It might be more elegant to step through mmu_psize_defs table, and
conditionally performan an invalidate on that pagesize if the
corresponding bit in pg_sizes is set (as noted earlier you could
easily add the H_RPTI_PAGE bit to the table).  That way it's a direct
table lookup rather than a bunch of ifs or switches.

> +}
> +
> +static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
> +				    unsigned long pid, unsigned long target,
> +				    unsigned long type, unsigned long pg_sizes,
> +				    unsigned long start, unsigned long end)
> +{
> +	if (!kvm_is_radix(vcpu->kvm))
> +		return H_UNSUPPORTED;
> +
> +	if (kvmhv_on_pseries())
> +		return H_UNSUPPORTED;

This doesn't seem quite right.  If you have multiply nested guests,
won't the L2 be issueing H_RPT_INVALIDATE hcalls to the L1 on behalf
of the L3?  The L1 would have to implement them by calling the L0, but
the L1 can't just reject them, no?

Likewise for the !H_RPTI_TYPE_NESTED case, but on what happens to be a
nested guest in any case, couldn't this case legitimately arise and
need to be handled?

> +
> +	if (end < start)
> +		return H_P5;
> +
> +	if (type & H_RPTI_TYPE_NESTED) {
> +		if (!nesting_enabled(vcpu->kvm))
> +			return H_FUNCTION;
> +
> +		/* Support only cores as target */
> +		if (target != H_RPTI_TARGET_CMMU)
> +			return H_P2;
> +
> +		return kvmhv_h_rpti_nested(vcpu, pid,
> +					   (type & ~H_RPTI_TYPE_NESTED),
> +					    pg_sizes, start, end);
> +	}
> +
> +	do_h_rpt_invalidate_prs(pid, vcpu->kvm->arch.lpid, type, pg_sizes,
> +				start, end);
> +	return H_SUCCESS;
> +}
> +
>  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  {
>  	unsigned long req = kvmppc_get_gpr(vcpu, 3);
> @@ -1112,6 +1170,14 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		 */
>  		ret = kvmppc_h_svm_init_abort(vcpu->kvm);
>  		break;
> +	case H_RPT_INVALIDATE:
> +		ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
> +					      kvmppc_get_gpr(vcpu, 5),
> +					      kvmppc_get_gpr(vcpu, 6),
> +					      kvmppc_get_gpr(vcpu, 7),
> +					      kvmppc_get_gpr(vcpu, 8),
> +					      kvmppc_get_gpr(vcpu, 9));
> +		break;
>  
>  	default:
>  		return RESUME_HOST;
> @@ -1158,6 +1224,7 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
>  	case H_XIRR_X:
>  #endif
>  	case H_PAGE_INIT:
> +	case H_RPT_INVALIDATE:
>  		return 1;
>  	}
>  
> @@ -1573,6 +1640,30 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
>  		if (!xics_on_xive())
>  			kvmppc_xics_rm_complete(vcpu, 0);
>  		break;
> +	case BOOK3S_INTERRUPT_SYSCALL:
> +	{
> +		unsigned long req = kvmppc_get_gpr(vcpu, 3);
> +
> +		if (req != H_RPT_INVALIDATE) {
> +			r = RESUME_HOST;
> +			break;
> +		}
> +
> +		/*
> +		 * The H_RPT_INVALIDATE hcalls issued by nested
> +		 * guest for process scoped invalidations when
> +		 * GTSE=0 are handled here.
> +		 */
> +		do_h_rpt_invalidate_prs(kvmppc_get_gpr(vcpu, 4),
> +					vcpu->arch.nested->shadow_lpid,
> +					kvmppc_get_gpr(vcpu, 5),
> +					kvmppc_get_gpr(vcpu, 6),
> +					kvmppc_get_gpr(vcpu, 7),
> +					kvmppc_get_gpr(vcpu, 8));
> +		kvmppc_set_gpr(vcpu, 3, H_SUCCESS);
> +		r = RESUME_GUEST;
> +		break;
> +	}
>  	default:
>  		r = RESUME_HOST;
>  		break;
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 33b58549a9aa..40ed4eb80adb 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -1149,6 +1149,102 @@ long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
>  	return H_SUCCESS;
>  }
>  
> +static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
> +					 unsigned long lpid,
> +					 unsigned long page_size,
> +					 unsigned long ap,
> +					 unsigned long start,
> +					 unsigned long end)
> +{
> +	unsigned long addr = start;
> +	int ret;
> +
> +	do {
> +		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
> +						   get_epn(addr));
> +		if (ret)
> +			return ret;
> +		addr += page_size;
> +	} while (addr < end);
> +
> +	return ret;
> +}
> +
> +static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
> +					 unsigned long lpid)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	struct kvm_nested_guest *gp;
> +
> +	gp = kvmhv_get_nested(kvm, lpid, false);
> +	if (gp) {
> +		kvmhv_emulate_tlbie_lpid(vcpu, gp, RIC_FLUSH_ALL);
> +		kvmhv_put_nested(gp);
> +	}
> +	return H_SUCCESS;
> +}
> +
> +long kvmhv_h_rpti_nested(struct kvm_vcpu *vcpu, unsigned long lpid,
> +			 unsigned long type, unsigned long pg_sizes,
> +			 unsigned long start, unsigned long end)
> +{
> +	struct kvm_nested_guest *gp;
> +	long ret;
> +	unsigned long psize, ap;
> +
> +	/*
> +	 * If L2 lpid isn't valid, we need to return H_PARAMETER.
> +	 *
> +	 * However, nested KVM issues a L2 lpid flush call when creating
> +	 * partition table entries for L2. This happens even before the
> +	 * corresponding shadow lpid is created in HV which happens in
> +	 * H_ENTER_NESTED call. Since we can't differentiate this case from
> +	 * the invalid case, we ignore such flush requests and return success.
> +	 */
> +	gp = kvmhv_find_nested(vcpu->kvm, lpid);
> +	if (!gp)
> +		return H_SUCCESS;
> +
> +	if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
> +		return do_tlb_invalidate_nested_all(vcpu, lpid);
> +
> +	if ((type & H_RPTI_TYPE_TLB) == H_RPTI_TYPE_TLB) {
> +		if (pg_sizes & H_RPTI_PAGE_64K) {
> +			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_64K);
> +			ap = mmu_get_ap(psize);
> +
> +			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
> +							   (1UL << 16),
> +							   ap, start, end);
> +			if (ret)
> +				return H_P4;
> +		}
> +
> +		if (pg_sizes & H_RPTI_PAGE_2M) {
> +			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_2M);
> +			ap = mmu_get_ap(psize);
> +
> +			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
> +							   (1UL << 21),
> +							   ap, start, end);
> +			if (ret)
> +				return H_P4;
> +		}
> +
> +		if (pg_sizes & H_RPTI_PAGE_1G) {
> +			psize = rpti_pgsize_to_psize(pg_sizes & H_RPTI_PAGE_1G);
> +			ap = mmu_get_ap(psize);
> +
> +			ret = do_tlb_invalidate_nested_tlb(vcpu, lpid,
> +							   (1UL << 30),
> +							   ap, start, end);
> +			if (ret)
> +				return H_P4;
> +		}

Again it might be more elegant to step through the pagesizes from the
mmu_psize_defs side, rather than from the pg_sizes side.

> +	}
> +	return H_SUCCESS;
> +}
> +
>  /* Used to convert a nested guest real address to a L1 guest real address */
>  static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
>  				       struct kvm_nested_guest *gp,
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index cf52d26f49cd..5388cd4a206a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -678,6 +678,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  		r = hv_enabled && kvmppc_hv_ops->enable_svm &&
>  			!kvmppc_hv_ops->enable_svm(NULL);
>  		break;
> +	case KVM_CAP_PPC_RPT_INVALIDATE:
> +		r = 1;
> +		break;
>  #endif
>  	default:
>  		r = 0;
> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 097402435303..4f746d34b420 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -1400,4 +1400,29 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm)
>  	}
>  }
>  EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround);
> +
> +/*
> + * Process-scoped invalidations for a given LPID.
> + */
> +void do_h_rpt_invalidate(unsigned long pid, unsigned long lpid,
> +			 unsigned long type, unsigned long page_size,
> +			 unsigned long psize, unsigned long start,
> +			 unsigned long end)
> +{
> +	if ((type & H_RPTI_TYPE_ALL) == H_RPTI_TYPE_ALL) {
> +		_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_ALL);
> +		return;
> +	}
> +
> +	if (type & H_RPTI_TYPE_PWC)
> +		_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_PWC);
> +
> +	if (!start && end == -1) /* PID */
> +		_tlbie_pid_lpid(pid, lpid, RIC_FLUSH_TLB);
> +	else /* EA */
> +		_tlbie_va_range_lpid(start, end, pid, lpid, page_size,
> +				     psize, false);
> +}
> +EXPORT_SYMBOL_GPL(do_h_rpt_invalidate);
> +
>  #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 374c67875cdb..6fd530fae452 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1058,6 +1058,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190
>  #define KVM_CAP_SYS_HYPERV_CPUID 191
>  #define KVM_CAP_DIRTY_LOG_RING 192
> +#define KVM_CAP_PPC_RPT_INVALIDATE 193
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH kernel 1/2] powerpc/iommu: Allocate it_map by vmalloc
From: David Gibson @ 2021-02-17  0:16 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, kvm-ppc
In-Reply-To: <20210216033307.69863-2-aik@ozlabs.ru>

[-- Attachment #1: Type: text/plain, Size: 3078 bytes --]

On Tue, Feb 16, 2021 at 02:33:06PM +1100, Alexey Kardashevskiy wrote:
> The IOMMU table uses the it_map bitmap to keep track of allocated DMA
> pages. This has always been a contiguous array allocated at either
> the boot time or when a passed through device is returned to the host OS.
> The it_map memory is allocated by alloc_pages() which allocates
> contiguous physical memory.
> 
> Such allocation method occasionally creates a problem when there is
> no big chunk of memory available (no free memory or too fragmented).
> On powernv/ioda2 the default DMA window requires 16MB for it_map.
> 
> This replaces alloc_pages_node() with vzalloc_node() which allocates
> contiguous block but in virtual memory. This should reduce changes of
> failure but should not cause other behavioral changes as it_map is only
> used by the kernel's DMA hooks/api when MMU is on.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  arch/powerpc/kernel/iommu.c | 15 +++------------
>  1 file changed, 3 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index c00214a4355c..8eb6eb0afa97 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -719,7 +719,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  {
>  	unsigned long sz;
>  	static int welcomed = 0;
> -	struct page *page;
>  	unsigned int i;
>  	struct iommu_pool *p;
>  
> @@ -728,11 +727,9 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  	/* number of bytes needed for the bitmap */
>  	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
>  
> -	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
> -	if (!page)
> +	tbl->it_map = vzalloc_node(sz, nid);
> +	if (!tbl->it_map)
>  		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
> -	tbl->it_map = page_address(page);
> -	memset(tbl->it_map, 0, sz);
>  
>  	iommu_table_reserve_pages(tbl, res_start, res_end);
>  
> @@ -774,8 +771,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
>  
>  static void iommu_table_free(struct kref *kref)
>  {
> -	unsigned long bitmap_sz;
> -	unsigned int order;
>  	struct iommu_table *tbl;
>  
>  	tbl = container_of(kref, struct iommu_table, it_kref);
> @@ -796,12 +791,8 @@ static void iommu_table_free(struct kref *kref)
>  	if (!bitmap_empty(tbl->it_map, tbl->it_size))
>  		pr_warn("%s: Unexpected TCEs\n", __func__);
>  
> -	/* calculate bitmap size in bytes */
> -	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
> -
>  	/* free bitmap */
> -	order = get_order(bitmap_sz);
> -	free_pages((unsigned long) tbl->it_map, order);
> +	vfree(tbl->it_map);
>  
>  	/* free table */
>  	kfree(tbl);

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH 1/4] add generic builtin command line
From: Daniel Gimpelevich @ 2021-02-16 21:20 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Christophe Leroy, xe-linux-external, linux-kernel, Rob Herring,
	Paul Mackerras, Maksym Kokhan, Daniel Walker, Andrew Morton,
	linuxppc-dev, Daniel Walker
In-Reply-To: <20210216184247.Horde.If3nEUb5zLh4eU_4qXZCAw1@messagerie.c-s.fr>

On Tue, 2021-02-16 at 18:42 +0100, Christophe Leroy wrote:
> I'd suggest also to find the good arguments to convince us that this  
> series has a real added value, not just "cisco use it in its kernels  
> so it is good".

Well, IIRC, this series was endorsed by the device tree maintainers as
the preferred alternative to this:

https://lore.kernel.org/linux-devicetree/1565020400-25679-1-git-send-email-daniel@gimpelevich.san-francisco.ca.us/T/#u

The now-defunct patchwork.linux-mips.org link in that thread pointed to:

https://lore.kernel.org/linux-mips/1510796793.16864.25.camel@chimera/T/#u

When running modern kernels from ancient vendor bootloaders, it is
sometimes necessary to pick and choose bits and pieces of the info they
pass without taking it verbatim.


^ permalink raw reply

* Re: [PATCH 4/4] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
From: Tyrel Datwyler @ 2021-02-16 18:43 UTC (permalink / raw)
  To: Brian King, james.bottomley
  Cc: brking, linuxppc-dev, linux-scsi, martin.petersen, linux-kernel
In-Reply-To: <94321ded-7970-258c-cee9-222f7b2b511f@linux.vnet.ibm.com>

On 2/16/21 6:58 AM, Brian King wrote:
> On 2/11/21 12:57 PM, Tyrel Datwyler wrote:
>> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
>> index ba6fcf9cbc57..23b803ac4a13 100644
>> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
>> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
>> @@ -5670,7 +5670,7 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost,
>>  
>>  irq_failed:
>>  	do {
>> -		plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
>> +		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
>>  	} while (rc == H_BUSY || H_IS_LONG_BUSY(rc));
> 
> Other places in the driver where we get a busy return code back we have an msleep(100).
> Should we be doing that here as well?

Indeed, and actually even better would be to use rtas_busy_delay() which will
perform the sleep with the correct ms delay, and marks itself with the
might_sleep() macro.

-Tyrel

> 
> Thanks,
> 
> Brian
> 


^ permalink raw reply

* Re: [PATCH 1/4] add generic builtin command line
From: Daniel Walker @ 2021-02-16 19:02 UTC (permalink / raw)
  To: Daniel Gimpelevich
  Cc: Christophe Leroy, Maksym Kokhan, linux-kernel, Rob Herring,
	Paul Mackerras, xe-linux-external, Andrew Morton, linuxppc-dev,
	Daniel Walker
In-Reply-To: <1613417521.3853.5.camel@chimera>

On Mon, Feb 15, 2021 at 11:32:01AM -0800, Daniel Gimpelevich wrote:
> On Thu, 2019-03-21 at 15:15 -0700, Andrew Morton wrote:
> > On Thu, 21 Mar 2019 08:13:08 -0700 Daniel Walker <danielwa@cisco.com> wrote:
> > > On Wed, Mar 20, 2019 at 08:14:33PM -0700, Andrew Morton wrote:
> > > > The patches (or some version of them) are already in linux-next,
> > > > which messes me up.  I'll disable them for now.
> > >  
> > > Those are from my tree, but I remove them when you picked up the series. The
> > > next linux-next should not have them.
> > 
> > Yup, thanks, all looks good now.
> 
> This patchset is currently neither in mainline nor in -next. May I ask
> what happened to it? Thanks.
> 

It was dropped silently by Andrew at some point. I wasn't watching -next closely
to know when. I have no idea why he dropped it.

We still use this series extensively in Cisco, and have extended it beyond this
current series.

We can re-submit.

Daniel

^ permalink raw reply

* Re: [PATCH 1/4] add generic builtin command line
From: Christophe Leroy @ 2021-02-16 17:42 UTC (permalink / raw)
  To: Daniel Gimpelevich
  Cc: Christophe Leroy, xe-linux-external, linux-kernel, Rob Herring,
	Paul Mackerras, Maksym Kokhan, Daniel Walker, Andrew Morton,
	linuxppc-dev, Daniel Walker
In-Reply-To: <1613417521.3853.5.camel@chimera>

Daniel Gimpelevich <daniel@gimpelevich.san-francisco.ca.us> a écrit :

> On Thu, 2019-03-21 at 15:15 -0700, Andrew Morton wrote:
>> On Thu, 21 Mar 2019 08:13:08 -0700 Daniel Walker <danielwa@cisco.com> wrote:
>> > On Wed, Mar 20, 2019 at 08:14:33PM -0700, Andrew Morton wrote:
>> > > The patches (or some version of them) are already in linux-next,
>> > > which messes me up.  I'll disable them for now.
>> >
>> > Those are from my tree, but I remove them when you picked up the  
>> series. The
>> > next linux-next should not have them.
>>
>> Yup, thanks, all looks good now.
>
> This patchset is currently neither in mainline nor in -next. May I ask
> what happened to it? Thanks.

As far as I remember, there has been a lot of discussion around this series.

As of today, it doesn't apply cleanly anymore and would require rebasing.

I'd suggest also to find the good arguments to convince us that this  
series has a real added value, not just "cisco use it in its kernels  
so it is good".

I proposed an alternative at  
https://patchwork.ozlabs.org/project/linuxppc-dev/cover/cover.1554195798.git.christophe.leroy@c-s.fr/ but never got any feedback so I gave  
up.

If you submit a new series, don't forget to copy ppclinux-dev and  
linux-arch lists.

Christophe

^ permalink raw reply

* Re: [PATCH 4/4] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
From: Brian King @ 2021-02-16 14:58 UTC (permalink / raw)
  To: Tyrel Datwyler, james.bottomley
  Cc: brking, linuxppc-dev, linux-scsi, martin.petersen, linux-kernel
In-Reply-To: <20210211185742.50143-5-tyreld@linux.ibm.com>

On 2/11/21 12:57 PM, Tyrel Datwyler wrote:
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index ba6fcf9cbc57..23b803ac4a13 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -5670,7 +5670,7 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost,
>  
>  irq_failed:
>  	do {
> -		plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
> +		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
>  	} while (rc == H_BUSY || H_IS_LONG_BUSY(rc));

Other places in the driver where we get a busy return code back we have an msleep(100).
Should we be doing that here as well?

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


^ permalink raw reply

* Re: [PATCH 3/4] ibmvfc: treat H_CLOSED as success during sub-CRQ registration
From: Brian King @ 2021-02-16 14:55 UTC (permalink / raw)
  To: Tyrel Datwyler, james.bottomley
  Cc: brking, linuxppc-dev, linux-scsi, martin.petersen, linux-kernel
In-Reply-To: <20210211185742.50143-4-tyreld@linux.ibm.com>

Reviewed-by: Brian King <brking@linux.vnet.ibm.com>


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


^ permalink raw reply

* Re: [PATCH 2/4] ibmvfc: fix invalid sub-CRQ handles after hard reset
From: Brian King @ 2021-02-16 14:39 UTC (permalink / raw)
  To: Tyrel Datwyler, james.bottomley
  Cc: brking, linuxppc-dev, linux-scsi, martin.petersen, linux-kernel
In-Reply-To: <20210211185742.50143-3-tyreld@linux.ibm.com>

Reviewed-by: Brian King <brking@linux.vnet.ibm.com>


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center


^ permalink raw reply

* Re: [PATCH 3/6] powerpc/64s: Use htab_convert_pte_flags() in hash__mark_rodata_ro()
From: Daniel Axtens @ 2021-02-16  5:50 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar
In-Reply-To: <20210211135130.3474832-3-mpe@ellerman.id.au>

Hi Michael,

> In hash__mark_rodata_ro() we pass the raw PP_RXXX value to
> hash__change_memory_range(). That has the effect of setting the key to
> zero, because PP_RXXX contains no key value.
>
> Fix it by using htab_convert_pte_flags(), which knows how to convert a
> pgprot into a pp value, including the key.

So far as I can tell by chasing the definitions around, this appears
to do what it claims to do.

So, for what it's worth:
Reviewed-by: Daniel Axtens <dja@axtens.net>

Kind regards,
Daniel

>
> Fixes: d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation")
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/mm/book3s64/hash_pgtable.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c
> index 567e0c6b3978..03819c259f0a 100644
> --- a/arch/powerpc/mm/book3s64/hash_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
> @@ -428,12 +428,14 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end,
>  
>  void hash__mark_rodata_ro(void)
>  {
> -	unsigned long start, end;
> +	unsigned long start, end, pp;
>  
>  	start = (unsigned long)_stext;
>  	end = (unsigned long)__init_begin;
>  
> -	WARN_ON(!hash__change_memory_range(start, end, PP_RXXX));
> +	pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY);
> +
> +	WARN_ON(!hash__change_memory_range(start, end, pp));
>  }
>  
>  void hash__mark_initmem_nx(void)
> -- 
> 2.25.1

^ permalink raw reply

* Re: [PATCH 2/6] powerpc/pseries: Add key to flags in pSeries_lpar_hpte_updateboltedpp()
From: Daniel Axtens @ 2021-02-16  5:39 UTC (permalink / raw)
  To: Michael Ellerman, linuxppc-dev; +Cc: aneesh.kumar
In-Reply-To: <20210211135130.3474832-2-mpe@ellerman.id.au>

Michael Ellerman <mpe@ellerman.id.au> writes:

> The flags argument to plpar_pte_protect() (aka. H_PROTECT), includes
> the key in bits 9-13, but currently we always set those bits to zero.
>
> In the past that hasn't been a problem because we always used key 0
> for the kernel, and updateboltedpp() is only used for kernel mappings.
>
> However since commit d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3
> for kernel mapping with hash translation") we are now inadvertently
> changing the key (to zero) when we call plpar_pte_protect().
>
> That hasn't broken anything because updateboltedpp() is only used for
> STRICT_KERNEL_RWX, which is currently disabled on 64s due to other
> bugs.
>
> But we want to fix that, so first we need to pass the key correctly to
> plpar_pte_protect(). In the `newpp` value the low 3 bits of the key
> are already in the correct spot, but the high 2 bits of the key need
> to be shifted down.
>
> Fixes: d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation")
> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
> ---
>  arch/powerpc/platforms/pseries/lpar.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
> index 764170fdb0f7..8bbbddff7226 100644
> --- a/arch/powerpc/platforms/pseries/lpar.c
> +++ b/arch/powerpc/platforms/pseries/lpar.c
> @@ -976,11 +976,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
>  	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
>  	BUG_ON(slot == -1);
>  
> -	flags = newpp & 7;
> +	flags = newpp & (HPTE_R_PP | HPTE_R_N);
>  	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
>  		/* Move pp0 into bit 8 (IBM 55) */
>  		flags |= (newpp & HPTE_R_PP0) >> 55;
>  
> +	flags |= ((newpp & HPTE_R_KEY_HI) >> 48) | (newpp & HPTE_R_KEY_LO);
> +

I'm really confused about how these bits are getting packed into the
flags parameter. It seems to match how they are unpacked in
kvmppc_h_pr_protect, but I cannot figure out why they are packed in that
order, and the LoPAR doesn't seem especially illuminating on this topic
- although I may have missed the relevant section.

Kind regards,
Daniel

>  	lpar_rc = plpar_pte_protect(flags, slot, 0);
>  
>  	BUG_ON(lpar_rc != H_SUCCESS);
> -- 
> 2.25.1

^ permalink raw reply

* [PATCH kernel 2/2] powerpc/iommu: Do not immediately panic when failed IOMMU table allocation
From: Alexey Kardashevskiy @ 2021-02-16  3:33 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, kvm-ppc, David Gibson
In-Reply-To: <20210216033307.69863-1-aik@ozlabs.ru>

Most platforms allocate IOMMU table structures (specifically it_map)
at the boot time and when this fails - it is a valid reason for panic().

However the powernv platform allocates it_map after a device is returned
to the host OS after being passed through and this happens long after
the host OS booted. It is quite possible to trigger the it_map allocation
panic() and kill the host even though it is not necessary - the host OS
can still use the DMA bypass mode (requires a tiny fraction of it_map's
memory) and even if that fails, the host OS is runnnable as it was without
the device for which allocating it_map causes the panic.

Instead of immediately crashing in a powernv/ioda2 system, this prints
an error and continues. All other platforms still call panic().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/iommu.c               |  6 ++++--
 arch/powerpc/platforms/cell/iommu.c       |  3 ++-
 arch/powerpc/platforms/pasemi/iommu.c     |  4 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 15 ++++++++-------
 arch/powerpc/platforms/pseries/iommu.c    | 10 +++++++---
 arch/powerpc/sysdev/dart_iommu.c          |  3 ++-
 6 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 8eb6eb0afa97..c1a5c366a664 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -728,8 +728,10 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
 	tbl->it_map = vzalloc_node(sz, nid);
-	if (!tbl->it_map)
-		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
+	if (!tbl->it_map) {
+		pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
+		return NULL;
+	}
 
 	iommu_table_reserve_pages(tbl, res_start, res_end);
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 2124831cf57c..fa08699aedeb 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -486,7 +486,8 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_size = size >> window->table.it_page_shift;
 	window->table.it_ops = &cell_iommu_ops;
 
-	iommu_init_table(&window->table, iommu->nid, 0, 0);
+	if (!iommu_init_table(&window->table, iommu->nid, 0, 0))
+		panic("Failed to initialize iommu table");
 
 	pr_debug("\tioid      %d\n", window->ioid);
 	pr_debug("\tblocksize %ld\n", window->table.it_blocksize);
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index b500a6e47e6b..5be7242fbd86 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -146,7 +146,9 @@ static void iommu_table_iobmap_setup(void)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
 	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
-	iommu_init_table(&iommu_table_iobmap, 0, 0, 0);
+	if (!iommu_init_table(&iommu_table_iobmap, 0, 0, 0))
+		panic("Failed to initialize iommu table");
+
 	pr_debug(" <- %s\n", __func__);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f0f901683a2f..66c3c3337334 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1762,7 +1762,8 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
 	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
-	iommu_init_table(tbl, phb->hose->node, 0, 0);
+	if (!iommu_init_table(tbl, phb->hose->node, 0, 0))
+		panic("Failed to initialize iommu table");
 
 	pe->dma_setup_done = true;
 	return;
@@ -1930,16 +1931,16 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 		res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
 		res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
 	}
-	iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
 
-	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
+		rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	else
+		rc = -ENOMEM;
 	if (rc) {
-		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
-				rc);
+		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
 		iommu_tce_table_put(tbl);
-		return rc;
+		tbl = NULL; /* This clears iommu_table_base below */
 	}
-
 	if (!pnv_iommu_bypass_disabled)
 		pnv_pci_ioda2_set_bypass(pe, true);
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..4d9ac1f181c2 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -638,7 +638,8 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
-	iommu_init_table(tbl, pci->phb->node, 0, 0);
+	if (!iommu_init_table(tbl, pci->phb->node, 0, 0))
+		panic("Failed to initialize iommu table");
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -720,7 +721,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
 				ppci->table_group, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		iommu_init_table(tbl, ppci->phb->node, 0, 0);
+		if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
+			panic("Failed to initialize iommu table");
 		iommu_register_group(ppci->table_group,
 				pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->table_group);
@@ -749,7 +751,9 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		iommu_init_table(tbl, phb->node, 0, 0);
+		if (!iommu_init_table(tbl, phb->node, 0, 0))
+			panic("Failed to initialize iommu table");
+
 		set_iommu_table_base(&dev->dev, tbl);
 		return;
 	}
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 6b4a34b36d98..1d33b7a5ea83 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -344,7 +344,8 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
 	iommu_table_dart.it_ops = &iommu_dart_ops;
-	iommu_init_table(&iommu_table_dart, -1, 0, 0);
+	if (!iommu_init_table(&iommu_table_dart, -1, 0, 0))
+		panic("Failed to initialize iommu table");
 
 	/* Reserve the last page of the DART to avoid possible prefetch
 	 * past the DART mapped area
-- 
2.17.1


^ permalink raw reply related

* [PATCH kernel 0/2] powerpc/iommu: Stop crashing the host when VM is terminated
From: Alexey Kardashevskiy @ 2021-02-16  3:33 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, kvm-ppc, David Gibson

Killing a VM on a host under memory pressure kills a host which is
annoying. 1/2 reduces the chances, 2/2 eliminates panic() on
ioda2.


This is based on sha1
f40ddce88593 Linus Torvalds "Linux 5.11".

Please comment. Thanks.



Alexey Kardashevskiy (2):
  powerpc/iommu: Allocate it_map by vmalloc
  powerpc/iommu: Do not immediately panic when failed IOMMU table
    allocation

 arch/powerpc/kernel/iommu.c               | 19 ++++++-------------
 arch/powerpc/platforms/cell/iommu.c       |  3 ++-
 arch/powerpc/platforms/pasemi/iommu.c     |  4 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 15 ++++++++-------
 arch/powerpc/platforms/pseries/iommu.c    | 10 +++++++---
 arch/powerpc/sysdev/dart_iommu.c          |  3 ++-
 6 files changed, 28 insertions(+), 26 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH kernel 1/2] powerpc/iommu: Allocate it_map by vmalloc
From: Alexey Kardashevskiy @ 2021-02-16  3:33 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, kvm-ppc, David Gibson
In-Reply-To: <20210216033307.69863-1-aik@ozlabs.ru>

The IOMMU table uses the it_map bitmap to keep track of allocated DMA
pages. This has always been a contiguous array allocated at either
the boot time or when a passed through device is returned to the host OS.
The it_map memory is allocated by alloc_pages() which allocates
contiguous physical memory.

Such allocation method occasionally creates a problem when there is
no big chunk of memory available (no free memory or too fragmented).
On powernv/ioda2 the default DMA window requires 16MB for it_map.

This replaces alloc_pages_node() with vzalloc_node() which allocates
contiguous block but in virtual memory. This should reduce changes of
failure but should not cause other behavioral changes as it_map is only
used by the kernel's DMA hooks/api when MMU is on.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/iommu.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c00214a4355c..8eb6eb0afa97 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -719,7 +719,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 {
 	unsigned long sz;
 	static int welcomed = 0;
-	struct page *page;
 	unsigned int i;
 	struct iommu_pool *p;
 
@@ -728,11 +727,9 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 	/* number of bytes needed for the bitmap */
 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
-	page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz));
-	if (!page)
+	tbl->it_map = vzalloc_node(sz, nid);
+	if (!tbl->it_map)
 		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
-	tbl->it_map = page_address(page);
-	memset(tbl->it_map, 0, sz);
 
 	iommu_table_reserve_pages(tbl, res_start, res_end);
 
@@ -774,8 +771,6 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
 
 static void iommu_table_free(struct kref *kref)
 {
-	unsigned long bitmap_sz;
-	unsigned int order;
 	struct iommu_table *tbl;
 
 	tbl = container_of(kref, struct iommu_table, it_kref);
@@ -796,12 +791,8 @@ static void iommu_table_free(struct kref *kref)
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs\n", __func__);
 
-	/* calculate bitmap size in bytes */
-	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
-
 	/* free bitmap */
-	order = get_order(bitmap_sz);
-	free_pages((unsigned long) tbl->it_map, order);
+	vfree(tbl->it_map);
 
 	/* free table */
 	kfree(tbl);
-- 
2.17.1


^ permalink raw reply related

* [PATCH kernel] powerpc/iommu: Annotate nested lock for lockdep
From: Alexey Kardashevskiy @ 2021-02-16  3:20 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Alexey Kardashevskiy, kvm-ppc

The IOMMU table is divided into pools for concurrent mappings and each
pool has a separate spinlock. When taking the ownership of an IOMMU group
to pass through a device to a VM, we lock these spinlocks which triggers
a false negative warning in lockdep (below).

This fixes it by annotating the large pool's spinlock as a nest lock.

===
WARNING: possible recursive locking detected
5.11.0-le_syzkaller_a+fstn1 #100 Not tainted
--------------------------------------------
qemu-system-ppc/4129 is trying to acquire lock:
c0000000119bddb0 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0

but task is already holding lock:
c0000000119bdd30 (&(p->lock)/1){....}-{2:2}, at: iommu_take_ownership+0xac/0x1e0

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&(p->lock)/1);
  lock(&(p->lock)/1);
===

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/kernel/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 557a09dd5b2f..2ee642a6731a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1089,7 +1089,7 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
-		spin_lock(&tbl->pools[i].lock);
+		spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
 
 	iommu_table_release_pages(tbl);
 
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH kernel] powerpc/perf: Stop crashing with generic_compat_pmu
From: Alexey Kardashevskiy @ 2021-02-16  1:06 UTC (permalink / raw)
  To: Madhavan Srinivasan, linuxppc-dev; +Cc: Madhavan Srinivasan
In-Reply-To: <b7b928ed-f338-cc5d-9045-d783bc85cfec@linux.ibm.com>



On 03/12/2020 16:27, Madhavan Srinivasan wrote:
> 
> On 12/2/20 8:31 AM, Alexey Kardashevskiy wrote:
>> Hi Maddy,
>>
>> I just noticed that I still have "powerpc/perf: Add checks for 
>> reserved values" in my pile (pushed here 
>> https://github.com/aik/linux/commit/61e1bc3f2e19d450e2e2d39174d422160b21957b 
>> ), do we still need it? The lockups I saw were fixed by 
>> https://github.com/aik/linux/commit/17899eaf88d689 but it is hardly a 
>> replacement. Thanks,
> 
> sorry missed this. Will look at this again. Since we will need 
> generation specific checks for the reserve field.


So any luck with this? Cheers,




> 
> Maddy
> 
>>
>>
>> On 04/06/2020 02:34, Madhavan Srinivasan wrote:
>>>
>>>
>>> On 6/2/20 8:26 AM, Alexey Kardashevskiy wrote:
>>>> The bhrb_filter_map ("The  Branch History  Rolling  Buffer") 
>>>> callback is
>>>> only defined in raw CPUs' power_pmu structs. The "architected" CPUs use
>>>> generic_compat_pmu which does not have this callback and crashed occur.
>>>>
>>>> This add a NULL pointer check for bhrb_filter_map() which behaves as if
>>>> the callback returned an error.
>>>>
>>>> This does not add the same check for config_bhrb() as the only caller
>>>> checks for cpuhw->bhrb_users which remains zero if bhrb_filter_map==0.
>>>
>>> Changes looks fine.
>>> Reviewed-by: Madhavan Srinivasan <maddy@linux.ibm.com>
>>>
>>> The commit be80e758d0c2e ('powerpc/perf: Add generic compat mode pmu 
>>> driver')
>>> which introduced generic_compat_pmu was merged in v5.2.  So we need to
>>> CC stable starting from 5.2 :( .  My bad,  sorry.
>>>
>>> Maddy
>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>   arch/powerpc/perf/core-book3s.c | 19 ++++++++++++++-----
>>>>   1 file changed, 14 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/arch/powerpc/perf/core-book3s.c 
>>>> b/arch/powerpc/perf/core-book3s.c
>>>> index 3dcfecf858f3..36870569bf9c 100644
>>>> --- a/arch/powerpc/perf/core-book3s.c
>>>> +++ b/arch/powerpc/perf/core-book3s.c
>>>> @@ -1515,9 +1515,16 @@ static int power_pmu_add(struct perf_event 
>>>> *event, int ef_flags)
>>>>       ret = 0;
>>>>    out:
>>>>       if (has_branch_stack(event)) {
>>>> -        power_pmu_bhrb_enable(event);
>>>> -        cpuhw->bhrb_filter = ppmu->bhrb_filter_map(
>>>> -                    event->attr.branch_sample_type);
>>>> +        u64 bhrb_filter = -1;
>>>> +
>>>> +        if (ppmu->bhrb_filter_map)
>>>> +            bhrb_filter = ppmu->bhrb_filter_map(
>>>> +                event->attr.branch_sample_type);
>>>> +
>>>> +        if (bhrb_filter != -1) {
>>>> +            cpuhw->bhrb_filter = bhrb_filter;
>>>> +            power_pmu_bhrb_enable(event); /* Does bhrb_users++ */
>>>> +        }
>>>>       }
>>>>
>>>>       perf_pmu_enable(event->pmu);
>>>> @@ -1839,7 +1846,6 @@ static int power_pmu_event_init(struct 
>>>> perf_event *event)
>>>>       int n;
>>>>       int err;
>>>>       struct cpu_hw_events *cpuhw;
>>>> -    u64 bhrb_filter;
>>>>
>>>>       if (!ppmu)
>>>>           return -ENOENT;
>>>> @@ -1945,7 +1951,10 @@ static int power_pmu_event_init(struct 
>>>> perf_event *event)
>>>>       err = power_check_constraints(cpuhw, events, cflags, n + 1);
>>>>
>>>>       if (has_branch_stack(event)) {
>>>> -        bhrb_filter = ppmu->bhrb_filter_map(
>>>> +        u64 bhrb_filter = -1;
>>>> +
>>>> +        if (ppmu->bhrb_filter_map)
>>>> +            bhrb_filter = ppmu->bhrb_filter_map(
>>>>                       event->attr.branch_sample_type);
>>>>
>>>>           if (bhrb_filter == -1) {
>>>
>>

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v18 03/11] of: Add a common kexec FDT setup function
From: Thiago Jung Bauermann @ 2021-02-15 21:41 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: mark.rutland, tao.li, zohar, paulus, vincenzo.frascino,
	frowand.list, sashal, robh, masahiroy, jmorris, takahiro.akashi,
	linux-arm-kernel, catalin.marinas, serge, devicetree,
	pasha.tatashin, will, prsriva, hsinyi, allison, christophe.leroy,
	mbrugger, balajib, dmitry.kasatkin, linux-kernel, james.morse,
	gregkh, joe, linux-integrity, linuxppc-dev
In-Reply-To: <20210213161049.6190-4-nramas@linux.microsoft.com>


Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:

> From: Rob Herring <robh@kernel.org>
>
> Both arm64 and powerpc do essentially the same FDT /chosen setup for
> kexec.  The differences are either omissions that arm64 should have
> or additional properties that will be ignored.  The setup code can be
> combined and shared by both powerpc and arm64.
>
> The differences relative to the arm64 version:
>  - If /chosen doesn't exist, it will be created (should never happen).
>  - Any old dtb and initrd reserved memory will be released.
>  - The new initrd and elfcorehdr are marked reserved.
>  - "linux,booted-from-kexec" is set.
>
> The differences relative to the powerpc version:
>  - "kaslr-seed" and "rng-seed" may be set.
>  - "linux,elfcorehdr" is set.
>  - Any existing "linux,usable-memory-range" is removed.
>
> Combine the code for setting up the /chosen node in the FDT and updating
> the memory reservation for kexec, for powerpc and arm64, in
> of_kexec_alloc_and_setup_fdt() and move it to "drivers/of/kexec.c".
>
> Signed-off-by: Rob Herring <robh@kernel.org>
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> ---
>  drivers/of/Makefile |   6 +
>  drivers/of/kexec.c  | 265 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/of.h  |   5 +
>  3 files changed, 276 insertions(+)
>  create mode 100644 drivers/of/kexec.c

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v18 06/11] powerpc: Move ima buffer fields to struct kimage
From: Thiago Jung Bauermann @ 2021-02-15 21:58 UTC (permalink / raw)
  To: Lakshmi Ramasubramanian
  Cc: mark.rutland, tao.li, zohar, paulus, vincenzo.frascino,
	frowand.list, sashal, robh, masahiroy, jmorris, takahiro.akashi,
	linux-arm-kernel, catalin.marinas, serge, devicetree,
	pasha.tatashin, will, prsriva, hsinyi, allison, christophe.leroy,
	mbrugger, balajib, dmitry.kasatkin, linux-kernel, james.morse,
	gregkh, joe, linux-integrity, linuxppc-dev
In-Reply-To: <20210213161049.6190-7-nramas@linux.microsoft.com>


Lakshmi Ramasubramanian <nramas@linux.microsoft.com> writes:

> The fields ima_buffer_addr and ima_buffer_size in "struct kimage_arch"
> for powerpc are used to carry forward the IMA measurement list across
> kexec system call.  These fields are not architecture specific, but are
> currently limited to powerpc.
>
> arch_ima_add_kexec_buffer() defined in "arch/powerpc/kexec/ima.c"
> sets ima_buffer_addr and ima_buffer_size for the kexec system call.
> This function does not have architecture specific code, but is
> currently limited to powerpc.
>
> Move ima_buffer_addr and ima_buffer_size to "struct kimage".
> Set ima_buffer_addr and ima_buffer_size in ima_add_kexec_buffer()
> in security/integrity/ima/ima_kexec.c.
>
> Co-developed-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
> Signed-off-by: Prakhar Srivastava <prsriva@linux.microsoft.com>
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> Suggested-by: Will Deacon <will@kernel.org>
> ---
>  arch/powerpc/include/asm/ima.h     |  3 ---
>  arch/powerpc/include/asm/kexec.h   |  5 -----
>  arch/powerpc/kexec/ima.c           | 29 ++++++-----------------------
>  include/linux/kexec.h              |  3 +++
>  security/integrity/ima/ima_kexec.c |  8 ++------
>  5 files changed, 11 insertions(+), 37 deletions(-)

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox