LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH v2] powerpc 32: Provides VIRT_CPU_ACCOUNTING
From: Benjamin Herrenschmidt @ 2014-04-30  4:57 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: scottwood, linuxppc-dev, Paul Mackerras, linux-kernel, alistair
In-Reply-To: <20140407073147.6F87E1A4BDE@localhost.localdomain>

On Mon, 2014-04-07 at 09:31 +0200, Christophe Leroy wrote:
> This patch provides VIRT_CPU_ACCOUTING to PPC32 architecture.
> Unlike PPC64, PPC32 doesn't use the PACA convention. Therefore the
> implementation is taken from the IA64 architecture.
> It is based on additional information added to the Task Info structure.
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

Scott, Can you review/ack this (or get somebody to) ?

It looks like a great idea but I really don't have the bandwidth to
review in detail and test right now.

(Adding Alister as well who maintains our 4xx 32-bit stuff nowadays).

Cheers,
Ben.

> Index: b/arch/powerpc/Kconfig
> ===================================================================
> --- a/arch/powerpc/Kconfig	(revision 5607)
> +++ b/arch/powerpc/Kconfig	(revision 5611)
> @@ -138,6 +138,7 @@
>  	select OLD_SIGSUSPEND
>  	select OLD_SIGACTION if PPC32
>  	select HAVE_DEBUG_STACKOVERFLOW
> +	select HAVE_VIRT_CPU_ACCOUNTING
>  
>  config EARLY_PRINTK
>  	bool
> Index: a/arch/powerpc/kernel/time.c
> ===================================================================
> --- a/arch/powerpc/kernel/time.c	(revision 5607)
> +++ b/arch/powerpc/kernel/time.c	(revision 5611)
> @@ -162,7 +162,9 @@
>  
>  cputime_t cputime_one_jiffy;
>  
> +#ifdef CONFIG_PPC_SPLPAR
>  void (*dtl_consumer)(struct dtl_entry *, u64);
> +#endif
>  
>  static void calc_cputime_factors(void)
>  {
> @@ -178,6 +180,7 @@
>  	__cputime_clockt_factor = res.result_low;
>  }
>  
> +#ifdef CONFIG_PPC64
>  /*
>   * Read the SPURR on systems that have it, otherwise the PURR,
>   * or if that doesn't exist return the timebase value passed in.
> @@ -190,6 +193,7 @@
>  		return mfspr(SPRN_PURR);
>  	return tb;
>  }
> +#endif
>  
>  #ifdef CONFIG_PPC_SPLPAR
>  
> @@ -291,6 +295,7 @@
>   * Account time for a transition between system, hard irq
>   * or soft irq state.
>   */
> +#ifdef CONFIG_PPC64
>  static u64 vtime_delta(struct task_struct *tsk,
>  			u64 *sys_scaled, u64 *stolen)
>  {
> @@ -377,7 +382,70 @@
>  	get_paca()->utime_sspurr = 0;
>  	account_user_time(tsk, utime, utimescaled);
>  }
> +#else
>  
> +void vtime_account_user(struct task_struct *tsk)
> +{
> +	cputime_t delta_utime;
> +	struct thread_info *ti = task_thread_info(tsk);
> +
> +	if (ti->ac_utime) {
> +		delta_utime = ti->ac_utime;
> +		account_user_time(tsk, delta_utime, delta_utime);
> +		ti->ac_utime = 0;
> +	}
> +}
> +
> +/*
> + * Called from the context switch with interrupts disabled, to charge all
> + * accumulated times to the current process, and to prepare accounting on
> + * the next process.
> + */
> +void arch_vtime_task_switch(struct task_struct *prev)
> +{
> +	struct thread_info *pi = task_thread_info(prev);
> +	struct thread_info *ni = task_thread_info(current);
> +
> +	ni->ac_stamp = pi->ac_stamp;
> +	ni->ac_stime = ni->ac_utime = 0;
> +}
> +
> +/*
> + * Account time for a transition between system, hard irq or soft irq state.
> + * Note that this function is called with interrupts enabled.
> + */
> +static cputime_t vtime_delta(struct task_struct *tsk)
> +{
> +	struct thread_info *ti = task_thread_info(tsk);
> +	__u32 delta_stime;
> +	__u32 now;
> +
> +	WARN_ON_ONCE(!irqs_disabled());
> +
> +	now = mftbl();
> +
> +	delta_stime = ti->ac_stime + (now - ti->ac_stamp);
> +	ti->ac_stime = 0;
> +	ti->ac_stamp = now;
> +
> +	return (cputime_t)delta_stime;
> +}
> +
> +void vtime_account_system(struct task_struct *tsk)
> +{
> +	cputime_t delta = vtime_delta(tsk);
> +
> +	account_system_time(tsk, 0, delta, delta);
> +}
> +EXPORT_SYMBOL_GPL(vtime_account_system);
> +
> +void vtime_account_idle(struct task_struct *tsk)
> +{
> +	account_idle_time(vtime_delta(tsk));
> +}
> +
> +#endif
> +
>  #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
>  #define calc_cputime_factors()
>  #endif
> @@ -871,6 +939,8 @@
>  		       ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
>  	}
>  
> +	mttbl(0);
> +	mttbu(0);
>  	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
>  	tb_ticks_per_sec = ppc_tb_freq;
>  	tb_ticks_per_usec = ppc_tb_freq / 1000000;
> Index: b/arch/powerpc/kernel/entry_32.S
> ===================================================================
> --- a/arch/powerpc/kernel/entry_32.S	(revision 5607)
> +++ b/arch/powerpc/kernel/entry_32.S	(revision 5611)
> @@ -177,6 +177,12 @@
>  	addi	r12,r12,-1
>  	stw	r12,4(r11)
>  #endif
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +	CURRENT_THREAD_INFO(r9, r1)
> +	tophys(r9, r9)
> +	ACCOUNT_CPU_USER_ENTRY(r9, r11, r12)
> +#endif
> +
>  	b	3f
>  
>  2:	/* if from kernel, check interrupted DOZE/NAP mode and
> @@ -406,6 +412,13 @@
>  	lwarx	r7,0,r1
>  END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX)
>  	stwcx.	r0,0,r1			/* to clear the reservation */
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +	andi.	r4,r8,MSR_PR
> +	beq	3f
> +	CURRENT_THREAD_INFO(r4, r1)
> +	ACCOUNT_CPU_USER_EXIT(r4, r5, r7)
> +3:
> +#endif
>  	lwz	r4,_LINK(r1)
>  	lwz	r5,_CCR(r1)
>  	mtlr	r4
> @@ -841,6 +854,10 @@
>  	andis.	r10,r0,DBCR0_IDM@h
>  	bnel-	load_dbcr0
>  #endif
> +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +	CURRENT_THREAD_INFO(r9, r1)
> +	ACCOUNT_CPU_USER_EXIT(r9, r10, r11)
> +#endif
>  
>  	b	restore
>  
> Index: b/arch/powerpc/kernel/asm-offsets.c
> ===================================================================
> --- a/arch/powerpc/kernel/asm-offsets.c	(revision 5607)
> +++ b/arch/powerpc/kernel/asm-offsets.c	(revision 5611)
> @@ -167,6 +167,12 @@
>  	DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
>  	DEFINE(TI_TASK, offsetof(struct thread_info, task));
>  	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
> +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32)
> +	DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
> +	DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
> +	DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
> +	DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
> +#endif
>  
>  #ifdef CONFIG_PPC64
>  	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
> Index: b/arch/powerpc/include/asm/thread_info.h
> ===================================================================
> --- a/arch/powerpc/include/asm/thread_info.h	(revision 5607)
> +++ b/arch/powerpc/include/asm/thread_info.h	(revision 5611)
> @@ -43,6 +43,12 @@
>  	int		cpu;			/* cpu we're on */
>  	int		preempt_count;		/* 0 => preemptable,
>  						   <0 => BUG */
> +#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC32)
> +	u32 ac_stamp;
> +	u32 ac_leave;
> +	u32 ac_stime;
> +	u32 ac_utime;
> +#endif
>  	struct restart_block restart_block;
>  	unsigned long	local_flags;		/* private flags for thread */
>  
> Index: b/arch/powerpc/include/asm/cputime.h
> ===================================================================
> --- a/arch/powerpc/include/asm/cputime.h	(revision 5607)
> +++ b/arch/powerpc/include/asm/cputime.h	(revision 5611)
> @@ -228,7 +228,11 @@
>  
>  #define cputime64_to_clock_t(ct)	cputime_to_clock_t((cputime_t)(ct))
>  
> +#ifdef CONFIG_PPC64
>  static inline void arch_vtime_task_switch(struct task_struct *tsk) { }
> +#else
> +extern void arch_vtime_task_switch(struct task_struct *tsk);
> +#endif
>  
>  #endif /* __KERNEL__ */
>  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
> Index: b/arch/powerpc/include/asm/ppc_asm.h
> ===================================================================
> --- a/arch/powerpc/include/asm/ppc_asm.h	(revision 5607)
> +++ b/arch/powerpc/include/asm/ppc_asm.h	(revision 5611)
> @@ -25,10 +25,16 @@
>   */
>  
>  #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> +#ifdef CONFIG_PPC64
>  #define ACCOUNT_CPU_USER_ENTRY(ra, rb)
>  #define ACCOUNT_CPU_USER_EXIT(ra, rb)
> +#else /* CONFIG_PPC64 */
> +#define ACCOUNT_CPU_USER_ENTRY(ti, ra, rb)
> +#define ACCOUNT_CPU_USER_EXIT(ti, ra, rb)
> +#endif /* CONFIG_PPC64 */
>  #define ACCOUNT_STOLEN_TIME
> -#else
> +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
> +#ifdef CONFIG_PPC64
>  #define ACCOUNT_CPU_USER_ENTRY(ra, rb)					\
>  	MFTB(ra);			/* get timebase */		\
>  	ld	rb,PACA_STARTTIME_USER(r13);				\
> @@ -68,7 +74,27 @@
>  #define ACCOUNT_STOLEN_TIME
>  
>  #endif /* CONFIG_PPC_SPLPAR */
> +#else /* CONFIG_PPC64 */
> +#define ACCOUNT_CPU_USER_ENTRY(ti, ra, rb)				\
> +	MFTB(ra);							\
> +	lwz rb, TI_AC_LEAVE(ti);					\
> +	stw ra, TI_AC_STAMP(ti);	/* AC_STAMP = NOW */		\
> +	subf rb, rb, ra;		/* R = NOW - AC_LEAVE */	\
> +	lwz ra, TI_AC_UTIME(ti);					\
> +	add ra, rb, ra;			/* AC_UTIME += R */		\
> +	stw ra, TI_AC_UTIME(ti);					\
>  
> +#define ACCOUNT_CPU_USER_EXIT(ti, ra, rb)				\
> +	MFTB(ra);							\
> +	lwz rb, TI_AC_STAMP(ti);					\
> +	stw ra, TI_AC_LEAVE(ti);					\
> +	subf rb, rb, ra;		/* R = NOW - AC_STAMP */	\
> +	lwz ra, TI_AC_STIME(ti);					\
> +	add ra, rb, ra;			/* AC_STIME += R */		\
> +	stw ra, TI_AC_STIME(ti);					\
> +
> +#endif /* CONFIG_PPC64 */
> +
>  #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
>  
>  /*
> Index: b/arch/powerpc/platforms/Kconfig.cputype
> ===================================================================
> --- a/arch/powerpc/platforms/Kconfig.cputype	(revision 5607)
> +++ b/arch/powerpc/platforms/Kconfig.cputype	(revision 5611)
> @@ -1,7 +1,6 @@
>  config PPC64
>  	bool "64-bit kernel"
>  	default n
> -	select HAVE_VIRT_CPU_ACCOUNTING
>  	help
>  	  This option selects whether a 32-bit or a 64-bit kernel
>  	  will be built.

^ permalink raw reply

* Re: [PATCH 1/1] powerpc: crtsaveres.o needed only when -Os flag is enabled
From: Benjamin Herrenschmidt @ 2014-04-30  5:14 UTC (permalink / raw)
  To: Ram Pai; +Cc: linuxppc-dev, Brian W Hart
In-Reply-To: <20140429153806.GA25743@oc3347516403.ibm.com>

On Tue, 2014-04-29 at 10:38 -0500, Brian W Hart wrote:

> >  CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
> >  
> > +ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
> >  KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
> > +endif
> > +
> >  
> >  # No AltiVec or VSX instructions when building kernel
> >  KBUILD_CFLAGS += $(call cc-option,-mno-altivec)
> 
> I didn't try building a kernel or in-tree modules, but I confirmed
> that it allows building of out-of-tree modules when crtsavres.o is
> not present (e.g. as for a distro install where the kernel headers
> are provided by package, rather than being manually prepared from
> the sources).
> 
> Tested-by: Brian W Hart <hartb@linux.vnet.ibm.com>

I still don't like it. What guarantee do we have that gcc will never
call into this with other optimisation settings ? It might decide
one day that calling out for saving a large pile of registers
is still more efficient than unrolling the whole lot, including
for speed.

Besides that doesn't fix the root problem. We want to be able to
build the kernel with CONFIG_CC_OPTIMIZE_FOR_SIZE and still have
modules.

So a better solution needs to be found. I don't know what that
solution is (we might want to look at what other archs are doing
maybe ?), could be to include crtsaveres.S in the build of every
module (we really don't want to EXPORT_SYMBOL these guys), but
that would mean having it installed somewhere with the kernel
headers for out-of-tree modules...

If necessary, involve lkml, Rusty etc... but this patch is crap.

Ben.

^ permalink raw reply

* Re: [PATCH 0/6] Implement split core for POWER8
From: Michael Neuling @ 2014-04-30  5:09 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, kvm-ppc, Paul Mackerras, kvm
In-Reply-To: <1398303165-6576-1-git-send-email-mikey@neuling.org>

> This patch series implements split core mode on POWER8.  This enables up to 4
> subcores per core which can each independently run guests (per guest SPRs like
> SDR1, LPIDR etc are replicated per subcore).  Lots more documentation on this
> feature in the code and commit messages.
> 
> Most of this code is in the powernv platform but there's a couple of KVM
> specific patches too.
> 
> Alex: If you're happy with the KVM patches, please ACK them and benh can hold
> this series.

Alex,

Any chance we can get an ACK on these two KVM patches so benh can put
this series in his next branch?

Mikey

^ permalink raw reply

* Re: [PATCH] powerpc 32: Provides VIRT_CPU_ACCOUNTING
From: Benjamin Herrenschmidt @ 2014-04-30  4:56 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev, Paul Mackerras, linux-kernel
In-Reply-To: <1395266717.12479.293.camel@snotra.buserror.net>

On Wed, 2014-03-19 at 17:05 -0500, Scott Wood wrote:
> On Wed, 2014-03-19 at 22:52 +0100, Christophe Leroy wrote:
> > This patch provides VIRT_CPU_ACCOUTING to PPC32 architecture.
> > Unlike PPC64, PPC32 doesn't provide the PACA register. Therefore the
> > implementation is similar to the one done in the IA64 architecture.
> > It is based on additional information added to the Task Info structure.
> 
> PACA isn't a register -- just a convention for how Linux uses a GPR.
> Maybe it's time to use it on PPC32 as well?

PACA is actually a data structure and you really really don't want it
on ppc32 :-) Having a register point to current works, having a register
point to per-cpu data instead works too (ie, change what we do today),
but don't introduce a PACA *please* :-)

> 
> > Index: b/arch/powerpc/kernel/asm-offsets.c
> > ===================================================================
> > --- b/arch/powerpc/kernel/asm-offsets.c	(revision 5607)
> > +++ b/arch/powerpc/kernel/asm-offsets.c	(revision 5608)
> > @@ -167,6 +167,10 @@
> >  	DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count));
> >  	DEFINE(TI_TASK, offsetof(struct thread_info, task));
> >  	DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
> > +	DEFINE(TI_AC_STAMP, offsetof(struct thread_info, ac_stamp));
> > +	DEFINE(TI_AC_LEAVE, offsetof(struct thread_info, ac_leave));
> > +	DEFINE(TI_AC_STIME, offsetof(struct thread_info, ac_stime));
> > +	DEFINE(TI_AC_UTIME, offsetof(struct thread_info, ac_utime));
> 
> Doesn't this need to be protected by #ifdef
> CONFIG_VIRT_CPU_ACCOUNTING_NATIVE?
> 
> >  
> >  #ifdef CONFIG_PPC64
> >  	DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size));
> > Index: b/arch/powerpc/include/asm/thread_info.h
> > ===================================================================
> > --- b/arch/powerpc/include/asm/thread_info.h	(revision 5607)
> > +++ b/arch/powerpc/include/asm/thread_info.h	(revision 5608)
> > @@ -43,6 +43,12 @@
> >  	int		cpu;			/* cpu we're on */
> >  	int		preempt_count;		/* 0 => preemptable,
> >  						   <0 => BUG */
> > +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
> > +	__u32 ac_stamp;
> > +	__u32 ac_leave;
> > +	__u32 ac_stime;
> > +	__u32 ac_utime;
> > +#endif
> 
> This isn't uapi; why not use "u32"?
> 
> Plus, it should be made clear that this is only used on 32-bit.
> 
> >  	struct restart_block restart_block;
> >  	unsigned long	local_flags;		/* private flags for thread */
> >  
> > @@ -58,6 +64,8 @@
> >  	.task =		&tsk,			\
> >  	.exec_domain =	&default_exec_domain,	\
> >  	.cpu =		0,			\
> > +	.ac_stime =	0,			\
> > +	.ac_utime =	0,			\
> 
> Also needs to be ifdeffed -- which isn't going to work in a macro, so
> maybe remove the ifdef from the variable declarations, or just let the
> fields be initialized to zero by default.  Or add PACA to 32-bit. :-)
> 
> -Scott
> 

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/powernv: reduce multi-hit of iommu_add_device()
From: Wei Yang @ 2014-04-30  3:28 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev, Wei Yang, Alexey Kardashevskiy
In-Reply-To: <20140430002812.GA13183@shangw>

On Wed, Apr 30, 2014 at 10:28:12AM +1000, Gavin Shan wrote:
>
>It seems that we have 2 problems here:
>
>- For non-SRIOV case, pcibios_setup_device() is called for towice. That
>  seems incorrect. We could simply remove pcibios_setup_bus_devices()
>  from pcibios_fixup_bus().

I have thought about this solution before, but I guess this would have some
side effect on other platforms.

Well, just did a test by removing this line in pcibios_fixup_bus(), the result
is:
1. system up, thanks god.
2. no one invoke the pcibios_setup_device() at bootup time.

The reason for no one invoke the pcibios_setup_device() is: in
pcibios_add_device() it will check whether the bus is added before calling
pcibios_setup_device(). And at this time, the bus is not added.

Still wierd, why the system could be up. But one thing for sure is, no one
invoke the pcibios_setup_device() at bootup stage. So this solution may not
work.

>
>- It's too early to register IOMMU group/device in pnv_pci_ioda_dma_dev_setup()
>  because the sysfs entries of the PCI device aren't finalized yet. So we could
>  remove all logic we have in pnv_pci_ioda_dma_dev_setup() and just purely rely
>  on "tce_iommu_bus_notifier".

This would be another solution. 

One concern:

    If we want to do like this, we need to retrieve the pe information and
    get the tce32_table base in tce_iommu_bus_notifier. Hmm... looks a little
    not that nice.

>
>By the way, I never tried EEH on SRIOV PF/VFs. However, I never hit similar
>issue in non-SRIOV cases.

I have test this case on a PF with no VF enabled.

I just make the mlx4_pci_err_detected() return PCI_ERS_RESULT_NONE and do
nothing. So this will force the eeh to do a hotplug recovery.

You could have a try on your machine too.

>
>Thanks,
>Gavin

-- 
Richard Yang
Help you, Help me

^ permalink raw reply

* RE: [PATCH] powerpc/fsl-rio: Fix fsl_rio_setup error paths and use-after-unmap
From: Gang.Liu @ 2014-04-30  2:58 UTC (permalink / raw)
  To: Scott Wood; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <1398791552.24575.56.camel@snotra.buserror.net>

DQo+IC0tLS0tT3JpZ2luYWwgTWVzc2FnZS0tLS0tDQo+IEZyb206IFdvb2QgU2NvdHQtQjA3NDIx
DQo+IFNlbnQ6IFdlZG5lc2RheSwgQXByaWwgMzAsIDIwMTQgMToxMyBBTQ0KPiBUbzogTGl1IEdh
bmctQjM0MTgyDQo+IENjOiBsaW51eHBwYy1kZXZAbGlzdHMub3psYWJzLm9yZw0KPiBTdWJqZWN0
OiBSZTogW1BBVENIXSBwb3dlcnBjL2ZzbC1yaW86IEZpeCBmc2xfcmlvX3NldHVwIGVycm9yIHBh
dGhzIGFuZA0KPiB1c2UtYWZ0ZXItdW5tYXANCj4gDQo+IE9uIE1vbiwgMjAxNC0wNC0yOCBhdCAy
MzowNCAtMDUwMCwgTGl1IEdhbmctQjM0MTgyIHdyb3RlOg0KPiA+ID4gLS0tLS1PcmlnaW5hbCBN
ZXNzYWdlLS0tLS0NCj4gPiA+IEZyb206IFdvb2QgU2NvdHQtQjA3NDIxDQo+ID4gPiBTZW50OiBU
dWVzZGF5LCBBcHJpbCAyOSwgMjAxNCA5OjMyIEFNDQo+ID4gPiBUbzogbGludXhwcGMtZGV2QGxp
c3RzLm96bGFicy5vcmcNCj4gPiA+IENjOiBXb29kIFNjb3R0LUIwNzQyMTsgTGl1IEdhbmctQjM0
MTgyDQo+ID4gPiBTdWJqZWN0OiBbUEFUQ0hdIHBvd2VycGMvZnNsLXJpbzogRml4IGZzbF9yaW9f
c2V0dXAgZXJyb3IgcGF0aHMgYW5kDQo+ID4gPiB1c2UtIGFmdGVyLXVubWFwDQo+ID4gPg0KPiA+
ID4gU2V2ZXJhbCBvZiB0aGUgZXJyb3IgcGF0aHMgZnJvbSBmc2xfcmlvX3NldHVwIGFyZSBtaXNz
aW5nIGVycm9yDQo+IG1lc3NhZ2VzLg0KPiA+ID4NCj4gPiA+IFdvcnNlLCBmc2xfcmlvX3NldHVw
IGluaXRpYWxpemVzIHNldmVyYWwgZ2xvYmFsIHBvaW50ZXJzIGFuZCBkb2VzDQo+ID4gPiBub3Qg
TlVMTCB0aGVtIG91dCBhZnRlciBmcmVlaW5nL3VubWFwcGluZyBvbiBlcnJvci4gIFRoaXMgY2F1
c2VkDQo+ID4gPiBmc2xfcmlvX21jaGVja19leGNlcHRpb24oKSB0byBjcmFzaCB3aGVuIGFjY2Vz
c2luZyByaW9fcmVnc193aW4NCj4gPiA+IHdoaWNoIHdhcyBub24tTlVMTCBidXQgaGFkIGJlZW4g
dW5tYXBwZWQuDQo+ID4gPg0KPiA+ID4gU2lnbmVkLW9mZi1ieTogU2NvdHQgV29vZCA8c2NvdHR3
b29kQGZyZWVzY2FsZS5jb20+DQo+ID4gPiBDYzogTGl1IEdhbmcgPEdhbmcuTGl1QGZyZWVzY2Fs
ZS5jb20+DQo+ID4gPiAtLS0NCj4gPiA+IExpdSBHYW5nLCBhcmUgeW91IHN1cmUgYWxsIG9mIHRo
ZXNlIGVycm9yIGNvbmRpdGlvbnMgYXJlIGZhdGFsPyAgV2h5DQo+ID4gPiBkb2VzIHRoZSByaW8g
ZHJpdmVyIGZhaWwgaWYgcm11IGlzIG5vdCBwcmVzZW50IChlLmcuICBvbiB0NDI0MCk/DQo+ID4N
Cj4gPiBIaSBTY290dCwgSSB0aGluayB0aGUgZXJyb3JzIHlvdSBtb2RpZmllZCBpbiB0aGUgcGF0
Y2ggYXJlIHNlcmlvdXMgYW5kDQo+ID4gc2hvdWxkIGJlIGZpeGVkLiBUaGFua3MgdmVyeSBtdWNo
IQ0KPiA+IEFuZCBpbiBmYWN0LCB0aGUgcmlvIGRyaXZlciBjYW4gYmUgdXNlZCBqdXN0IGZvciB0
aGUgc3VibW9kdWxlIG9mIHRoZQ0KPiBTUklPOiBSTVUuDQo+ID4gSXQgc2hvdWxkIGJlIHVzZWQg
d2l0aCBhcmNoL3Bvd2VycGMvc3lzZGV2L2ZzbF9ybXUuYyBhbmQgdGhlcmUgc2hvdWxkDQo+ID4g
aGF2ZSB0aGUgUk1VIG1vZHVsZS4NCj4gPiBUaGUgZnNsX3Jpby5jIGltcGxlbWVudHMgc29tZSBi
YXNpYyBhbmQgbmVlZGVkIHdvcmtzIHRvIHN1cHBvcnQgdGhlIFJNVQ0KPiBydW5uaW5nIHdlbGwu
DQo+IA0KPiBJIGRvbid0IHF1aXRlIGZvbGxvdyAtLSBpcyBpdCBleHBlY3RlZCB0aGF0IHJpbyBj
YW4gd29yayB3aXRob3V0IHJtdSwgb3INCj4gbm90PyAgQXMgaXMsIGZzbF9yaW9fc2V0dXAoKSB3
aWxsIGVycm9yIG91dCBpZiBpdCBkb2Vzbid0IGZpbmQgYW4NCj4gZnNsLHNyaW8tcm11LWhhbmRs
ZSBwcm9wZXJ0eS4NCg0KZnNsX3Jpb19zZXR1cCgpIGRvZXNuJ3QgZXhwZWN0IHRoYXQgcmlvIGNh
biB3b3JrIHdpdGhvdXQgcm11LiBBbGwgdGhlIHJpbw0KZHJpdmVycyBqdXN0IGhhcyBvbmUgcHVy
cG9zZSwgaXQncyBybXUuIEJ1dCBybXUgaXMgYSBzdWJtb2R1bGUgb2YgdGhlIHJpbywNCnNvIHRo
ZSBkcml2ZXIgc2hvdWxkIHBhcnNlIHJpbyBpbiBkdGIgYW5kIGZpbmlzaCBzb21lIGluaXRpYWwg
d29ya3MgZmlyc3QsDQphbmQgdGhlbiB0byBzZXR1cCBybXUuDQpUaGF0J3Mgd2h5IHRoZSBkcml2
ZXJzIGNhbm5vdCBqdXN0IGhhdmUgYSBzdWNoIGFzIHJtdV9zZXR1cCgpIHRvIHBhcnNlDQpmc2ws
c3Jpby1ybXUtaGFuZGxlIHByb3BlcnR5Lg0KDQpUaGFua3MsDQpMaXUgR2FuZw0KIA0K

^ permalink raw reply

* Re: [PATCH v2 2/2] fsl/mpic_timer: make mpic_timer to support deep sleep feature
From: Scott Wood @ 2014-04-30  1:39 UTC (permalink / raw)
  To: Dongsheng Wang; +Cc: linuxppc-dev, chenhui.zhao, jason.jin
In-Reply-To: <1398319939-36348-1-git-send-email-dongsheng.wang@freescale.com>

On Thu, 2014-04-24 at 14:12 +0800, Dongsheng Wang wrote:
> From: Wang Dongsheng <dongsheng.wang@freescale.com>
> 
> At T104x platfrom the timer clock will be changed from platform_clock
> to sys_ref_clock when system going to deep sleep.
> 
> So before system going to deep sleep, we need to change time to adapt
> to the new frequency that is sys_ref_clock. And after resume from deep
> sleep, restore the time that based on platform_clock.
> 
> Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
> ---
> *v2*
> Remove some unnecessary warning message.
> Remove "switch_freq_flag", it's unnecessary.
> 
> Modify the description of the patch.

The patch needs better code comments throughout to explain the flow of
what's happening.  Note that doesn't mean "more comments", but clearer
and more useful comments.

> diff --git a/arch/powerpc/sysdev/mpic_timer.c b/arch/powerpc/sysdev/mpic_timer.c
> index 9d9b062..71ad368 100644
> --- a/arch/powerpc/sysdev/mpic_timer.c
> +++ b/arch/powerpc/sysdev/mpic_timer.c
> @@ -18,6 +18,7 @@
>  #include <linux/mm.h>
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
> +#include <linux/suspend.h>
>  #include <linux/of.h>
>  #include <linux/of_address.h>
>  #include <linux/of_device.h>
> @@ -26,7 +27,9 @@
>  #include <sysdev/fsl_soc.h>
>  #include <asm/io.h>
>  
> +#include <asm/mpc85xx.h>
>  #include <asm/mpic_timer.h>
> +#include <asm/pm.h>
>  
>  #d	efine FSL_GLOBAL_TIMER		0x1
>  
> @@ -72,6 +75,8 @@ struct timer_group_priv {
>  	struct mpic_timer		timer[TIMERS_PER_GROUP];
>  	struct list_head		node;
>  	unsigned int			timerfreq;
> +	unsigned int			suspended_timerfreq;
> +	unsigned int			resume_timerfreq;
>  	unsigned int			idle;
>  	unsigned int			flags;
>  	spinlock_t			lock;
> @@ -423,6 +428,33 @@ struct mpic_timer *mpic_request_timer(irq_handler_t fn, void *dev,
>  }
>  EXPORT_SYMBOL(mpic_request_timer);
>  
> +static void timer_group_get_suspended_freq(struct timer_group_priv *priv)
> +{
> +	struct device_node *np;
> +
> +	np = of_find_compatible_node(NULL, NULL, "fsl,qoriq-clockgen-2.0");
> +	if (!np)
> +		return;
> +
> +	of_property_read_u32(np, "clock-frequency", &priv->suspended_timerfreq);
> +	of_node_put(np);
> +
> +	if (!priv->suspended_timerfreq)
> +		pr_warn("Mpic timer will not be accurate during deep sleep.\n");
> +}
> +
> +static int need_to_switch_freq(void)

/*
 * Returns true if the timers operate on a special frequency
 * when the system is suspended.
 */
static bool has_suspend_freq(void)

> @@ -437,6 +469,13 @@ static int timer_group_get_freq(struct device_node *np,
>  					&priv->timerfreq);
>  			of_node_put(dn);
>  		}
> +
> +		/*
> +		 * For deep sleep, if system goes to deep sleep,
> +		 * timer freq will be changed.
> +		 */

"For deep sleep" is redundant here.

> +		if (need_to_switch_freq())
> +			timer_group_get_suspended_freq(priv);
>  	}
>  
>  	if (priv->timerfreq <= 0)
> @@ -445,6 +484,7 @@ static int timer_group_get_freq(struct device_node *np,
>  	if (priv->flags & FSL_GLOBAL_TIMER) {
>  		div = (1 << (MPIC_TIMER_TCR_CLKDIV >> 8)) * 8;
>  		priv->timerfreq /= div;
> +		priv->suspended_timerfreq /= div;
>  	}
>  
>  	return 0;
> @@ -564,14 +604,182 @@ out:
>  	kfree(priv);
>  }
>  
> +static void mpic_reset_time(struct mpic_timer *handle, struct timeval *bcr_time,
> +				struct timeval *ccr_time)
> +{
> +	struct timer_group_priv *priv = container_of(handle,
> +			struct timer_group_priv, timer[handle->num]);
> +
> +	u64 ccr_ticks = 0;
> +	u64 bcr_ticks = 0;
> +
> +	/* switch bcr time */
> +	convert_time_to_ticks(priv, bcr_time, &bcr_ticks);
> +
> +	/* switch ccr time */
> +	convert_time_to_ticks(priv, ccr_time, &ccr_ticks);

Redundant comments.

> +	if (handle->cascade_handle) {
> +		u32 tmp_ticks;
> +		u32 rem_ticks;
> +
> +		/* reset ccr ticks to bcr */
> +		tmp_ticks = div_u64_rem(ccr_ticks, MAX_TICKS_CASCADE,
> +					&rem_ticks)
> +		out_be32(&priv->regs[handle->num].gtbcr,
> +			tmp_ticks | TIMER_STOP);
> +		out_be32(&priv->regs[handle->num - 1].gtbcr, rem_ticks);

The comment should explain why you're doing this -- that the hardware
copies bcr to ccr whenever TIMER_STOP is cleared.

> +		/* start timer */
> +		clrbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
> +
> +		/* reset bcr */
> +		tmp_ticks = div_u64_rem(bcr_ticks, MAX_TICKS_CASCADE,
> +					&rem_ticks);
> +		out_be32(&priv->regs[handle->num].gtbcr,
> +			tmp_ticks & ~TIMER_STOP);
> +		out_be32(&priv->regs[handle->num - 1].gtbcr, rem_ticks);

Depending on the clock frequency it may be possible that bcr will be
copied again to ccr before the real bcr is written, if the old ccr was
very small.  This could result in two timer expirations where one was
expected.

It also looks like we start using the new timer counts before we
actually enter deep sleep, so the timeout could end up being shorter
than expected.

I'm not sure there's much we can do about either of these given the
hardware design, but it should at least be noted in a comment.

> +	} else {
> +		/* reset ccr ticks to bcr */
> +		out_be32(&priv->regs[handle->num].gtbcr,
> +			ccr_ticks | TIMER_STOP);
> +		/* start timer */
> +		clrbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);
> +		/* reset bcr */
> +		out_be32(&priv->regs[handle->num].gtbcr,
> +			bcr_ticks & ~TIMER_STOP);
> +	}
> +}
> +
> +static void do_switch_time(struct mpic_timer *handle, unsigned int new_freq)

timer_set_freq()

> +{
> +	struct timer_group_priv *priv = container_of(handle,
> +			struct timer_group_priv, timer[handle->num]);
> +	struct timeval ccr_time;
> +	struct timeval bcr_time;
> +	unsigned int timerfreq;
> +	u32 test_stop;
> +	u64 ticks;
> +
> +	test_stop = in_be32(&priv->regs[handle->num].gtbcr);
> +	test_stop &= TIMER_STOP;
> +	if (test_stop)
> +		return;

	if (in_be32(&priv->regs[handle->num].gtbcr) & TIMER_STOP)
		return;

> +	/* stop timer, prepare reset time */
> +	setbits32(&priv->regs[handle->num].gtbcr, TIMER_STOP);

"stop timer" is redundant.
What does "prepare reset time" mean here?

> +	/* get bcr time */

I can see that it reads "bcr".  "Get base time" would at least remind
the reader what bcr stands for.

> +	if (handle->cascade_handle) {
> +		u32 tmp_ticks;
> +
> +		tmp_ticks = in_be32(&priv->regs[handle->num].gtbcr);
> +		tmp_ticks &= ~TIMER_STOP;
> +		ticks = ((u64)tmp_ticks & UINT_MAX) * (u64)MAX_TICKS_CASCADE;

What does this "& UINT_MAX" accomplish?

> +		tmp_ticks = in_be32(&priv->regs[handle->num - 1].gtbcr);
> +		ticks += tmp_ticks;
> +	} else {
> +		ticks = in_be32(&priv->regs[handle->num].gtbcr);
> +		ticks &= ~TIMER_STOP;
> +	}
> +	convert_ticks_to_time(priv, ticks, &bcr_time);

It would be cleaner to calculate the suspend-freq base count when
configuring the timer, rather than try to work backwards from the
register values.

> +	/* get ccr time */
> +	mpic_get_remain_time(handle, &ccr_time);

> +	/* recalculate timer time */
> +	timerfreq = priv->timerfreq;
> +	priv->timerfreq = new_freq;
> +	mpic_reset_time(handle, &bcr_time, &ccr_time);
> +	priv->timerfreq = timerfreq;
> +}
> +
> +static void switch_group_timer(struct timer_group_priv *priv,
> +				unsigned int new_freq)

timer_group_set_freq()

> +{
> +	int i, num;
> +
> +	for (i = 0; i < TIMERS_PER_GROUP; i++) {
> +		num = TIMERS_PER_GROUP - 1 - i;
> +		/* cascade */
> +		if ((i + 1) < TIMERS_PER_GROUP &&
> +				priv->timer[num].cascade_handle) {

This comment is redundant -- "cascade" is already in "cascade_handle".

How about a comment explaining how you handle cascaded interrupts?

> +			do_switch_time(&priv->timer[num], new_freq);
> +			i++;
> +			continue;
> +		}

Are you sure it works to convert each timer of a cascade separately?

How is this different from letting the main loop do it, other than that
you don't test idle?  The do_switch_time() call is identical, the
increminting of i is identical...

I think you meant to call do_switch_time() on only one of the halves of
the cascade (I'm not sure which one) and skip the other, but that's not
what the code does.

> +		if (!test_bit(i, (unsigned long *)&priv->idle))
> +			do_switch_time(&priv->timer[num], new_freq);

That cast is broken.  If test_bit() wants a long, you need to provide an
actual long.  On 64-bit this code will be looking at "flags" rather than
"idle".

Is testing idle even necessary, or would TIMER_STOP do the job?

> +	}
> +}
> +
> +static int mpic_timer_suspend(void)
> +{
> +	struct timer_group_priv *priv;
> +	suspend_state_t pm_state;
> +
> +	pm_state = pm_suspend_state();
> +
> +	list_for_each_entry(priv, &timer_group_list, node) {
> +		/* timer not be used */
> +		if (priv->idle == 0xf)
> +			continue;

"The timer isn't used"

> +		switch (pm_state) {
> +		case PM_SUSPEND_STANDBY:
> +			break;

Unnecessary

> +		case PM_SUSPEND_MEM:
> +			if (!priv->suspended_timerfreq)
> +				continue;
> +
> +			/* will switch timers, a set of timer */
> +			switch_group_timer(priv, priv->suspended_timerfreq);

What does "a set of timer" mean here?  If you mean that it operates on a
timer group, please use the same terminology as elsewhere (plus it's
implied by the name of the function).

"will switch timers" is implied by the name of the function.  Comments
should explain things, not just restate them.

> +			/* Software: switch timerfreq to suspended freq */

What does "Software:" mean here?

> +			priv->resume_timerfreq = priv->timerfreq;
> +			priv->timerfreq = priv->suspended_timerfreq;

Why doesn't "switch_group_timer" handle this?

> +			break;
> +		default:
> +			break;

Unnecessary default

There's no need for a switch at all -- there's only one case in which
you're ever going to do anything here.

-Scott

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/powernv: reduce multi-hit of iommu_add_device()
From: Wei Yang @ 2014-04-30  1:31 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Wei Yang, Alexey Kardashevskiy, gwshan
In-Reply-To: <535FA4E9.9050403@ozlabs.ru>

On Tue, Apr 29, 2014 at 11:11:05PM +1000, Alexey Kardashevskiy wrote:
>On 04/29/2014 07:37 PM, Wei Yang wrote:
>> On Tue, Apr 29, 2014 at 05:55:48PM +1000, Alexey Kardashevskiy wrote:
>>> On 04/29/2014 04:49 PM, Wei Yang wrote:
>>>> On Mon, Apr 28, 2014 at 11:35:32PM +1000, Alexey Kardashevskiy wrote:
>>>>> On 04/23/2014 12:26 PM, Wei Yang wrote:
>>>>>> During the EEH hotplug event, iommu_add_device() will be invoked three times
>>>>>> and two of them will trigger warning or error.
>>>>>>
>>>>>> The three times to invoke the iommu_add_device() are:
>>>>>>
>>>>>>     pci_device_add
>>>>>>        ...
>>>>>>        set_iommu_table_base_and_group   <- 1st time, fail
>>>>>>     device_add
>>>>>>        ...
>>>>>>        tce_iommu_bus_notifier           <- 2nd time, succees
>>>>>>     pcibios_add_pci_devices
>>>>>>        ...
>>>>>>        pcibios_setup_bus_devices        <- 3rd time, re-attach
>>>>>>
>>>>>> The first time fails, since the dev->kobj->sd is not initialized. The
>>>>>> dev->kobj->sd is initialized in device_add().
>>>>>> The third time's warning is triggered by the re-attach of the iommu_group.
>>>>>>
>>>>>> After applying this patch, the error
>>>>>
>>>>> Nack.
>>>>>
>>>>> The patch still seems incorrect and we actually need to remove
>>>>> tce_iommu_bus_notifier completely as pcibios_setup_bus_devices is called
>>>> >from another notifier anyway. Could you please test it?
>>>>>
>>>>>
>>>>
>>>> Hi, Alexey,
>>>>
>>>> Nice to see your comment. Let me show what I got fist.
>>>>
>>>> Generally, when kernel enumerate on the pci device, following functions will
>>>> be invoked.
>>>>
>>>>      pci_device_add
>>>>         pcibios_setup_bus_device
>>>>         ...
>>>>         set_iommu_table_base_and_group   
>>>>      device_add
>>>>         ...
>>>>         tce_iommu_bus_notifier           
>>>>      pcibios_fixp_bus
>>>>         pcibios_add_pci_devices
>>>>         ...
>>>>         pcibios_setup_bus_devices        
>>>>
>>>> >From the call flow, we see for a normall pci device, the
>>>> pcibios_setup_bus_device() will be invoked twice.
>>>
>>>
>>> tce_iommu_bus_notifier should not be called for devices during boot-time
>>> PCI discovery as the discovery is done from subsys_initcall handler and
>>> tce_iommu_bus_notifier is registered as subsys_initcall_sync. Are you sure
>>> this is happening? You should see warnings as for PF's EEH but you do not
>>> say that.
>>>
>> 
>> Let me make it clearer. I list the call flow for general purpose to show the
>> sequence of the call flow.
>> 
>> The tce_iommu_bus_notifier is not invoked at the boot time. And none of them
>> do real thing at boot time.
>> 
>> I don't understand your last sentence. I see warning and error during EEH
>> hotplug. If necessary, I will add this in the commit log.
>> 
>>>
>>>> At the bootup time, none of them succeed to setup the dma, since the PE is not
>>>> assigned or the tbl is not set. The iommu tbl and group is setup in
>>>> pnv_pci_ioda_setup_DMA().
>>>>
>>>> This call flow maintains the same when EEH error happens on Bus PE, while the
>>>> behavior is a little different. 
>>>>
>>>>      pci_device_add
>>>>         pcibios_setup_bus_device
>>>>         ...
>>>>         set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>>>
>>>
>>> Sorry, what is uninitialized? The only kobj here is the one in iommu_group
>>> and it must have been taken care of before adding a device.
>> 
>> pci_dev->dev->kobj->sd.
>> 
>> I have explained this in previous discussion. This kobj->sd is initialized in
>> device_add().
>> 
>>>
>>>
>>>>      device_add
>>>>         ...
>>>>         tce_iommu_bus_notifier           <- succeed
>>>>      pcibios_fixp_bus
>>>>         pcibios_add_pci_devices
>>>>         ...
>>>>         pcibios_setup_bus_devices        <- warning, re-attach
>>>
>>>
>>> This is why I am suggesting getting rid of tce_iommu_bus_notifier - we will
>>> avoid the warning.
>> 
>> Then how about the first time's error?
>
>
>What do you call "first time error" here?
>

Would you please take a look at my commit log?

Currently, the iommu_group will be added three times. The error happens at the
first time we try to attatch the iommu_group in pci_device_add(). And yes,
this happens at the EEH recovery on PF for a hotplug case.

The error is: 

        iommu_tce: 0003:05:00.0 has not been added, ret=-14

And the reason is:
  
        pci_dev->dev->kobj->sd is not initialized.

>
>>>> While this call flow will change a little on a VF. For a VF,
>>>> pcibios_fixp_bus() will not be invoked. Current behavior is this.
>>>
>>>
>>> You meant pcibios_fixup_bus? I would expect it to get called (from
>>> pci_rescan_bus() or something like that?) and this seems to be the problem
>>> here.
>> 
>> When EEH happens and hotplug the pci bus, we need to do the pci_scan_bridge()
>> which will do the pcibios_fixup_bus().
>
>Are you talking now about EEH on PF (physical function) or EEH on VF
>(virtual function)?
>
>Are you calling pci_scan_bridge()
>

Yes, it is pcibios_add_pci_devices() do the hotplug and invoke the
pci_scan_bridge().

>
>> So you suggest to remove it? This is the generic code.
>
>
>I suggest removing tce_iommu_bus_notifier only. It must go. It was my
>mistake at the first place.
>
>
>>> How are VFs added in terms of code flow? Is there any git tree to look at?
>>>
>> 
>> VF add code flow is a generic code in drivers/pci/iov.c, virtfn_add().
>
>virtfn_add -> pci_device_add -> pcibios_add_device -> pcibios_setup_device
>-> ppc_md.pci_dma_dev_setup -> pnv_pci_dma_dev_setup ->
>pnv_pci_ioda_dma_dev_setup ->
>set_iommu_table_base.
>
>
>What part of this is missing?
>

Currently, you will do set_iommu_table_base_and_group() in
pnv_pci_ioda_dma_dev_setup(). And this will fail and return an error.

The error reason is the kobj->sd is not initialized yet.
I hope it is clear this time.

>
>> 
>>>
>>>
>>>>      pci_device_add
>>>>         pcibios_setup_bus_device
>>>>         ...
>>>>         set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>>>>      device_add
>>>>         ...
>>>>         tce_iommu_bus_notifier           <- succeed
>>>>
>>>> And if an EEH error happens just on a VF, I believe the same call flow should
>>>> go for recovery. (This is not set down, still under discussion with Gavin.)
>>>>
>>>> My conclusion is:
>>>> 1. remove the tce_iommu_bus_notifier completely will make the VF not work.
>>>>    So I choose to revert the code and attach make the iommu group attachment
>>>>    just happens in one place.
>>>>
>>>> BTW, I know my patch is not a perfect one. For a PF, the tbl will still be set
>>>> twice. I am not sure why we need to invoke pcibios_setup_device() twice on a
>>>> PF, maybe some platform need to fixup some thing after the pci_bus is added.
>>>> So I don't remove one of them to solve the problem.
>>>>
>>>> If you have a better idea, I am glad to take it.
>
>
>
>-- 
>Alexey

-- 
Richard Yang
Help you, Help me

^ permalink raw reply

* Re: [PATCH v2 00/21] FDT clean-ups and libfdt support
From: Stephen N Chivers @ 2014-04-30  0:33 UTC (permalink / raw)
  To: Rob Herring
  Cc: devicetree, linux-kernel-owner, Chris Zankel, Aurelien Jacquiot,
	linux-kernel, Ingo Molnar, H. Peter Anvin, Grant Likely,
	linuxppc-dev, James Hogan
In-Reply-To: <1398215901-25609-1-git-send-email-robherring2@gmail.com>

> From: Rob Herring <robherring2@gmail.com>
> To: Grant Likely <grant.likely@linaro.org>, linux-
> kernel@vger.kernel.org, devicetree@vger.kernel.org
> Cc: Rob Herring <robh@kernel.org>, Aurelien Jacquiot <a-
> jacquiot@ti.com>, Benjamin Herrenschmidt <benh@kernel.crashing.org>,
> Chris Zankel <chris@zankel.net>, "H. Peter Anvin" <hpa@zytor.com>, 
> Ingo Molnar <mingo@redhat.com>, James Hogan <james.hogan@imgtec.co>
> Date: 04/30/2014 09:45 AM
> Subject: [PATCH v2 00/21] FDT clean-ups and libfdt support
> Sent by: linux-kernel-owner@vger.kernel.org
> 
> From: Rob Herring <robh@kernel.org>
> 
> This is a series of clean-ups of architecture FDT code and converts the
> core FDT code over to using libfdt functions. This is in preparation
> to add FDT based address translation parsing functions for early
> console support. This series removes direct access to FDT data from all
> arches except powerpc.
> 
> The current MIPS lantiq and xlp DT code is buggy as built-in DTBs need
> to be copied out of init section. Patches 2 and 3 should be applied to
> 3.15.
> 
> Changes in v2 are relatively minor. There was a bug in the unflattening
> code where walking up the tree was not being handled correctly (thanks
> to Michal Simek). I re-worked things a bit to avoid globally adding
> libfdt include paths.
> 
> A branch is available here[1], and I plan to put into linux-next in a 
few
> days. Please test! I've compiled on arm, arm64, mips, microblaze, 
xtensa,
> and powerpc and booted on arm and arm64.
I have tested this for PowerPC using a snapshot of libfdt[1] collected 
from
the today (30/04/2014).

The computers used were 32 bit, Freescale and IBM/AMCC CPUs:

        MVME5100 - Motorola/Fresscale CPU - PPCBug firmware
        SAM440EP - IBM/AMCC - U-Boot firmware

Tested-by: Stephen Chivers <schivers@csc.com>

Stephen Chivers,
CSC Australia Pty. Ltd.

[1] git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git libfdt

^ permalink raw reply

* Re: [PATCH 0/3] Add new ptrace request macros on PowerPC
From: Michael Neuling @ 2014-04-30  0:29 UTC (permalink / raw)
  To: Anshuman Khandual; +Cc: Linux PPC dev, linux-kernel, avagin, roland, oleg
In-Reply-To: <535F997B.3000500@linux.vnet.ibm.com>

Anshuman Khandual <khandual@linux.vnet.ibm.com> wrote:

> On 04/29/2014 01:52 PM, Michael Neuling wrote:
> > That's not what that patch does. It shouldn't make any user visible changes
> > to DSCR or PPR.
> 
> It may not when it runs uninterrupted but after the tracee process has
> stopped, thread.dscr reflects the default DSCR value as mentioned
> before. This can be proved by changing the "dscr_default" value in
> arch/powerpc/sysfs.c file.

The intention with DSCR is that if the user changes the DSCR, the kernel
should always save/restore it.  If you are seeing something else, then
that is a bug.  Anton has a test case for this here:

  http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c

If that is failing, then there is a bug that we need to fix.

The PPR is the same, except that the kernel can change it over a
syscall.

> > Over syscall PPR and DSCR may change.

Sorry, this should be only PPR.  DSCR shouldn't change over a syscall,
at least that's the intention.

> > Depending on your test case, that may
> > be your problem.
> 
> I would guess when the tracee process stops for ptrace analysis, tm_reclaim or
> tm_recheckpoint path might be crossed which is causing this dscr_default value
> to go into thread_struct.

That shouldn't happen.  If that's happening, it's a bug.

Mikey

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/powernv: reduce multi-hit of iommu_add_device()
From: Gavin Shan @ 2014-04-30  0:28 UTC (permalink / raw)
  To: Wei Yang; +Cc: linuxppc-dev, Alexey Kardashevskiy, gwshan
In-Reply-To: <20140429064955.GA5066@richard>

On Tue, Apr 29, 2014 at 02:49:55PM +0800, Wei Yang wrote:
>On Mon, Apr 28, 2014 at 11:35:32PM +1000, Alexey Kardashevskiy wrote:
>>On 04/23/2014 12:26 PM, Wei Yang wrote:

.../...

>Generally, when kernel enumerate on the pci device, following functions will
>be invoked.
>
>     pci_device_add
>        pcibios_setup_bus_device
>        ...
>        set_iommu_table_base_and_group   
>     device_add
>        ...
>        tce_iommu_bus_notifier           
>     pcibios_fixup_bus
>        pcibios_add_pci_devices
>        ...
>        pcibios_setup_bus_devices        
>
>From the call flow, we see for a normall pci device, the
>pcibios_setup_bus_device() will be invoked twice.
>
>At the bootup time, none of them succeed to setup the dma, since the PE is not
>assigned or the tbl is not set. The iommu tbl and group is setup in
>pnv_pci_ioda_setup_DMA().
>

Yes, we don't assign PE# for PCI devices until ppc_md.pcibios_fixup().
We gets IOMMU group and IOMMU group device registered in ppc_md.pcibios_fixup().

As Alexy already pointed out, "tce_iommu_bus_notifier" doesn't take effect
during system boot stage.

>This call flow maintains the same when EEH error happens on Bus PE, while the
>behavior is a little different. 
>
>     pci_device_add
>        pcibios_setup_bus_device
>        ...
>        set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>     device_add
>        ...
>        tce_iommu_bus_notifier           <- succeed
>     pcibios_fixp_bus
>        pcibios_add_pci_devices
>        ...
>        pcibios_setup_bus_devices        <- warning, re-attach
>
>While this call flow will change a little on a VF. For a VF,
>pcibios_fixp_bus() will not be invoked. Current behavior is this.
>
>     pci_device_add
>        pcibios_setup_bus_device
>        ...
>        set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>     device_add
>        ...
>        tce_iommu_bus_notifier           <- succeed
>

It seems that we have 2 problems here:

- For non-SRIOV case, pcibios_setup_device() is called for towice. That
  seems incorrect. We could simply remove pcibios_setup_bus_devices()
  from pcibios_fixup_bus().

- It's too early to register IOMMU group/device in pnv_pci_ioda_dma_dev_setup()
  because the sysfs entries of the PCI device aren't finalized yet. So we could
  remove all logic we have in pnv_pci_ioda_dma_dev_setup() and just purely rely
  on "tce_iommu_bus_notifier".

By the way, I never tried EEH on SRIOV PF/VFs. However, I never hit similar
issue in non-SRIOV cases.

Thanks,
Gavin

^ permalink raw reply

* [PATCH 2/2] powerpc: fix smp_processor_id() in preemptible splat in set_breakpoint
From: Paul Gortmaker @ 2014-04-29 19:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras
  Cc: Paul Gortmaker, linuxppc-dev, Steven Rostedt
In-Reply-To: <1398799517-47879-1-git-send-email-paul.gortmaker@windriver.com>

Currently, on 8641D, which doesn't set CONFIG_HAVE_HW_BREAKPOINT
we get the following splat:

BUG: using smp_processor_id() in preemptible [00000000] code: login/1382
caller is set_breakpoint+0x1c/0xa0
CPU: 0 PID: 1382 Comm: login Not tainted 3.15.0-rc3-00041-g2aafe1a4d451 #1
Call Trace:
[decd5d80] [c0008dc4] show_stack+0x50/0x158 (unreliable)
[decd5dc0] [c03c6fa0] dump_stack+0x7c/0xdc
[decd5de0] [c01f8818] check_preemption_disabled+0xf4/0x104
[decd5e00] [c00086b8] set_breakpoint+0x1c/0xa0
[decd5e10] [c00d4530] flush_old_exec+0x2bc/0x588
[decd5e40] [c011c468] load_elf_binary+0x2ac/0x1164
[decd5ec0] [c00d35f8] search_binary_handler+0xc4/0x1f8
[decd5ef0] [c00d4ee8] do_execve+0x3d8/0x4b8
[decd5f40] [c001185c] ret_from_syscall+0x0/0x38
 --- Exception: c01 at 0xfeee554
    LR = 0xfeee7d4

The call path in this case is:

	flush_thread
	   --> set_debug_reg_defaults
	     --> set_breakpoint
	       --> __get_cpu_var

Since preemption is enabled in the cleanup of flush thread, and
there is no need to disable it, introduce the distinction between
set_breakpoint and __set_breakpoint, leaving only the flush_thread
instance as the current user of set_breakpoint.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 arch/powerpc/include/asm/debug.h         |  1 +
 arch/powerpc/include/asm/hw_breakpoint.h |  2 +-
 arch/powerpc/kernel/hw_breakpoint.c      |  8 ++++----
 arch/powerpc/kernel/process.c            | 11 +++++++++--
 arch/powerpc/kernel/signal.c             |  2 +-
 arch/powerpc/xmon/xmon.c                 |  2 +-
 6 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 1d7f966d3b18..a954e4975049 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -47,6 +47,7 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
 void set_breakpoint(struct arch_hw_breakpoint *brk);
+void __set_breakpoint(struct arch_hw_breakpoint *brk);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
 			 unsigned long error_code, int signal_code, int brkpt);
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h b/arch/powerpc/include/asm/hw_breakpoint.h
index eb0f4ac75c4c..ac6432d9be46 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -79,7 +79,7 @@ static inline void hw_breakpoint_disable(void)
 	brk.address = 0;
 	brk.type = 0;
 	brk.len = 0;
-	set_breakpoint(&brk);
+	__set_breakpoint(&brk);
 }
 extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
 
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index b0a1792279bb..0bb5918faaaf 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -72,7 +72,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 	 * If so, DABR will be populated in single_step_dabr_instruction().
 	 */
 	if (current->thread.last_hit_ubp != bp)
-		set_breakpoint(info);
+		__set_breakpoint(info);
 
 	return 0;
 }
@@ -198,7 +198,7 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 
 	info = counter_arch_bp(tsk->thread.last_hit_ubp);
 	regs->msr &= ~MSR_SE;
-	set_breakpoint(info);
+	__set_breakpoint(info);
 	tsk->thread.last_hit_ubp = NULL;
 }
 
@@ -284,7 +284,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
 	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
 		perf_bp_event(bp, regs);
 
-	set_breakpoint(info);
+	__set_breakpoint(info);
 out:
 	rcu_read_unlock();
 	return rc;
@@ -316,7 +316,7 @@ int __kprobes single_step_dabr_instruction(struct die_args *args)
 	if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
 		perf_bp_event(bp, regs);
 
-	set_breakpoint(info);
+	__set_breakpoint(info);
 	current->thread.last_hit_ubp = NULL;
 
 	/*
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index c4e3d50e86f8..bc2d668b68ed 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -495,7 +495,7 @@ static inline int set_dawr(struct arch_hw_breakpoint *brk)
 	return 0;
 }
 
-void set_breakpoint(struct arch_hw_breakpoint *brk)
+void __set_breakpoint(struct arch_hw_breakpoint *brk)
 {
 	__get_cpu_var(current_brk) = *brk;
 
@@ -505,6 +505,13 @@ void set_breakpoint(struct arch_hw_breakpoint *brk)
 		set_dabr(brk);
 }
 
+void set_breakpoint(struct arch_hw_breakpoint *brk)
+{
+	preempt_disable();
+	__set_breakpoint(brk);
+	preempt_enable();
+}
+
 #ifdef CONFIG_PPC64
 DEFINE_PER_CPU(struct cpu_usage, cpu_usage_array);
 #endif
@@ -834,7 +841,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
  */
 #ifndef CONFIG_HAVE_HW_BREAKPOINT
 	if (unlikely(!hw_brk_match(&__get_cpu_var(current_brk), &new->thread.hw_brk)))
-		set_breakpoint(&new->thread.hw_brk);
+		__set_breakpoint(&new->thread.hw_brk);
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 #endif
 
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 8fc4177ed65a..1c794cef2883 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -134,7 +134,7 @@ static int do_signal(struct pt_regs *regs)
 	 */
 	if (current->thread.hw_brk.address &&
 		current->thread.hw_brk.type)
-		set_breakpoint(&current->thread.hw_brk);
+		__set_breakpoint(&current->thread.hw_brk);
 #endif
 	/* Re-enable the breakpoints for the signal stack */
 	thread_change_pc(current, regs);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 08504e75b2c7..d3759b7a5535 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -759,7 +759,7 @@ static void insert_cpu_bpts(void)
 		brk.address = dabr.address;
 		brk.type = (dabr.enabled & HW_BRK_TYPE_DABR) | HW_BRK_TYPE_PRIV_ALL;
 		brk.len = 8;
-		set_breakpoint(&brk);
+		__set_breakpoint(&brk);
 	}
 	if (iabr && cpu_has_feature(CPU_FTR_IABR))
 		mtspr(SPRN_IABR, iabr->address
-- 
1.9.0

^ permalink raw reply related

* [PATCH 0/2] Fix smp_processor_id() in preemptible splat
From: Paul Gortmaker @ 2014-04-29 19:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras
  Cc: Paul Gortmaker, linuxppc-dev, Steven Rostedt

This seems to have been around for a while; I tripped over it while
working on testing 3.10-rt with Steven, and then confirmed it still
was an issue on today's mainline. (didn't check -next however...)

I didn't bisect, but it looks like it may have come in from the
changes in v3.9-rc1~100^2~57.  It probably only impacts the older
cores like the 74xx that don't have CONFIG_HAVE_HW_BREAKPOINT,
and I know I haven't booted my sbc8641D in over a year, so no
surprise this went undetected for a while.  Maybe adding a Cc:
stable is warranted?

It is pretty hard to ignore the splats since you get one per process,
so it must be just the older platform never getting much testing.

The 1st patch is just a small cleanup that makes the 2nd patch
a bit smaller.

If the __set_breakpoint() addition isn't liked, then we could
alternatively just wrap the one call site with preempt_dis/en/able.

Paul.
---

Paul Gortmaker (2):
  powerpc: drop return value from set_breakpoint as it is unused
  powerpc: fix smp_processor_id() in preemptible splat in set_breakpoint

 arch/powerpc/include/asm/debug.h         |  3 ++-
 arch/powerpc/include/asm/hw_breakpoint.h |  2 +-
 arch/powerpc/kernel/hw_breakpoint.c      |  8 ++++----
 arch/powerpc/kernel/process.c            | 15 +++++++++++----
 arch/powerpc/kernel/signal.c             |  2 +-
 arch/powerpc/xmon/xmon.c                 |  2 +-
 6 files changed, 20 insertions(+), 12 deletions(-)

-- 
1.9.0

^ permalink raw reply

* [PATCH] powerpc: memcpy optimization for 64bit LE
From: Anton Blanchard @ 2014-04-29 23:12 UTC (permalink / raw)
  To: benh, paulus, felix; +Cc: linuxppc-dev
In-Reply-To: <20140430091054.4de84c9b@kryten>

From: Philippe Bergheaud <felix@linux.vnet.ibm.com>

Unaligned stores take alignment exceptions on POWER7 running in little-endian.
This is a dumb little-endian base memcpy that prevents unaligned stores.
Once booted the feature fixup code switches over to the VMX copy loops
(which are already endian safe).

The question is what we do before that switch over. The base 64bit
memcpy takes alignment exceptions on POWER7 so we can't use it as is.
Fixing the causes of alignment exception would slow it down, because
we'd need to ensure all loads and stores are aligned either through
rotate tricks or bytewise loads and stores. Either would be bad for
all other 64bit platforms.

[ I simplified the loop a bit - Anton ]

Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/string.h |  4 ----
 arch/powerpc/kernel/ppc_ksyms.c   |  2 --
 arch/powerpc/lib/Makefile         |  2 --
 arch/powerpc/lib/memcpy_64.S      | 16 ++++++++++++++++
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 0dffad6..e40010a 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -10,9 +10,7 @@
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_STRCAT
 #define __HAVE_ARCH_MEMSET
-#ifdef __BIG_ENDIAN__
 #define __HAVE_ARCH_MEMCPY
-#endif
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_MEMCMP
 #define __HAVE_ARCH_MEMCHR
@@ -24,9 +22,7 @@ extern int strcmp(const char *,const char *);
 extern int strncmp(const char *, const char *, __kernel_size_t);
 extern char * strcat(char *, const char *);
 extern void * memset(void *,int,__kernel_size_t);
-#ifdef __BIG_ENDIAN__
 extern void * memcpy(void *,const void *,__kernel_size_t);
-#endif
 extern void * memmove(void *,const void *,__kernel_size_t);
 extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 3bd77ed..8de1c48 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -154,9 +154,7 @@ EXPORT_SYMBOL(__cmpdi2);
 #endif
 long long __bswapdi2(long long);
 EXPORT_SYMBOL(__bswapdi2);
-#ifdef __BIG_ENDIAN__
 EXPORT_SYMBOL(memcpy);
-#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memcmp);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 95a20e1..59fa2de 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -23,9 +23,7 @@ obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
 endif
 
-ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),)
 obj-$(CONFIG_PPC64)		+= memcpy_power7.o memcpy_64.o 
-endif
 
 obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o
 
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 72ad055..dc4ba79 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -12,12 +12,27 @@
 	.align	7
 _GLOBAL(memcpy)
 BEGIN_FTR_SECTION
+#ifdef __LITTLE_ENDIAN__
+	cmpdi	cr7,r5,0
+#else
 	std	r3,48(r1)	/* save destination pointer for return value */
+#endif
 FTR_SECTION_ELSE
 #ifndef SELFTEST
 	b	memcpy_power7
 #endif
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
+#ifdef __LITTLE_ENDIAN__
+	/* dumb little-endian memcpy that will get replaced at runtime */
+	addi r9,r3,-1
+	addi r4,r4,-1
+	beqlr cr7
+	mtctr r5
+1:	lbzu r10,1(r4)
+	stbu r10,1(r9)
+	bdnz 1b
+	blr
+#else
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
@@ -203,3 +218,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	stb	r0,0(r3)
 4:	ld	r3,48(r1)	/* return dest pointer */
 	blr
+#endif
-- 
1.9.1

^ permalink raw reply related

* [PATCH] powerpc: memcpy optimization for 64bit LE
From: Anton Blanchard @ 2014-04-29 23:10 UTC (permalink / raw)
  To: benh, paulus, felix, linuxppc-dev

Unaligned stores take alignment exceptions on POWER7 running in little-endian.
This is a dumb little-endian base memcpy that prevents unaligned stores.
Once booted the feature fixup code switches over to the VMX copy loops
(which are already endian safe).

The question is what we do before that switch over. The base 64bit
memcpy takes alignment exceptions on POWER7 so we can't use it as is.
Fixing the causes of alignment exception would slow it down, because
we'd need to ensure all loads and stores are aligned either through
rotate tricks or bytewise loads and stores. Either would be bad for
all other 64bit platforms.

[ I simplified the loop a bit - Anton ]

Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
---
 arch/powerpc/include/asm/string.h |  4 ----
 arch/powerpc/kernel/ppc_ksyms.c   |  2 --
 arch/powerpc/lib/Makefile         |  2 --
 arch/powerpc/lib/memcpy_64.S      | 16 ++++++++++++++++
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index 0dffad6..e40010a 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -10,9 +10,7 @@
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_STRCAT
 #define __HAVE_ARCH_MEMSET
-#ifdef __BIG_ENDIAN__
 #define __HAVE_ARCH_MEMCPY
-#endif
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_MEMCMP
 #define __HAVE_ARCH_MEMCHR
@@ -24,9 +22,7 @@ extern int strcmp(const char *,const char *);
 extern int strncmp(const char *, const char *, __kernel_size_t);
 extern char * strcat(char *, const char *);
 extern void * memset(void *,int,__kernel_size_t);
-#ifdef __BIG_ENDIAN__
 extern void * memcpy(void *,const void *,__kernel_size_t);
-#endif
 extern void * memmove(void *,const void *,__kernel_size_t);
 extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 3bd77ed..8de1c48 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -154,9 +154,7 @@ EXPORT_SYMBOL(__cmpdi2);
 #endif
 long long __bswapdi2(long long);
 EXPORT_SYMBOL(__bswapdi2);
-#ifdef __BIG_ENDIAN__
 EXPORT_SYMBOL(memcpy);
-#endif
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memcmp);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 95a20e1..59fa2de 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -23,9 +23,7 @@ obj-y			+= checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC64)	+= checksum_wrappers_64.o
 endif
 
-ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),)
 obj-$(CONFIG_PPC64)		+= memcpy_power7.o memcpy_64.o 
-endif
 
 obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o
 
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 72ad055..dc4ba79 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -12,12 +12,27 @@
 	.align	7
 _GLOBAL(memcpy)
 BEGIN_FTR_SECTION
+#ifdef __LITTLE_ENDIAN__
+	cmpdi	cr7,r5,0
+#else
 	std	r3,48(r1)	/* save destination pointer for return value */
+#endif
 FTR_SECTION_ELSE
 #ifndef SELFTEST
 	b	memcpy_power7
 #endif
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
+#ifdef __LITTLE_ENDIAN__
+	/* dumb little-endian memcpy that will get replaced at runtime */
+	addi r9,r3,-1
+	addi r4,r4,-1
+	beqlr cr7
+	mtctr r5
+1:	lbzu r10,1(r4)
+	stbu r10,1(r9)
+	bdnz 1b
+	blr
+#else
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
@@ -203,3 +218,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	stb	r0,0(r3)
 4:	ld	r3,48(r1)	/* return dest pointer */
 	blr
+#endif
-- 
1.9.1

^ permalink raw reply related

* [PATCH 1/2] powerpc: drop return value from set_breakpoint as it is unused
From: Paul Gortmaker @ 2014-04-29 19:25 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras
  Cc: Paul Gortmaker, linuxppc-dev, Steven Rostedt
In-Reply-To: <1398799517-47879-1-git-send-email-paul.gortmaker@windriver.com>

None of the callers check the return value, so it might as
well not have one at all.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 arch/powerpc/include/asm/debug.h | 2 +-
 arch/powerpc/kernel/process.c    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index d2516308ed1e..1d7f966d3b18 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -46,7 +46,7 @@ static inline int debugger_break_match(struct pt_regs *regs) { return 0; }
 static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
-int set_breakpoint(struct arch_hw_breakpoint *brk);
+void set_breakpoint(struct arch_hw_breakpoint *brk);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
 			 unsigned long error_code, int signal_code, int brkpt);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 31d021506d21..c4e3d50e86f8 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -495,14 +495,14 @@ static inline int set_dawr(struct arch_hw_breakpoint *brk)
 	return 0;
 }
 
-int set_breakpoint(struct arch_hw_breakpoint *brk)
+void set_breakpoint(struct arch_hw_breakpoint *brk)
 {
 	__get_cpu_var(current_brk) = *brk;
 
 	if (cpu_has_feature(CPU_FTR_DAWR))
-		return set_dawr(brk);
-
-	return set_dabr(brk);
+		set_dawr(brk);
+	else
+		set_dabr(brk);
 }
 
 #ifdef CONFIG_PPC64
-- 
1.9.0

^ permalink raw reply related

* Re: [PATCH v2 1/2] powerpc/pm: add api to get suspend state which is STANDBY or MEM
From: Rafael J. Wysocki @ 2014-04-29 23:07 UTC (permalink / raw)
  To: Scott Wood
  Cc: Zhao Chenhui, linux-pm, Dongsheng Wang, Leo Li,
	正雄 金, linuxppc-dev
In-Reply-To: <1398811632.24575.98.camel@snotra.buserror.net>

On Tuesday, April 29, 2014 05:47:12 PM Scott Wood wrote:
> On Mon, 2014-04-28 at 13:53 +0800, Leo Li wrote:
> > On Sat, Apr 26, 2014 at 5:45 AM, Scott Wood <scottwood@freescale.com> wrote:
> > > On Thu, 2014-04-24 at 14:11 +0800, Dongsheng Wang wrote:
> > >> From: Wang Dongsheng <dongsheng.wang@freescale.com>
> > >>
> > >> Add set_pm_suspend_state & pm_suspend_state functions to set/get
> > >> suspend state. When system going to sleep or deep sleep, devices
> > >> can get the system suspend state(STANDBY/MEM) through pm_suspend_state
> > >> function and to handle different situations.
> > >>
> > >> Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
> > >> ---
> > >> *v2*
> > >> Move pm api from fsl platform to powerpc general framework.
> > >
> > > What is powerpc-specific about this?
> > 
> > Generally I agree with you.  But I had the discussion about this topic
> > a while ago with the PM maintainer.  He suggestion to go with the
> > platform way.
> > 
> > https://lkml.org/lkml/2013/8/16/505
> 
> If what he meant was whether you could do what this patch does, then you
> can answer him with, "No, because it got nacked as not being platform or
> arch specific."  Oh, and you're still using .valid as the hook to set
> the platform state, which is awful -- I think .begin is what you want to
> use.
> 
> If we did it in powerpc code, then what would we do on ARM?  Copy the
> code?  No.
> 
> Now, a more legitimate objection to putting it in generic code might be
> that "standby" and "mem" are loosely defined and the knowledge of how a
> driver should react to each is platform specific -- but your patch
> doesn't address that.  You still have the driver itself interpret what
> "standby" and "mem" mean.
> 
> As for "in analogy with ACPI suspend operations", can someone familiar
> with ACPI explain what is meant?

ACPI defines sleep states S3 and S1 which are mappend to "mem" and
"standby" currently, but that may change in the future.

Generally speaking the meaning of "mem" and "standby" is platform-specific
except that "mem" should be a deeper (lower-power) sleep state than "standby".

Thanks!

-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.

^ permalink raw reply

* Re: [PATCH v2 1/2] powerpc/pm: add api to get suspend state which is STANDBY or MEM
From: Scott Wood @ 2014-04-29 22:47 UTC (permalink / raw)
  To: Leo Li
  Cc: Zhao Chenhui, linux-pm, Rafael J. Wysocki, Dongsheng Wang,
	正雄 金, linuxppc-dev
In-Reply-To: <CADRPPNTGRyLSSjcdpdZL95NVnRepR_ZqvVk1zrMU3ZOL0_bbNQ@mail.gmail.com>

On Mon, 2014-04-28 at 13:53 +0800, Leo Li wrote:
> On Sat, Apr 26, 2014 at 5:45 AM, Scott Wood <scottwood@freescale.com> wrote:
> > On Thu, 2014-04-24 at 14:11 +0800, Dongsheng Wang wrote:
> >> From: Wang Dongsheng <dongsheng.wang@freescale.com>
> >>
> >> Add set_pm_suspend_state & pm_suspend_state functions to set/get
> >> suspend state. When system going to sleep or deep sleep, devices
> >> can get the system suspend state(STANDBY/MEM) through pm_suspend_state
> >> function and to handle different situations.
> >>
> >> Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
> >> ---
> >> *v2*
> >> Move pm api from fsl platform to powerpc general framework.
> >
> > What is powerpc-specific about this?
> 
> Generally I agree with you.  But I had the discussion about this topic
> a while ago with the PM maintainer.  He suggestion to go with the
> platform way.
> 
> https://lkml.org/lkml/2013/8/16/505

If what he meant was whether you could do what this patch does, then you
can answer him with, "No, because it got nacked as not being platform or
arch specific."  Oh, and you're still using .valid as the hook to set
the platform state, which is awful -- I think .begin is what you want to
use.

If we did it in powerpc code, then what would we do on ARM?  Copy the
code?  No.

Now, a more legitimate objection to putting it in generic code might be
that "standby" and "mem" are loosely defined and the knowledge of how a
driver should react to each is platform specific -- but your patch
doesn't address that.  You still have the driver itself interpret what
"standby" and "mem" mean.

As for "in analogy with ACPI suspend operations", can someone familiar
with ACPI explain what is meant?

-Scott

^ permalink raw reply

* Re: [PATCH] powerpc/fsl-rio: Fix fsl_rio_setup error paths and use-after-unmap
From: Scott Wood @ 2014-04-29 17:12 UTC (permalink / raw)
  To: Liu Gang-B34182; +Cc: linuxppc-dev@lists.ozlabs.org
In-Reply-To: <dd70c20406d242b694b16e898cc2cddf@BY2PR03MB412.namprd03.prod.outlook.com>

On Mon, 2014-04-28 at 23:04 -0500, Liu Gang-B34182 wrote:
> > -----Original Message-----
> > From: Wood Scott-B07421
> > Sent: Tuesday, April 29, 2014 9:32 AM
> > To: linuxppc-dev@lists.ozlabs.org
> > Cc: Wood Scott-B07421; Liu Gang-B34182
> > Subject: [PATCH] powerpc/fsl-rio: Fix fsl_rio_setup error paths and use-
> > after-unmap
> > 
> > Several of the error paths from fsl_rio_setup are missing error messages.
> > 
> > Worse, fsl_rio_setup initializes several global pointers and does not
> > NULL them out after freeing/unmapping on error.  This caused
> > fsl_rio_mcheck_exception() to crash when accessing rio_regs_win which was
> > non-NULL but had been unmapped.
> > 
> > Signed-off-by: Scott Wood <scottwood@freescale.com>
> > Cc: Liu Gang <Gang.Liu@freescale.com>
> > ---
> > Liu Gang, are you sure all of these error conditions are fatal?  Why does
> > the rio driver fail if rmu is not present (e.g.  on t4240)?
> 
> Hi Scott, I think the errors you modified in the patch are serious and should
> be fixed. Thanks very much!
> And in fact, the rio driver can be used just for the submodule of the SRIO: RMU.
> It should be used with arch/powerpc/sysdev/fsl_rmu.c and there should have the
> RMU module.
> The fsl_rio.c implements some basic and needed works to support the RMU running well.

I don't quite follow -- is it expected that rio can work without rmu, or
not?  As is, fsl_rio_setup() will error out if it doesn't find an
fsl,srio-rmu-handle property.

> In addition, could you please help to apply the following patch:
> 
> http://patchwork.ozlabs.org/patch/337810/

Yes, I'll apply it when I do my next batch of patch applications.

-Scott

^ permalink raw reply

* Re: [PATCH 1/1] powerpc: crtsaveres.o needed only when -Os flag is enabled
From: Brian W Hart @ 2014-04-29 15:38 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <1398729908-15787-1-git-send-email-linuxram@us.ibm.com>

On Mon, Apr 28, 2014 at 05:05:08PM -0700, Ram Pai wrote:
>     powerpc: crtsaveres.o needed only when -Os flag is enabled
>     
>     Currently on powerpc arch, out-of-tree module fails to build without
>     crtsaveres.o, even when the module has no dependency on the symbols
>     provided by the file; when built without the -Os flag.
>     
>     BTW: '-Os' flag is enabled when CONFIG_CC_OPTIMIZE_FOR_SIZE is
>     configured.
>     
>     This patch fixes that problem.
>     
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> 
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index 4c0cedf..cf12f38 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -157,7 +157,10 @@ CPP		= $(CC) -E $(KBUILD_CFLAGS)
>  
>  CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
>  
> +ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
>  KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o
> +endif
> +
>  
>  # No AltiVec or VSX instructions when building kernel
>  KBUILD_CFLAGS += $(call cc-option,-mno-altivec)

I didn't try building a kernel or in-tree modules, but I confirmed
that it allows building of out-of-tree modules when crtsavres.o is
not present (e.g. as for a distro install where the kernel headers
are provided by package, rather than being manually prepared from
the sources).

Tested-by: Brian W Hart <hartb@linux.vnet.ibm.com>

^ permalink raw reply

* Re: [PATCH v2 00/21] FDT clean-ups and libfdt support
From: Grant Likely @ 2014-04-29 14:00 UTC (permalink / raw)
  To: Rob Herring, linux-kernel, devicetree
  Cc: linux-mips, Aurelien Jacquiot, H. Peter Anvin, Max Filippov,
	Paul Mackerras, linux, Jonas Bonn, Rob Herring, Russell King,
	linux-c6x-dev, x86, Ingo Molnar, Mark Salter, linux-xtensa,
	James Hogan, Thomas Gleixner, linux-metag, linux-arm-kernel,
	Chris Zankel, Michal Simek, Vineet Gupta, Ralf Baechle,
	linuxppc-dev
In-Reply-To: <1398215901-25609-1-git-send-email-robherring2@gmail.com>

On Tue, 22 Apr 2014 20:18:00 -0500, Rob Herring <robherring2@gmail.com> wrote:
> From: Rob Herring <robh@kernel.org>
> 
> This is a series of clean-ups of architecture FDT code and converts the
> core FDT code over to using libfdt functions. This is in preparation
> to add FDT based address translation parsing functions for early
> console support. This series removes direct access to FDT data from all
> arches except powerpc.
> 
> The current MIPS lantiq and xlp DT code is buggy as built-in DTBs need
> to be copied out of init section. Patches 2 and 3 should be applied to
> 3.15.
> 
> Changes in v2 are relatively minor. There was a bug in the unflattening
> code where walking up the tree was not being handled correctly (thanks
> to Michal Simek). I re-worked things a bit to avoid globally adding
> libfdt include paths.
> 
> A branch is available here[1], and I plan to put into linux-next in a few
> days. Please test! I've compiled on arm, arm64, mips, microblaze, xtensa,
> and powerpc and booted on arm and arm64.

This is pretty great work. I'll read through them again and I may have a
comment or two, but in general you can add my tested by tag:

Tested-by: Grant Likely <grant.likely@linaro.org>

>  40 files changed, 280 insertions(+), 598 deletions(-)

I love the diffstat!

g.

^ permalink raw reply

* Re: [PATCH 1/2] powerpc/powernv: reduce multi-hit of iommu_add_device()
From: Alexey Kardashevskiy @ 2014-04-29 13:11 UTC (permalink / raw)
  To: Wei Yang, Alexey Kardashevskiy; +Cc: linuxppc-dev, gwshan
In-Reply-To: <20140429093719.GB8028@richard>

On 04/29/2014 07:37 PM, Wei Yang wrote:
> On Tue, Apr 29, 2014 at 05:55:48PM +1000, Alexey Kardashevskiy wrote:
>> On 04/29/2014 04:49 PM, Wei Yang wrote:
>>> On Mon, Apr 28, 2014 at 11:35:32PM +1000, Alexey Kardashevskiy wrote:
>>>> On 04/23/2014 12:26 PM, Wei Yang wrote:
>>>>> During the EEH hotplug event, iommu_add_device() will be invoked three times
>>>>> and two of them will trigger warning or error.
>>>>>
>>>>> The three times to invoke the iommu_add_device() are:
>>>>>
>>>>>     pci_device_add
>>>>>        ...
>>>>>        set_iommu_table_base_and_group   <- 1st time, fail
>>>>>     device_add
>>>>>        ...
>>>>>        tce_iommu_bus_notifier           <- 2nd time, succees
>>>>>     pcibios_add_pci_devices
>>>>>        ...
>>>>>        pcibios_setup_bus_devices        <- 3rd time, re-attach
>>>>>
>>>>> The first time fails, since the dev->kobj->sd is not initialized. The
>>>>> dev->kobj->sd is initialized in device_add().
>>>>> The third time's warning is triggered by the re-attach of the iommu_group.
>>>>>
>>>>> After applying this patch, the error
>>>>
>>>> Nack.
>>>>
>>>> The patch still seems incorrect and we actually need to remove
>>>> tce_iommu_bus_notifier completely as pcibios_setup_bus_devices is called
>>> >from another notifier anyway. Could you please test it?
>>>>
>>>>
>>>
>>> Hi, Alexey,
>>>
>>> Nice to see your comment. Let me show what I got fist.
>>>
>>> Generally, when kernel enumerate on the pci device, following functions will
>>> be invoked.
>>>
>>>      pci_device_add
>>>         pcibios_setup_bus_device
>>>         ...
>>>         set_iommu_table_base_and_group   
>>>      device_add
>>>         ...
>>>         tce_iommu_bus_notifier           
>>>      pcibios_fixp_bus
>>>         pcibios_add_pci_devices
>>>         ...
>>>         pcibios_setup_bus_devices        
>>>
>>> >From the call flow, we see for a normall pci device, the
>>> pcibios_setup_bus_device() will be invoked twice.
>>
>>
>> tce_iommu_bus_notifier should not be called for devices during boot-time
>> PCI discovery as the discovery is done from subsys_initcall handler and
>> tce_iommu_bus_notifier is registered as subsys_initcall_sync. Are you sure
>> this is happening? You should see warnings as for PF's EEH but you do not
>> say that.
>>
> 
> Let me make it clearer. I list the call flow for general purpose to show the
> sequence of the call flow.
> 
> The tce_iommu_bus_notifier is not invoked at the boot time. And none of them
> do real thing at boot time.
> 
> I don't understand your last sentence. I see warning and error during EEH
> hotplug. If necessary, I will add this in the commit log.
> 
>>
>>> At the bootup time, none of them succeed to setup the dma, since the PE is not
>>> assigned or the tbl is not set. The iommu tbl and group is setup in
>>> pnv_pci_ioda_setup_DMA().
>>>
>>> This call flow maintains the same when EEH error happens on Bus PE, while the
>>> behavior is a little different. 
>>>
>>>      pci_device_add
>>>         pcibios_setup_bus_device
>>>         ...
>>>         set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>>
>>
>> Sorry, what is uninitialized? The only kobj here is the one in iommu_group
>> and it must have been taken care of before adding a device.
> 
> pci_dev->dev->kobj->sd.
> 
> I have explained this in previous discussion. This kobj->sd is initialized in
> device_add().
> 
>>
>>
>>>      device_add
>>>         ...
>>>         tce_iommu_bus_notifier           <- succeed
>>>      pcibios_fixp_bus
>>>         pcibios_add_pci_devices
>>>         ...
>>>         pcibios_setup_bus_devices        <- warning, re-attach
>>
>>
>> This is why I am suggesting getting rid of tce_iommu_bus_notifier - we will
>> avoid the warning.
> 
> Then how about the first time's error?


What do you call "first time error" here?


>>> While this call flow will change a little on a VF. For a VF,
>>> pcibios_fixp_bus() will not be invoked. Current behavior is this.
>>
>>
>> You meant pcibios_fixup_bus? I would expect it to get called (from
>> pci_rescan_bus() or something like that?) and this seems to be the problem
>> here.
> 
> When EEH happens and hotplug the pci bus, we need to do the pci_scan_bridge()
> which will do the pcibios_fixup_bus().

Are you talking now about EEH on PF (physical function) or EEH on VF
(virtual function)?

Are you calling pci_scan_bridge()


> So you suggest to remove it? This is the generic code.


I suggest removing tce_iommu_bus_notifier only. It must go. It was my
mistake at the first place.


>> How are VFs added in terms of code flow? Is there any git tree to look at?
>>
> 
> VF add code flow is a generic code in drivers/pci/iov.c, virtfn_add().

virtfn_add -> pci_device_add -> pcibios_add_device -> pcibios_setup_device
-> ppc_md.pci_dma_dev_setup -> pnv_pci_dma_dev_setup ->
pnv_pci_ioda_dma_dev_setup ->
set_iommu_table_base.


What part of this is missing?


> 
>>
>>
>>>      pci_device_add
>>>         pcibios_setup_bus_device
>>>         ...
>>>         set_iommu_table_base_and_group   <- fail, kobj->sd is not initialized
>>>      device_add
>>>         ...
>>>         tce_iommu_bus_notifier           <- succeed
>>>
>>> And if an EEH error happens just on a VF, I believe the same call flow should
>>> go for recovery. (This is not set down, still under discussion with Gavin.)
>>>
>>> My conclusion is:
>>> 1. remove the tce_iommu_bus_notifier completely will make the VF not work.
>>>    So I choose to revert the code and attach make the iommu group attachment
>>>    just happens in one place.
>>>
>>> BTW, I know my patch is not a perfect one. For a PF, the tbl will still be set
>>> twice. I am not sure why we need to invoke pcibios_setup_device() twice on a
>>> PF, maybe some platform need to fixup some thing after the pci_bus is added.
>>> So I don't remove one of them to solve the problem.
>>>
>>> If you have a better idea, I am glad to take it.



-- 
Alexey

^ permalink raw reply

* Re: [PATCH 0/3] Add new ptrace request macros on PowerPC
From: Anshuman Khandual @ 2014-04-29 12:22 UTC (permalink / raw)
  To: Michael Neuling; +Cc: Linux PPC dev, linux-kernel, avagin, roland, oleg
In-Reply-To: <CAEjGV6y8MRiCfRBtWjNfrwKMccROEAJw2rPVc_u+r=tAo2EcHQ@mail.gmail.com>

On 04/29/2014 01:52 PM, Michael Neuling wrote:
> That's not what that patch does. It shouldn't make any user visible changes
> to DSCR or PPR.

It may not when it runs uninterrupted but after the tracee process has stopped,
thread.dscr reflects the default DSCR value as mentioned before. This can be
proved by changing the "dscr_default" value in arch/powerpc/sysfs.c file.

> 
> Over syscall PPR and DSCR may change. Depending on your test case, that may
> be your problem.

I would guess when the tracee process stops for ptrace analysis, tm_reclaim or
tm_recheckpoint path might be crossed which is causing this dscr_default value
to go into thread_struct.

^ permalink raw reply

* Re: [PATCH V3 2/2] powerpc/pseries: init fault_around_order for pseries
From: Madhavan Srinivasan @ 2014-04-29 10:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-arch, riel, rusty, dave.hansen, peterz, x86, linux-kernel,
	linux-mm, ak, paulus, mgorman, Linus Torvalds, akpm, linuxppc-dev,
	kirill.shutemov
In-Reply-To: <20140429070632.GB27951@gmail.com>

On Tuesday 29 April 2014 12:36 PM, Ingo Molnar wrote:
> 
> * Madhavan Srinivasan <maddy@linux.vnet.ibm.com> wrote:
> 
>> Performance data for different FAULT_AROUND_ORDER values from 4 socket
>> Power7 system (128 Threads and 128GB memory). perf stat with repeat of 5
>> is used to get the stddev values. Test ran in v3.14 kernel (Baseline) and
>> v3.15-rc1 for different fault around order values.
>>
>> FAULT_AROUND_ORDER      Baseline        1               3               4               5               8
>>
>> Linux build (make -j64)
>> minor-faults            47,437,359      35,279,286      25,425,347      23,461,275      22,002,189      21,435,836
>> times in seconds        347.302528420   344.061588460   340.974022391   348.193508116   348.673900158   350.986543618
>>  stddev for time        ( +-  1.50% )   ( +-  0.73% )   ( +-  1.13% )   ( +-  1.01% )   ( +-  1.89% )   ( +-  1.55% )
>>  %chg time to baseline                  -0.9%           -1.8%           0.2%            0.39%           1.06%
> 
> Probably too noisy.

Ok. I should have added the formula used for %change to clarify the data
presented. My bad.

Just to clarify, %change here is calculated based on this formula.

((new value - baseline)/baseline)

And in this case, negative %change says it a drop in time and
positive value has increase the time when compared to baseline.

With regards
Maddy

> 
>> Linux rebuild (make -j64)
>> minor-faults            941,552         718,319         486,625         440,124         410,510         397,416
>> times in seconds        30.569834718    31.219637539    31.319370649    31.434285472    31.972367174    31.443043580
>>  stddev for time        ( +-  1.07% )   ( +-  0.13% )   ( +-  0.43% )   ( +-  0.18% )   ( +-  0.95% )   ( +-  0.58% )
>>  %chg time to baseline                  2.1%            2.4%            2.8%            4.58%           2.85%
> 
> Here it looks like a speedup. Optimal value: 5+.
> 
>> Binutils build (make all -j64 )
>> minor-faults            474,821         371,380         269,463         247,715         235,255         228,337
>> times in seconds        53.882492432    53.584289348    53.882773216    53.755816431    53.607824348    53.423759642
>>  stddev for time        ( +-  0.08% )   ( +-  0.56% )   ( +-  0.17% )   ( +-  0.11% )   ( +-  0.60% )   ( +-  0.69% )
>>  %chg time to baseline                  -0.55%          0.0%            -0.23%          -0.51%          -0.85%
> 
> Probably too noisy, but looks like a potential slowdown?
> 
>> Two synthetic tests: access every word in file in sequential/random order.
>>
>> Sequential access 16GiB file
>> FAULT_AROUND_ORDER      Baseline        1               3               4               5               8
>> 1 thread
>>        minor-faults     263,148         131,166         32,908          16,514          8,260           1,093
>>        times in seconds 53.091138345    53.113191672    53.188776177    53.233017218    53.206841347    53.429979442
>>        stddev for time  ( +-  0.06% )   ( +-  0.07% )   ( +-  0.08% )   ( +-  0.09% )   ( +-  0.03% )   ( +-  0.03% )
>>        %chg time to baseline            0.04%           0.18%           0.26%           0.21%           0.63%
> 
> Speedup, optimal value: 8+.
> 
>> 8 threads
>>        minor-faults     2,097,267       1,048,753       262,237         131,397         65,621          8,274
>>        times in seconds 55.173790028    54.591880790    54.824623287    54.802162211    54.969680503    54.790387715
>>        stddev for time  ( +-  0.78% )   ( +-  0.09% )   ( +-  0.08% )   ( +-  0.07% )   ( +-  0.28% )   ( +-  0.05% )
>>        %chg time to baseline            -1.05%          -0.63%          -0.67%          -0.36%          -0.69%
> 
> Looks like a regression?
> 
>> 32 threads
>>        minor-faults     8,388,751       4,195,621       1,049,664       525,461         262,535         32,924
>>        times in seconds 60.431573046    60.669110744    60.485336388    60.697789706    60.077959564    60.588855032
>>        stddev for time  ( +-  0.44% )   ( +-  0.27% )   ( +-  0.46% )   ( +-  0.67% )   ( +-  0.31% )   ( +-  0.49% )
>>        %chg time to baseline            0.39%           0.08%           0.44%           -0.58%          0.25%
> 
> Probably too noisy.
> 
>> 64 threads
>>        minor-faults     16,777,409      8,607,527       2,289,766       1,202,264       598,405         67,587
>>        times in seconds 96.932617720    100.675418760   102.109880836   103.881733383   102.580199555   105.751194041
>>        stddev for time  ( +-  1.39% )   ( +-  1.06% )   ( +-  0.99% )   ( +-  0.76% )   ( +-  1.65% )   ( +-  1.60% )
>>        %chg time to baseline            3.86%           5.34%           7.16%           5.82%           9.09%
> 
> Speedup, optimal value: 4+
> 
>> 128 threads
>>        minor-faults     33,554,705      17,375,375      4,682,462       2,337,245       1,179,007       134,819
>>        times in seconds 128.766704495   115.659225437   120.353046307   115.291871270   115.450886036   113.991902150
>>        stddev for time  ( +-  2.93% )   ( +-  0.30% )   ( +-  2.93% )   ( +-  1.24% )   ( +-  1.03% )   ( +-  0.70% )
>>        %chg time to baseline            -10.17%         -6.53%          -10.46%         -10.34%         -11.47%
> 
> Rather significant regression at order 1 already.
> 
>> Random access 1GiB file
>> FAULT_AROUND_ORDER      Baseline        1               3               4               5               8
>> 1 thread
>>        minor-faults     17,155          8,678           2,126           1,097           581             134
>>        times in seconds 51.904430523    51.658017987    51.919270792    51.560531738    52.354431597    51.976469502
>>        stddev for time  ( +-  3.19% )   ( +-  1.35% )   ( +-  1.56% )   ( +-  0.91% )   ( +-  1.70% )   ( +-  2.02% )
>>        %chg time to baseline            -0.47%          0.02%           -0.66%          0.86%           0.13%
> 
> Probably too noisy.
> 
>> 8 threads
>>        minor-faults     131,844         70,705          17,457          8,505           4,251           598
>>        times in seconds 58.162813956    54.991706305    54.952675791    55.323057492    54.755587379    53.376722828
>>        stddev for time  ( +-  1.44% )   ( +-  0.69% )   ( +-  1.23% )   ( +-  2.78% )   ( +-  1.90% )   ( +-  2.91% )
>>        %chg time to baseline            -5.45%          -5.52%          -4.88%          -5.86%          -8.22%
> 
> Regression.
> 
>> 32 threads
>>        minor-faults     524,437         270,760         67,069          33,414          16,641          2,204
>>        times in seconds 69.981777072    76.539570015    79.753578505    76.245943618    77.254258344    79.072596831
>>        stddev for time  ( +-  2.81% )   ( +-  1.95% )   ( +-  2.66% )   ( +-  0.99% )   ( +-  2.35% )   ( +-  3.22% )
>>        %chg time to baseline            9.37%           13.96%          8.95%           10.39%          12.98%
> 
> Speedup, optimal value hard to tell due to noise - 3+ or 8+.
> 
>> 64 threads
>>        minor-faults     1,049,117       527,451         134,016         66,638          33,391          4,559
>>        times in seconds 108.024517536   117.575067996   115.322659914   111.943998437   115.049450815   119.218450840
>>        stddev for time  ( +-  2.40% )   ( +-  1.77% )   ( +-  1.19% )   ( +-  3.29% )   ( +-  2.32% )   ( +-  1.42% )
>>        %chg time to baseline            8.84%           6.75%           3.62%           6.5%            10.3%
> 
> Speedup, optimal value again hard to tell due to noise.
> 
>> 128 threads
>>        minor-faults     2,097,440       1,054,360       267,042         133,328         66,532          8,652
>>        times in seconds 155.055861167   153.059625968   152.449492156   151.024005282   150.844647770   155.954366718
>>        stddev for time  ( +-  1.32% )   ( +-  1.14% )   ( +-  1.32% )   ( +-  0.81% )   ( +-  0.75% )   ( +-  0.72% )
>>        %chg time to baseline            -1.28%          -1.68%          -2.59%          -2.71%          0.57%
> 
> Slowdown for most orders.
> 
>> Incase of Kernel compilation, fault around order (fao) of 1 and 3 
>> provides fast compilation time when compared to a value of 4. On 
>> closer look, fao of 3 has higher agains. Incase of Sequential access 
>> synthetic tests fao of 1 has higher gains and in Random access test, 
>> fao of 3 has marginal gains. Going by compilation time, fao value of 
>> 3 is suggested in this patch for pseries platform.
> 
> So I'm really at loss to understand where you get the optimal value of 
> '3' from. The data does not seem to match your claim that '1 and 3 
> provides fast compilation time when compared to a value of 4':
> 



>> FAULT_AROUND_ORDER      Baseline        1               3               4               5               8
>>
>> Linux rebuild (make -j64)
>> minor-faults            941,552         718,319         486,625         440,124         410,510         397,416
>> times in seconds        30.569834718    31.219637539    31.319370649    31.434285472    31.972367174    31.443043580
>>  stddev for time        ( +-  1.07% )   ( +-  0.13% )   ( +-  0.43% )   ( +-  0.18% )   ( +-  0.95% )   ( +-  0.58% )
>>  %chg time to baseline                  2.1%            2.4%            2.8%            4.58%           2.85%
> 
> 5 and 8, and probably 6, 7 are better than 4.
> 
> 3 is probably _slower_ than the current default - but it's hard to 
> tell due to inherent noise.
> 
> But the other two build tests were too noisy, and if then they showed 
> a slowdown.
> 
>> Worst case scenario: we touch one page every 16M to demonstrate overhead.
>>
>> Touch only one page in page table in 16GiB file
>> FAULT_AROUND_ORDER      Baseline        1               3               4               5               8
>> 1 thread
>>        minor-faults     1,104           1,090           1,071           1,068           1,065           1,063
>>        times in seconds 0.006583298     0.008531502     0.019733795     0.036033763     0.062300553     0.406857086
>>        stddev for time  ( +-  2.79% )   ( +-  2.42% )   ( +-  3.47% )   ( +-  2.81% )   ( +-  2.01% )   ( +-  1.33% )
>> 8 threads
>>        minor-faults     8,279           8,264           8,245           8,243           8,239           8,240
>>        times in seconds 0.044572398     0.057211811     0.107606306     0.205626815     0.381679120     2.647979955
>>        stddev for time  ( +-  1.95% )   ( +-  2.98% )   ( +-  1.74% )   ( +-  2.80% )   ( +-  2.01% )   ( +-  1.86% )
>> 32 threads
>>        minor-faults     32,879          32,864          32,849          32,845          32,839          32,843
>>        times in seconds 0.197659343     0.218486087     0.445116407     0.694235883     1.296894038     9.127517045
>>        stddev for time  ( +-  3.05% )   ( +-  3.05% )   ( +-  4.33% )   ( +-  3.08% )   ( +-  3.75% )   ( +-  0.56% )
>> 64 threads
>>        minor-faults     65,680          65,664          65,646          65,645          65,640          65,647
>>        times in seconds 0.455537304     0.489688780     0.866490093     1.427393118     2.379628982     17.059295051
>>        stddev for time  ( +-  4.01% )   ( +-  4.13% )   ( +-  2.92% )   ( +-  1.68% )   ( +-  1.79% )   ( +-  0.48% )
>> 128 threads
>>        minor-faults     131,279         131,265         131,250         131,245         131,241         131,254
>>        times in seconds 1.026880651     1.095327536     1.721728274     2.808233068     4.662729948     31.732848290
>>        stddev for time  ( +-  6.85% )   ( +-  4.09% )   ( +-  1.71% )   ( +-  3.45% )   ( +-  2.40% )   ( +-  0.68% )
> 
> There's no '%change' values shown, but the slowdown looks significant, 
> it's the worst case: for example with 1 thread order 3 looks about 
> 300% slower (3x slowdown) compared to order 0.
> 
> All in one, looking at your latest data I don't think the conclusion 
> from your first version of this optimization patch from a month ago is 
> true anymore:
> 
>> +	/* Measured on a 4 socket Power7 system (128 Threads and 128GB memory) */
>> +	fault_around_order = 3;
> 
> As the data is rather conflicting and inconclusive, and if it shows a 
> sweet spot it's not at order 3. New data should in general trigger 
> reanalysis of your first optimization value.
>
> I'm starting to suspect that maybe workloads ought to be given a 
> choice in this matter, via madvise() or such.
> 
> Thanks,
> 
> 	Ingo
> 

^ permalink raw reply

* Re: [git pull] Please pull abiv2 branch
From: Philippe Bergheaud @ 2014-04-29  9:38 UTC (permalink / raw)
  To: Rusty Russell
  Cc: philippe.bergheaud, mikey, amodra, rostedt, ulrich.weigand, mjw,
	Anton Blanchard, paulus, linuxppc-dev
In-Reply-To: <877g69t1my.fsf@rustcorp.com.au>

Rusty Russell wrote:
> Philippe Bergheaud <felix@linux.vnet.ibm.com> writes:
> 
>>Anton Blanchard wrote:
>>
>>>Here are the ABIv2 patches rebased against 3.15-rc2.
>>
>>After recompiling 3.15-rc2 with the ABIv2 patches,
>>I see the following line in Modules.symvers:
>>
>>     0x00000000 TOC. vmlinux EXPORT_SYMBOL
>>
>>Kernel will not load modules because TOC. has no CRC.
>>Is this expected ? Shouldn't TOC. have a CRC ?
> 
> 
> What happens when you try to load a module?  It should work...

My mistake, sorry: kernel 3.15-rc2 crashes at boot, in the SLES12
Beta5 environment that I am using, before any module load attempt.

The problem happens with the SLES12 kernel of the day 3.12.17,
plus the backported ABIv2 patch set.  Boot fails with:
     kernel: ibmveth: no symbol version for TOC.
     kernel: ibmveth: Unknown symbol TOC. (err -22)
     kernel: scsi_mod: no symbol version for TOC.
     kernel: scsi_mod: Unknown symbol TOC. (err -22)

In the rescue shell, repeating a plain modprobe fails again:
     :/# modprobe scsi_mod
     modprobe: ERROR: could not insert 'scsi_mod': Invalid argument

And finally, modprobe succeeds with --force:
     :/# modprobe --force scsi_mod
     scsi_mod: module has bad taint, not creating trace events

Philippe

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox