From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from gate.crashing.org (gate.crashing.org [63.228.1.57]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (Client did not present a certificate) by ozlabs.org (Postfix) with ESMTPS id 82B87B70B8 for ; Fri, 13 Aug 2010 15:27:02 +1000 (EST) Subject: Re: [PATCH] powerpc: Add support for popcnt instructions From: Benjamin Herrenschmidt To: Anton Blanchard In-Reply-To: <20100813022809.GY29316@kryten> References: <20100813022809.GY29316@kryten> Content-Type: text/plain; charset="UTF-8" Date: Fri, 13 Aug 2010 14:29:04 +1000 Message-ID: <1281673744.2987.362.camel@pasglop> Mime-Version: 1.0 Cc: linuxppc-dev@ozlabs.org List-Id: Linux on PowerPC Developers Mail List List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , On Fri, 2010-08-13 at 12:28 +1000, Anton Blanchard wrote: > POWER5 added popcntb, and POWER7 added popcntw and popcntd. As a first step > this patch does all the work out of line, but it would be nice to implement > them as inlines with an out of line fallback. > > The performance issue with hweight was noticed when disabling SMT on a large > (192 thread) POWER7 box. The patch improves that testcase by about 8%. Especially from modules it will suck big time. If kept out of line they should probably be linked-in with each module, but I'd rather have them inlined. Cheers, Ben. > Signed-off-by: Anton Blanchard > --- > > Index: powerpc.git/arch/powerpc/include/asm/cputable.h > =================================================================== > --- powerpc.git.orig/arch/powerpc/include/asm/cputable.h 2010-08-13 11:19:42.691991439 +1000 > +++ powerpc.git/arch/powerpc/include/asm/cputable.h 2010-08-13 11:24:55.510741618 +1000 > @@ -199,6 +199,8 @@ extern const char *powerpc_base_platform > #define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000) > #define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000) > #define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0200000000000000) > +#define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0400000000000000) > +#define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0800000000000000) > > #ifndef __ASSEMBLY__ > > @@ -403,21 +405,22 @@ extern const char *powerpc_base_platform > CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ > CPU_FTR_MMCRA | CPU_FTR_SMT | \ > CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ > - CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS) > + CPU_FTR_PURR | CPU_FTR_STCX_CHECKS_ADDRESS | \ > + CPU_FTR_POPCNTB) > #define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ > CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ > CPU_FTR_MMCRA | CPU_FTR_SMT | \ > CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ > CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ > CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ > - CPU_FTR_STCX_CHECKS_ADDRESS) > + CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB) > #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ > CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ > CPU_FTR_MMCRA | CPU_FTR_SMT | \ > CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ > CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ > CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ > - CPU_FTR_STCX_CHECKS_ADDRESS) > + CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD) > #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ > CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ > CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ > Index: powerpc.git/arch/powerpc/lib/Makefile > =================================================================== > --- powerpc.git.orig/arch/powerpc/lib/Makefile 2010-08-13 11:19:43.653241065 +1000 > +++ powerpc.git/arch/powerpc/lib/Makefile 2010-08-13 11:19:45.930743841 +1000 > @@ -18,7 +18,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o > > obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ > memcpy_64.o usercopy_64.o mem_64.o string.o \ > - checksum_wrappers_64.o > + checksum_wrappers_64.o hweight_64.o > obj-$(CONFIG_XMON) += sstep.o ldstfp.o > obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o > obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o > Index: powerpc.git/arch/powerpc/lib/hweight_64.S > =================================================================== > --- /dev/null 1970-01-01 00:00:00.000000000 +0000 > +++ powerpc.git/arch/powerpc/lib/hweight_64.S 2010-08-13 11:19:45.940741462 +1000 > @@ -0,0 +1,110 @@ > +/* > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. > + * > + * Copyright (C) IBM Corporation, 2010 > + * > + * Author: Anton Blanchard > + */ > +#include > +#include > + > +/* Note: This code relies on -mminimal-toc */ > + > +_GLOBAL(__arch_hweight8) > +BEGIN_FTR_SECTION > + b .__sw_hweight8 > + nop > + nop > +FTR_SECTION_ELSE > + popcntb r3,r3 > + clrldi r3,r3,64-8 > + blr > +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) > + > +_GLOBAL(__arch_hweight16) > +BEGIN_FTR_SECTION > + b .__sw_hweight16 > + nop > + nop > + nop > + nop > +FTR_SECTION_ELSE > + BEGIN_FTR_SECTION_NESTED(50) > + popcntb r3,r3 > + srdi r4,r3,8 > + add r3,r4,r3 > + clrldi r3,r3,64-8 > + blr > + FTR_SECTION_ELSE_NESTED(50) > + clrlwi r3,r3,16 > + popcntw r3,r3 > + clrldi r3,r3,64-8 > + blr > + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) > +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) > + > +_GLOBAL(__arch_hweight32) > +BEGIN_FTR_SECTION > + b .__sw_hweight32 > + nop > + nop > + nop > + nop > + nop > + nop > +FTR_SECTION_ELSE > + BEGIN_FTR_SECTION_NESTED(51) > + popcntb r3,r3 > + srdi r4,r3,16 > + add r3,r4,r3 > + srdi r4,r3,8 > + add r3,r4,r3 > + clrldi r3,r3,64-8 > + blr > + FTR_SECTION_ELSE_NESTED(51) > + popcntw r3,r3 > + clrldi r3,r3,64-8 > + blr > + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) > +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) > + > +_GLOBAL(__arch_hweight64) > +BEGIN_FTR_SECTION > + b .__sw_hweight64 > + nop > + nop > + nop > + nop > + nop > + nop > + nop > + nop > +FTR_SECTION_ELSE > + BEGIN_FTR_SECTION_NESTED(52) > + popcntb r3,r3 > + srdi r4,r3,32 > + add r3,r4,r3 > + srdi r4,r3,16 > + add r3,r4,r3 > + srdi r4,r3,8 > + add r3,r4,r3 > + clrldi r3,r3,64-8 > + blr > + FTR_SECTION_ELSE_NESTED(52) > + popcntd r3,r3 > + clrldi r3,r3,64-8 > + blr > + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) > +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) > Index: powerpc.git/arch/powerpc/include/asm/bitops.h > =================================================================== > --- powerpc.git.orig/arch/powerpc/include/asm/bitops.h 2010-08-13 11:06:20.991992998 +1000 > +++ powerpc.git/arch/powerpc/include/asm/bitops.h 2010-08-13 11:19:45.940741462 +1000 > @@ -267,7 +267,16 @@ static __inline__ int fls64(__u64 x) > #include > #endif /* __powerpc64__ */ > > +#ifdef CONFIG_PPC64 > +unsigned int __arch_hweight8(unsigned int w); > +unsigned int __arch_hweight16(unsigned int w); > +unsigned int __arch_hweight32(unsigned int w); > +unsigned long __arch_hweight64(__u64 w); > +#include > +#else > #include > +#endif > + > #include > > /* Little-endian versions */ > Index: powerpc.git/arch/powerpc/kernel/ppc_ksyms.c > =================================================================== > --- powerpc.git.orig/arch/powerpc/kernel/ppc_ksyms.c 2010-08-13 11:06:21.011991745 +1000 > +++ powerpc.git/arch/powerpc/kernel/ppc_ksyms.c 2010-08-13 11:19:45.940741462 +1000 > @@ -186,3 +186,10 @@ EXPORT_SYMBOL(__mtdcr); > EXPORT_SYMBOL(__mfdcr); > #endif > EXPORT_SYMBOL(empty_zero_page); > + > +#ifdef CONFIG_PPC64 > +EXPORT_SYMBOL(__arch_hweight8); > +EXPORT_SYMBOL(__arch_hweight16); > +EXPORT_SYMBOL(__arch_hweight32); > +EXPORT_SYMBOL(__arch_hweight64); > +#endif