LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 2/4] powerpc/pseries: Define MCE error event section.
From: Mahesh J Salgaonkar @ 2018-06-06  4:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Aneesh Kumar K.V, Michael Ellerman, Laurent Dufour
In-Reply-To: <152825972491.24560.4626614298142965406.stgit@jupiter.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

On pseries, the machine check error details are part of RTAS extended
event log passed under Machine check exception section. This patch adds
the definition of rtas MCE event section and related helper
functions.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/rtas.h |  104 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index ec9dd79398ee..3f2fba7ef23b 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -275,6 +275,7 @@ inline uint32_t rtas_ext_event_company_id(struct rtas_ext_event_log_v6 *ext_log)
 #define PSERIES_ELOG_SECT_ID_CALL_HOME		(('C' << 8) | 'H')
 #define PSERIES_ELOG_SECT_ID_USER_DEF		(('U' << 8) | 'D')
 #define PSERIES_ELOG_SECT_ID_HOTPLUG		(('H' << 8) | 'P')
+#define PSERIES_ELOG_SECT_ID_MCE		(('M' << 8) | 'C')
 
 /* Vendor specific Platform Event Log Format, Version 6, section header */
 struct pseries_errorlog {
@@ -326,6 +327,109 @@ struct pseries_hp_errorlog {
 #define PSERIES_HP_ELOG_ID_DRC_COUNT	3
 #define PSERIES_HP_ELOG_ID_DRC_IC	4
 
+/* RTAS pseries MCE errorlog section */
+#pragma pack(push, 1)
+struct pseries_mc_errorlog {
+	__be32	fru_id;
+	__be32	proc_id;
+	uint8_t	error_type;
+	union {
+		struct {
+			uint8_t	ue_err_type;
+			/* XXXXXXXX
+			 * X		1: Permanent or Transient UE.
+			 *  X		1: Effective address provided.
+			 *   X		1: Logical address provided.
+			 *    XX	2: Reserved.
+			 *      XXX	3: Type of UE error.
+			 */
+			uint8_t	reserved_1[6];
+			__be64	effective_address;
+			__be64	logical_address;
+		} ue_error;
+		struct {
+			uint8_t	soft_err_type;
+			/* XXXXXXXX
+			 * X		1: Effective address provided.
+			 *  XXXXX	5: Reserved.
+			 *       XX	2: Type of SLB/ERAT/TLB error.
+			 */
+			uint8_t	reserved_1[6];
+			__be64	effective_address;
+			uint8_t	reserved_2[8];
+		} soft_error;
+	} u;
+};
+#pragma pack(pop)
+
+/* RTAS pseries MCE error types */
+#define PSERIES_MC_ERROR_TYPE_UE		0x00
+#define PSERIES_MC_ERROR_TYPE_SLB		0x01
+#define PSERIES_MC_ERROR_TYPE_ERAT		0x02
+#define PSERIES_MC_ERROR_TYPE_TLB		0x04
+#define PSERIES_MC_ERROR_TYPE_D_CACHE		0x05
+#define PSERIES_MC_ERROR_TYPE_I_CACHE		0x07
+
+/* RTAS pseries MCE error sub types */
+#define PSERIES_MC_ERROR_UE_INDETERMINATE		0
+#define PSERIES_MC_ERROR_UE_IFETCH			1
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_IFETCH	2
+#define PSERIES_MC_ERROR_UE_LOAD_STORE			3
+#define PSERIES_MC_ERROR_UE_PAGE_TABLE_WALK_LOAD_STORE	4
+
+#define PSERIES_MC_ERROR_SLB_PARITY		0
+#define PSERIES_MC_ERROR_SLB_MULTIHIT		1
+#define PSERIES_MC_ERROR_SLB_INDETERMINATE	2
+
+#define PSERIES_MC_ERROR_ERAT_PARITY		1
+#define PSERIES_MC_ERROR_ERAT_MULTIHIT		2
+#define PSERIES_MC_ERROR_ERAT_INDETERMINATE	3
+
+#define PSERIES_MC_ERROR_TLB_PARITY		1
+#define PSERIES_MC_ERROR_TLB_MULTIHIT		2
+#define PSERIES_MC_ERROR_TLB_INDETERMINATE	3
+
+static inline uint8_t rtas_mc_error_type(const struct pseries_mc_errorlog *mlog)
+{
+	return mlog->error_type;
+}
+
+static inline uint8_t rtas_mc_error_sub_type(
+					const struct pseries_mc_errorlog *mlog)
+{
+	switch (mlog->error_type) {
+	case	PSERIES_MC_ERROR_TYPE_UE:
+		return (mlog->u.ue_error.ue_err_type & 0x07);
+	case	PSERIES_MC_ERROR_TYPE_SLB:
+	case	PSERIES_MC_ERROR_TYPE_ERAT:
+	case	PSERIES_MC_ERROR_TYPE_TLB:
+		return (mlog->u.soft_error.soft_err_type & 0x03);
+	default:
+		return 0;
+	}
+}
+
+static inline uint64_t rtas_mc_get_effective_addr(
+					const struct pseries_mc_errorlog *mlog)
+{
+	uint64_t addr = 0;
+
+	switch (mlog->error_type) {
+	case	PSERIES_MC_ERROR_TYPE_UE:
+		if (mlog->u.ue_error.ue_err_type & 0x40)
+			addr = mlog->u.ue_error.effective_address;
+		break;
+	case	PSERIES_MC_ERROR_TYPE_SLB:
+	case	PSERIES_MC_ERROR_TYPE_ERAT:
+	case	PSERIES_MC_ERROR_TYPE_TLB:
+		if (mlog->u.soft_error.soft_err_type & 0x80)
+			addr = mlog->u.soft_error.effective_address;
+	default:
+		break;
+	}
+	return be64_to_cpu(addr);
+}
+
 struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
 					      uint16_t section_id);
 

^ permalink raw reply related

* [RFC PATCH 3/4] powerpc/pseries: Dump and flush SLB contents on SLB MCE errors.
From: Mahesh J Salgaonkar @ 2018-06-06  4:37 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Aneesh Kumar K.V, Michael Ellerman, Aneesh Kumar K.V,
	Michael Ellerman, Laurent Dufour
In-Reply-To: <152825972491.24560.4626614298142965406.stgit@jupiter.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

If we get a machine check exceptions due to SLB errors then dump the
current SLB contents which will be very much helpful in debugging the
root cause of SLB errors. On pseries, as of today system crashes on SLB
errors. These are soft errors and can be fixed by flushing the SLBs so
the kernel can continue to function instead of system crash. This patch
fixes that also.

With this patch the console will log SLB contents like below on SLB MCE
errors:

[  822.711728] slb contents:
[  822.711730] 00 c000000008000000 400ea1b217000500
[  822.711731]   1T  ESID=   c00000  VSID=      ea1b217 LLP:100
[  822.711732] 01 d000000008000000 400d43642f000510
[  822.711733]   1T  ESID=   d00000  VSID=      d43642f LLP:110
[  822.711734] 09 f000000008000000 400a86c85f000500
[  822.711736]   1T  ESID=   f00000  VSID=      a86c85f LLP:100
[  822.711737] 10 00007f0008000000 400d1f26e3000d90
[  822.711738]   1T  ESID=       7f  VSID=      d1f26e3 LLP:110
[  822.711739] 11 0000000018000000 000e3615f520fd90
[  822.711740]  256M ESID=        1  VSID=   e3615f520f LLP:110
[  822.711740] 12 d000000008000000 400d43642f000510
[  822.711741]   1T  ESID=   d00000  VSID=      d43642f LLP:110
[  822.711742] 13 d000000008000000 400d43642f000510
[  822.711743]   1T  ESID=   d00000  VSID=      d43642f LLP:110


Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Suggested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |    1 +
 arch/powerpc/mm/slb.c                         |   35 +++++++++++++++++++++++++
 arch/powerpc/platforms/pseries/ras.c          |   29 ++++++++++++++++++++-
 3 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 50ed64fba4ae..c0da68927235 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -487,6 +487,7 @@ extern void hpte_init_native(void);
 
 extern void slb_initialize(void);
 extern void slb_flush_and_rebolt(void);
+extern void slb_dump_contents(void);
 
 extern void slb_vmalloc_update(void);
 extern void slb_set_size(u16 size);
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 66577cc66dc9..799aa117cec3 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -145,6 +145,41 @@ void slb_flush_and_rebolt(void)
 	get_paca()->slb_cache_ptr = 0;
 }
 
+void slb_dump_contents(void)
+{
+	int i;
+	unsigned long e, v;
+	unsigned long llp;
+
+	pr_err("slb contents:\n");
+	for (i = 0; i < mmu_slb_size; i++) {
+		asm volatile("slbmfee  %0,%1" : "=r" (e) : "r" (i));
+		asm volatile("slbmfev  %0,%1" : "=r" (v) : "r" (i));
+
+		if (!e && !v)
+			continue;
+
+		pr_err("%02d %016lx %016lx", i, e, v);
+
+		if (!(e & SLB_ESID_V)) {
+			pr_err("\n");
+			continue;
+		}
+		llp = v & SLB_VSID_LLP;
+		if (v & SLB_VSID_B_1T) {
+			pr_err("  1T  ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+				GET_ESID_1T(e),
+				(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+				llp);
+		} else {
+			pr_err(" 256M ESID=%9lx  VSID=%13lx LLP:%3lx\n",
+				GET_ESID(e),
+				(v & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+				llp);
+		}
+	}
+}
+
 void slb_vmalloc_update(void)
 {
 	unsigned long vflags;
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 5e1ef9150182..7470a216cd6b 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -422,6 +422,31 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
+static int mce_handle_error(struct rtas_error_log *errp)
+{
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
+	uint8_t error_type;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = rtas_mc_error_type(mce_log);
+
+	if ((disposition == RTAS_DISP_NOT_RECOVERED) &&
+			(error_type == PSERIES_MC_ERROR_TYPE_SLB)) {
+		slb_dump_contents();
+		slb_flush_and_rebolt();
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+	}
+
+out:
+	return disposition;
+}
+
 /*
  * See if we can recover from a machine check exception.
  * This is only called on power4 (or above) and only via
@@ -434,7 +459,9 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 {
 	int recovered = 0;
-	int disposition = rtas_error_disposition(err);
+	int disposition;
+
+	disposition = mce_handle_error(err);
 
 	if (!(regs->msr & MSR_RI)) {
 		/* If MSR_RI isn't set, we cannot recover */

^ permalink raw reply related

* [RFC PATCH 4/4] powerpc/pseries: Display machine check error details.
From: Mahesh J Salgaonkar @ 2018-06-06  4:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Aneesh Kumar K.V, Michael Ellerman, Laurent Dufour
In-Reply-To: <152825972491.24560.4626614298142965406.stgit@jupiter.in.ibm.com>

From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>

Extract the MCE eror details from RTAS extended log and display it to
console.

With this patch you should now see mce logs like below:

[  822.711745] Severe Machine check interrupt [Recovered]
[  822.711746]   Initiator: CPU
[  822.711747]   Error type: SLB [Multihit]
[  822.711747]     Effective address: d00000000c660000

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/rtas.h      |    5 +
 arch/powerpc/platforms/pseries/ras.c |  116 ++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+)

diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 3f2fba7ef23b..8100a95c133a 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -190,6 +190,11 @@ static inline uint8_t rtas_error_extended(const struct rtas_error_log *elog)
 	return (elog->byte1 & 0x04) >> 2;
 }
 
+static inline uint8_t rtas_error_initiator(const struct rtas_error_log *elog)
+{
+	return (elog->byte2 & 0xf0) >> 4;
+}
+
 #define rtas_error_type(x)	((x)->byte3)
 
 static inline
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 7470a216cd6b..09a172bb6fdb 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -422,6 +422,121 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
+#define VAL_TO_STRING(ar, val)	((val < ARRAY_SIZE(ar)) ? ar[val] : "Unknown")
+
+static void pseries_print_mce_info(struct rtas_error_log *errp, int disposition)
+{
+	const char *level, *sevstr;
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	uint8_t error_type, err_sub_type;
+	uint8_t initiator = rtas_error_initiator(errp);
+	uint64_t addr;
+
+	static const char * const initiators[] = {
+		"Unknown",
+		"CPU",
+		"PCI",
+		"ISA",
+		"Memory",
+		"Power Mgmt",
+	};
+	static const char * const mc_err_types[] = {
+		"UE",
+		"SLB",
+		"ERAT",
+		"TLB",
+		"D-Cache",
+		"Unknown",
+		"I-Cache",
+	};
+	static const char * const mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+
+	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
+	static const char * const mc_slb_types[] = {
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
+	static const char * const mc_soft_types[] = {
+		"Unknown",
+		"Parity",
+		"Multihit",
+		"Indeterminate",
+	};
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (pseries_log == NULL)
+		return;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+
+	error_type = rtas_mc_error_type(mce_log);
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	switch (rtas_error_severity(errp)) {
+	case RTAS_SEVERITY_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case RTAS_SEVERITY_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case RTAS_SEVERITY_ERROR:
+	case RTAS_SEVERITY_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case RTAS_SEVERITY_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+		disposition == RTAS_DISP_FULLY_RECOVERED ?
+		"Recovered" : "Not recovered");
+	printk("%s  Initiator: %s\n", level,
+				VAL_TO_STRING(initiators, initiator));
+
+	switch (error_type) {
+	case PSERIES_MC_ERROR_TYPE_UE:
+		printk("%s  Error type: %s [%s]\n", level,
+			VAL_TO_STRING(mc_err_types, error_type),
+			VAL_TO_STRING(mc_ue_types, err_sub_type));
+		break;
+	case PSERIES_MC_ERROR_TYPE_SLB:
+		printk("%s  Error type: %s [%s]\n", level,
+			VAL_TO_STRING(mc_err_types, error_type),
+			VAL_TO_STRING(mc_slb_types, err_sub_type));
+		break;
+	case PSERIES_MC_ERROR_TYPE_ERAT:
+	case PSERIES_MC_ERROR_TYPE_TLB:
+		printk("%s  Error type: %s [%s]\n", level,
+			VAL_TO_STRING(mc_err_types, error_type),
+			VAL_TO_STRING(mc_soft_types, err_sub_type));
+		break;
+	default:
+		printk("%s  Error type: %s\n", level,
+			VAL_TO_STRING(mc_err_types, error_type));
+		break;
+	}
+
+	addr = rtas_mc_get_effective_addr(mce_log);
+	if (addr)
+		printk("%s    Effective address: %016llx\n", level, addr);
+}
+
 static int mce_handle_error(struct rtas_error_log *errp)
 {
 	struct pseries_errorlog *pseries_log;
@@ -442,6 +557,7 @@ static int mce_handle_error(struct rtas_error_log *errp)
 		slb_flush_and_rebolt();
 		disposition = RTAS_DISP_FULLY_RECOVERED;
 	}
+	pseries_print_mce_info(errp, disposition);
 
 out:
 	return disposition;

^ permalink raw reply related

* Re: [PATCH v7 0/5] powerpc/64: memcmp() optimization
From: Simon Guo @ 2018-06-06  6:21 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev, Paul Mackerras, Naveen N.  Rao, Cyril Bur
In-Reply-To: <877eneasg9.fsf@concordia.ellerman.id.au>

Hi Michael,
On Tue, Jun 05, 2018 at 12:16:22PM +1000, Michael Ellerman wrote:
> Hi Simon,
> 
> wei.guo.simon@gmail.com writes:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> >
> > There is some room to optimize memcmp() in powerpc 64 bits version for
> > following 2 cases:
> > (1) Even src/dst addresses are not aligned with 8 bytes at the beginning,
> > memcmp() can align them and go with .Llong comparision mode without
> > fallback to .Lshort comparision mode do compare buffer byte by byte.
> > (2) VMX instructions can be used to speed up for large size comparision,
> > currently the threshold is set for 4K bytes. Notes the VMX instructions
> > will lead to VMX regs save/load penalty. This patch set includes a
> > patch to add a 32 bytes pre-checking to minimize the penalty.
> >
> > It did the similar with glibc commit dec4a7105e (powerpc: Improve memcmp 
> > performance for POWER8). Thanks Cyril Bur's information.
> > This patch set also updates memcmp selftest case to make it compiled and
> > incorporate large size comparison case.
> 
> I'm seeing a few crashes with this applied, I haven't had time to look
> into what is happening yet, sorry.
> 

The bug is due to memcmp() invokes a C function enter_vmx_ops() who will load 
some PIC value based on r2.

memcmp() doesn't use r2 and if the memcmp() is invoked from kernel
itself, everything is fine. But if memcmp() is invoked from modules[test_user_copy], 
r2 will be required to be setup correctly. Otherwise the enter_vmx_ops() will refer 
to an incorrect/unexisting data location based on wrong r2 value.

Following patch will fix this issue:
------------
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5eba49744a5a..24d093fa89bb 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -102,7 +102,7 @@
  * 2) src/dst has different offset to the 8 bytes boundary. The handlers
  * are named like .Ldiffoffset_xxxx
  */
-_GLOBAL(memcmp)
+_GLOBAL_TOC(memcmp)
        cmpdi   cr1,r5,0

        /* Use the short loop if the src/dst addresses are not
----------

It means the memcmp() fun entry will have additional 2 instructions. Is there
any way to save these 2 instructions when the memcmp() is actually invoked
from kernel itself?

Again thanks for finding this issue.

Thanks,
- Simon

> [ 2471.300595] kselftest: Running tests in user
> [ 2471.302785] calling  test_user_copy_init+0x0/0xd14 [test_user_copy] @ 44883
> [ 2471.302892] Unable to handle kernel paging request for data at address 0xc008000018553005
> [ 2471.303014] Faulting instruction address: 0xc00000000001f29c
> [ 2471.303119] Oops: Kernel access of bad area, sig: 11 [#1]
> [ 2471.303193] LE SMP NR_CPUS=2048 NUMA PowerNV
> [ 2471.303256] Modules linked in: test_user_copy(+) vxlan ip6_udp_tunnel udp_tunnel 8021q bridge stp llc dummy test_printf test_firmware vmx_crypto crct10dif_vpmsum crct10dif_common crc32c_vpmsum veth [last unloaded: test_static_key_base]
> [ 2471.303532] CPU: 4 PID: 44883 Comm: modprobe Tainted: G        W         4.17.0-rc3-gcc7x-g7204012 #1
> [ 2471.303644] NIP:  c00000000001f29c LR: c00000000001f6e4 CTR: 0000000000000000
> [ 2471.303754] REGS: c000001fddc2b560 TRAP: 0300   Tainted: G        W          (4.17.0-rc3-gcc7x-g7204012)
> [ 2471.303873] MSR:  9000000002009033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 24222844  XER: 00000000
> [ 2471.303996] CFAR: c00000000001f6e0 DAR: c008000018553005 DSISR: 40000000 IRQMASK: 0 
> [ 2471.303996] GPR00: c00000000001f6e4 c000001fddc2b7e0 c008000018529900 0000000002000000 
> [ 2471.303996] GPR04: c000001fe4b90020 000000000000ffe0 0000000000000000 03fffffe01b48000 
> [ 2471.303996] GPR08: 0000000080000000 c008000018553005 c000001fddc28000 c008000018520df0 
> [ 2471.303996] GPR12: c00000000009c430 c000001fffffbc00 0000000020000000 0000000000000000 
> [ 2471.303996] GPR16: c000001fddc2bc20 0000000000000030 c0000000001f7ba0 0000000000000001 
> [ 2471.303996] GPR20: 0000000000000000 c000000000c772b0 c0000000010b4018 0000000000000000 
> [ 2471.303996] GPR24: 0000000000000000 c008000018521c98 0000000000000000 c000001fe4b90000 
> [ 2471.303996] GPR28: fffffffffffffff4 0000000002000000 9000000002009033 9000000002009033 
> [ 2471.304930] NIP [c00000000001f29c] msr_check_and_set+0x3c/0xc0
> [ 2471.305008] LR [c00000000001f6e4] enable_kernel_altivec+0x44/0x100
> [ 2471.305084] Call Trace:
> [ 2471.305122] [c000001fddc2b7e0] [c00000000009baa8] __copy_tofrom_user_base+0x9c/0x574 (unreliable)
> [ 2471.305240] [c000001fddc2b860] [c00000000001f6e4] enable_kernel_altivec+0x44/0x100
> [ 2471.305336] [c000001fddc2b890] [c00000000009ce40] enter_vmx_ops+0x50/0x70
> [ 2471.305418] [c000001fddc2b8b0] [c00000000009c768] memcmp+0x338/0x680
> [ 2471.305501] [c000001fddc2b9b0] [c008000018520190] test_user_copy_init+0x188/0xd14 [test_user_copy]
> [ 2471.305617] [c000001fddc2ba60] [c00000000000de20] do_one_initcall+0x90/0x560
> [ 2471.305710] [c000001fddc2bb30] [c000000000200630] do_init_module+0x90/0x260
> [ 2471.305795] [c000001fddc2bbc0] [c0000000001fec88] load_module+0x1a28/0x1ce0
> [ 2471.305875] [c000001fddc2bd70] [c0000000001ff1e8] sys_finit_module+0xc8/0x110
> [ 2471.305983] [c000001fddc2be30] [c00000000000b528] system_call+0x58/0x6c
> [ 2471.306066] Instruction dump:
> [ 2471.306112] fba1ffe8 fbc1fff0 fbe1fff8 f8010010 f821ff81 7c7d1b78 60000000 60000000 
> [ 2471.306216] 7fe000a6 3d220003 39299705 7ffeeb78 <89290000> 2f890000 419e0044 60000000 
> [ 2471.306326] ---[ end trace daf8d409e65b9841 ]---
> 
> And:
> 
> [   19.096709] test_bpf: test_skb_segment: success in skb_segment!
> [   19.096799] initcall test_bpf_init+0x0/0xae0 [test_bpf] returned 0 after 591217 usecs
> [   19.115869] calling  test_user_copy_init+0x0/0xd14 [test_user_copy] @ 3159
> [   19.116165] Unable to handle kernel paging request for data at address 0xd000000003852805
> [   19.116352] Faulting instruction address: 0xc00000000001f44c
> [   19.116483] Oops: Kernel access of bad area, sig: 11 [#1]
> [   19.116583] LE SMP NR_CPUS=2048 NUMA pSeries
> [   19.116684] Modules linked in: test_user_copy(+) lzo_compress crc_itu_t zstd_compress zstd_decompress test_bpf test_static_keys test_static_key_base xxhash test_firmware af_key cls_bpf act_bpf bridge nf_nat_irc xt_NFLOG nfnetlink_log xt_policy nf_conntrack_netlink nfnetlink xt_nat nf_conntrack_irc xt_mark xt_tcpudp nf_nat_sip xt_TCPMSS xt_LOG nf_nat_ftp nf_conntrack_ftp xt_conntrack nf_conntrack_sip xt_addrtype xt_state 8021q iptable_filter ipt_MASQUERADE nf_log_ipv4 iptable_mangle nf_nat_masquerade_ipv4 ipt_REJECT nf_reject_ipv4 iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack ip_tables x_tables nf_log_arp nf_log_common ah4 ipcomp xfrm4_tunnel esp4 rpcrdma stp p8022 psnap llc xfrm_ipcomp xfrm_user xfrm_algo platform_lcd lcd ocxl virtio_balloon virtio_crypto crypto_engine
> [   19.118040]  vmx_crypto nbd zram zsmalloc virtio_blk st be2iscsi cxgb3i cxgb4i libcxgbi bnx2i ibmvfc sym53c8xx scsi_transport_spi scsi_dh_alua scsi_dh_rdac qla4xxx mpt3sas scsi_transport_sas cxlflash cxl libiscsi_tcp lpfc crc_t10dif crct10dif_generic crct10dif_common qla2xxx iscsi_boot_sysfs raid_class parport_pc parport powernv_op_panel powernv_rng pseries_rng rng_core virtio_console pcspkr input_leds evdev dm_round_robin dm_mirror dm_region_hash dm_log raid10 dm_service_time multipath dm_queue_length dm_multipath dm_thin_pool faulty dm_persistent_data dm_zero dm_crypt dm_bio_prison dm_snapshot dm_bufio raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq rpadlpar_io rpaphp jsm icom hvcs ib_ipoib ib_srp ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_ucm ib_ucm ib_uverbs
> [   19.119505]  rdma_cm iw_cm ib_cm mlx4_ib iw_cxgb3 iw_cxgb4 ib_mthca ib_core leds_powernv led_class vhost_net vhost macvtap macvlan dummy bsd_comp ppp_async crc_ccitt pppoe ppp_synctty pppox ppp_deflate ppp_generic 3c59x s2io bnx2 cnic uio bnx2x libcrc32c i40e ixgbe ixgb cxgb3 libcxgb cxgb cxgb4 pcnet32 netxen_nic qlge be2net acenic mlx4_en mlx4_core myri10ge bonding slhc tap mdio veth vxlan udp_tunnel tun usb_storage usbmon oprofile sha1_powerpc md5_ppc crc32c_vpmsum kvm hvcserver
> [   19.120358] CPU: 4 PID: 3159 Comm: modprobe Not tainted 4.17.0-rc3-gcc7x-g7204012 #1
> [   19.120508] NIP:  c00000000001f44c LR: c00000000001f894 CTR: 0000000000000000
> [   19.120666] REGS: c0000000f8d9f570 TRAP: 0300   Not tainted  (4.17.0-rc3-gcc7x-g7204012)
> [   19.120817] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 24222844  XER: 00000000
> [   19.120984] CFAR: c00000000000c03c DAR: d000000003852805 DSISR: 40000000 IRQMASK: 0 
>                GPR00: c00000000001f894 c0000000f8d9f7f0 d000000003829900 0000000002000000 
>                GPR04: c0000000f9a30048 000000000000ffe0 0000000000000000 03fffffff065dffd 
>                GPR08: 0000000080000000 d000000003852805 c0000000f8d9c000 d000000003820df0 
>                GPR12: c00000000009ebb0 c00000003fffb300 c0000000f8d9fd90 d000000003840000 
>                GPR16: d000000003840000 0000000000000000 c0000000011d6900 d000000003821ad0 
>                GPR20: c000000000bd7860 0000000000000000 c000000000ff9060 00000000014000c0 
>                GPR24: 0000000000000000 0000000000000000 0000000000000100 c0000000f9a30028 
>                GPR28: fffffffffffffff4 0000000002000000 8000000002009033 8000000000009033 
> [   19.122454] NIP [c00000000001f44c] msr_check_and_set+0x3c/0xc0
> [   19.122580] LR [c00000000001f894] enable_kernel_altivec+0x44/0x100
> [   19.122707] Call Trace:
> [   19.122789] [c0000000f8d9f7f0] [c00000000009e228] __copy_tofrom_user_base+0x9c/0x574 (unreliable)
> [   19.122962] [c0000000f8d9f870] [c00000000001f894] enable_kernel_altivec+0x44/0x100
> [   19.123344] [c0000000f8d9f8a0] [c00000000009f740] enter_vmx_ops+0x50/0x70
> [   19.123583] [c0000000f8d9f8c0] [c00000000009eee8] memcmp+0x338/0x680
> [   19.123728] [c0000000f8d9f9c0] [d000000003820190] test_user_copy_init+0x188/0xd14 [test_user_copy]
> [   19.123909] [c0000000f8d9fa70] [c00000000000e37c] do_one_initcall+0x5c/0x2d0
> [   19.124094] [c0000000f8d9fb30] [c00000000020066c] do_init_module+0x90/0x264
> [   19.124234] [c0000000f8d9fbc0] [c0000000001ff084] load_module+0x2f64/0x3600
> [   19.124371] [c0000000f8d9fd70] [c0000000001ff9c8] sys_finit_module+0xc8/0x110
> [   19.124530] [c0000000f8d9fe30] [c00000000000b868] system_call+0x58/0x6c
> [   19.124648] Instruction dump:
> [   19.124721] fba1ffe8 fbc1fff0 fbe1fff8 f8010010 f821ff81 7c7d1b78 60000000 60000000 
> [   19.124869] 7fe000a6 3d220003 39298f05 7ffeeb78 <89290000> 2f890000 419e0044 60000000 
> [   19.125034] ---[ end trace 7c08acedd4b4e6aa ]---
> 
> 
> cheers

^ permalink raw reply related

* Re: [PATCH v7 0/5] powerpc/64: memcmp() optimization
From: Naveen N. Rao @ 2018-06-06  6:36 UTC (permalink / raw)
  To: Michael Ellerman, Simon Guo; +Cc: Cyril Bur, linuxppc-dev
In-Reply-To: <20180606062153.GA7342@simonLocalRHEL7.x64>

Simon Guo wrote:
> Hi Michael,
> On Tue, Jun 05, 2018 at 12:16:22PM +1000, Michael Ellerman wrote:
>> Hi Simon,
>>=20
>> wei.guo.simon@gmail.com writes:
>> > From: Simon Guo <wei.guo.simon@gmail.com>
>> >
>> > There is some room to optimize memcmp() in powerpc 64 bits version for
>> > following 2 cases:
>> > (1) Even src/dst addresses are not aligned with 8 bytes at the beginni=
ng,
>> > memcmp() can align them and go with .Llong comparision mode without
>> > fallback to .Lshort comparision mode do compare buffer byte by byte.
>> > (2) VMX instructions can be used to speed up for large size comparisio=
n,
>> > currently the threshold is set for 4K bytes. Notes the VMX instruction=
s
>> > will lead to VMX regs save/load penalty. This patch set includes a
>> > patch to add a 32 bytes pre-checking to minimize the penalty.
>> >
>> > It did the similar with glibc commit dec4a7105e (powerpc: Improve memc=
mp=20
>> > performance for POWER8). Thanks Cyril Bur's information.
>> > This patch set also updates memcmp selftest case to make it compiled a=
nd
>> > incorporate large size comparison case.
>>=20
>> I'm seeing a few crashes with this applied, I haven't had time to look
>> into what is happening yet, sorry.
>>=20
>=20
> The bug is due to memcmp() invokes a C function enter_vmx_ops() who will =
load=20
> some PIC value based on r2.
>=20
> memcmp() doesn't use r2 and if the memcmp() is invoked from kernel
> itself, everything is fine. But if memcmp() is invoked from modules[test_=
user_copy],=20
> r2 will be required to be setup correctly. Otherwise the enter_vmx_ops() =
will refer=20
> to an incorrect/unexisting data location based on wrong r2 value.
>=20
> Following patch will fix this issue:
> ------------
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index 5eba49744a5a..24d093fa89bb 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -102,7 +102,7 @@
>   * 2) src/dst has different offset to the 8 bytes boundary. The handlers
>   * are named like .Ldiffoffset_xxxx
>   */
> -_GLOBAL(memcmp)
> +_GLOBAL_TOC(memcmp)
>         cmpdi   cr1,r5,0
>=20
>         /* Use the short loop if the src/dst addresses are not
> ----------
>=20
> It means the memcmp() fun entry will have additional 2 instructions. Is t=
here
> any way to save these 2 instructions when the memcmp() is actually invoke=
d
> from kernel itself?

That will be the case. We will end up entering the function via the=20
local entry point skipping the first two instructions. The Global entry=20
point is only used for cross-module calls.

- Naveen

=

^ permalink raw reply

* Re: [PATCH v6 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Simon Guo @ 2018-06-06  6:42 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: linuxppc-dev, Naveen N.  Rao, Cyril Bur
In-Reply-To: <20180530090321.GE5951@simonLocalRHEL7.x64>

Hi segher,
On Wed, May 30, 2018 at 05:03:21PM +0800, Simon Guo wrote:
> On Wed, May 30, 2018 at 03:35:40AM -0500, Segher Boessenkool wrote:
> > On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> > > Hi Segher,
> > > On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > > > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > > > +	/* save and restore cr0 */
> > > > > +	mfocrf  r5,64
> > > > > +	EXIT_VMX_OPS
> > > > > +	mtocrf	64,r5
> > > > > +	b	.LcmpAB_lightweight
> > > > 
> > > > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > > > you have it in a non-volatile CR field before so you need only one, if any).
> > > > 
> > > You are right :) How about using mtcr/mfcr instead, I think they are
> > > fast as well and more readable.
> > 
> > Those are much worse than m[ft]ocrf.
> > 
> > You probably should just shuffle things around so that EXIT_VMX_OPS
> > does not clobber the CR field you need to keep.
> Let me use mcrf then :)

I now felt unformatable to use mcrf like:
mcrf    7,0

since I cannot 100% confident that compiler will not use CR7 or other
CR# in exit_vmx_ops().

Can we switch back to mfocrf/mtocrf with correct CR0 value?
       mfocrf  r5,128
        ...
       mtocrf  128,r5

Thanks,
- Simon

^ permalink raw reply

* Re: Problems building ppc images in v4.14.y and v4.16.y using gcc 7.3.0 / 8.1.0 from kernel.org
From: Christophe LEROY @ 2018-06-06  6:44 UTC (permalink / raw)
  To: Arnd Bergmann, Guenter Roeck; +Cc: Greg Kroah-Hartman, linuxppc-dev, stable
In-Reply-To: <CAK8P3a0CjKCYCyhGejF6e3-Mk0mQ2=x3Kd76U8B4iB1amNr76g@mail.gmail.com>



Le 05/06/2018 à 21:47, Arnd Bergmann a écrit :
> On Tue, Jun 5, 2018 at 6:06 PM, Guenter Roeck <linux@roeck-us.net> wrote:
>> On Tue, Jun 05, 2018 at 04:31:00PM +0200, Arnd Bergmann wrote:
>>> On Tue, Jun 5, 2018 at 3:52 PM, Guenter Roeck <linux@roeck-us.net> wrote:
>>>> Hi Arnd,
>>>>
>>>> when using the ppc64 compiler from kernel.org, I see the following problems
>>>> when trying to compile ppc:allnoconfig in v4.14.y or v4.16.y.
>>>>
>>>> gcc 7.3.0: Compilation of kernel.cpu.o hangs
>>>>
>>>> The problem goes away if I apply the following two patches (tested with
>>>> 4.16.y)
>>>>
>>>> 17a2f1ced028 cpu/hotplug: Merge cpuhp_bp_states and cpuhp_ap_states
>>>> fcb3029a8d89 cpu/hotplug: Fix unused function warning
>>>
>>> This is probably the same as
>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84038
>>>
>>> I thought I had included the fix in my builds.
>>>
>> Guess not.
> 
> I probably had it in one build and then forgot about it when I did a
> rebuild of 7.3 :(
> 
> I'm still planning to do a new set of gcc-7.3 binaries (or maybe 7.4
> if that gets
> released soon) and should try to remember doing that.
> 
>>>
>> I think it may have cached the flags from the other compiler version.
>> "make mrproper" prior to "make defconfig" took care of the issue.
>>
>> However, that doesn't really help - I get lots of
>>          error: 'sys_spu_create' alias between functions of incompatible types
>>          error: 'strncpy' output truncated before terminating nul
>> if I try to use gcc 8.1.0.
>>
>> Oh well. I'll try gcc 6.4.0 next.
> 
> On the upside, those two errors are just a result of arch/power/*/*.c getting
> built with -Werror, they are warnings that gcc-8 introduced that we should
> either shut up or fix.

They are fixed in next:

https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=2479bfc9bc600dcce7f932d52dcfa8d677c41f93
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=c95998811807d897ca112ea62d66716ed733d058

Christophe

> 
>          Arnd
> 

^ permalink raw reply

* Re: [PATCH v7 0/5] powerpc/64: memcmp() optimization
From: Simon Guo @ 2018-06-06  6:53 UTC (permalink / raw)
  To: Naveen N. Rao; +Cc: Michael Ellerman, Cyril Bur, linuxppc-dev
In-Reply-To: <1528266847.dixm3thyfj.naveen@linux.ibm.com>

Hi Naveen,
On Wed, Jun 06, 2018 at 12:06:09PM +0530, Naveen N. Rao wrote:
> Simon Guo wrote:
> >Hi Michael,
> >On Tue, Jun 05, 2018 at 12:16:22PM +1000, Michael Ellerman wrote:
> >>Hi Simon,
> >>
> >>wei.guo.simon@gmail.com writes:
> >>> From: Simon Guo <wei.guo.simon@gmail.com>
> >>>
> >>> There is some room to optimize memcmp() in powerpc 64 bits version for
> >>> following 2 cases:
> >>> (1) Even src/dst addresses are not aligned with 8 bytes at the beginning,
> >>> memcmp() can align them and go with .Llong comparision mode without
> >>> fallback to .Lshort comparision mode do compare buffer byte by byte.
> >>> (2) VMX instructions can be used to speed up for large size comparision,
> >>> currently the threshold is set for 4K bytes. Notes the VMX instructions
> >>> will lead to VMX regs save/load penalty. This patch set includes a
> >>> patch to add a 32 bytes pre-checking to minimize the penalty.
> >>>
> >>> It did the similar with glibc commit dec4a7105e (powerpc:
> >>Improve memcmp > performance for POWER8). Thanks Cyril Bur's
> >>information.
> >>> This patch set also updates memcmp selftest case to make it compiled and
> >>> incorporate large size comparison case.
> >>
> >>I'm seeing a few crashes with this applied, I haven't had time to look
> >>into what is happening yet, sorry.
> >>
> >
> >The bug is due to memcmp() invokes a C function enter_vmx_ops()
> >who will load some PIC value based on r2.
> >
> >memcmp() doesn't use r2 and if the memcmp() is invoked from kernel
> >itself, everything is fine. But if memcmp() is invoked from
> >modules[test_user_copy], r2 will be required to be setup
> >correctly. Otherwise the enter_vmx_ops() will refer to an
> >incorrect/unexisting data location based on wrong r2 value.
> >
> >Following patch will fix this issue:
> >------------
> >diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> >index 5eba49744a5a..24d093fa89bb 100644
> >--- a/arch/powerpc/lib/memcmp_64.S
> >+++ b/arch/powerpc/lib/memcmp_64.S
> >@@ -102,7 +102,7 @@
> >  * 2) src/dst has different offset to the 8 bytes boundary. The handlers
> >  * are named like .Ldiffoffset_xxxx
> >  */
> >-_GLOBAL(memcmp)
> >+_GLOBAL_TOC(memcmp)
> >        cmpdi   cr1,r5,0
> >
> >        /* Use the short loop if the src/dst addresses are not
> >----------
> >
> >It means the memcmp() fun entry will have additional 2 instructions. Is there
> >any way to save these 2 instructions when the memcmp() is actually invoked
> >from kernel itself?
> 
> That will be the case. We will end up entering the function via the
> local entry point skipping the first two instructions. The Global
> entry point is only used for cross-module calls.
> 

Yes. Thanks :)

- Simon

^ permalink raw reply

* Re: [PATCH 08/11] macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver
From: Finn Thain @ 2018-06-06  6:57 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: Benjamin Herrenschmidt, Michael Schmitz, linuxppc-dev, linux-m68k,
	Linux Kernel Mailing List
In-Reply-To: <CAMuHMdVheowFa8bx0fg-_Dvx+TMZhPbvouTs82Y3tq1gtSpRMQ@mail.gmail.com>

On Mon, 4 Jun 2018, Geert Uytterhoeven wrote:

> 
> > Don't call pmu_shutdown() or pmu_restart() on early PowerBooks: the 
> > PMU device found in these PowerBooks isn't supported.
> 
> Shouldn't that be a separate patch?
> 
> > --- a/arch/m68k/mac/misc.c
> > +++ b/arch/m68k/mac/misc.c
> 
> > @@ -477,9 +445,8 @@ void mac_poweroff(void)
> >                    macintosh_config->adb_type == MAC_ADB_CUDA) {
> >                 cuda_shutdown();
> >  #endif
> > -#ifdef CONFIG_ADB_PMU68K
> > -       } else if (macintosh_config->adb_type == MAC_ADB_PB1
> > -               || macintosh_config->adb_type == MAC_ADB_PB2) {
> > +#ifdef CONFIG_ADB_PMU
> > +       } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
> >                 pmu_shutdown();
> >  #endif
> >         }
> > @@ -519,9 +486,8 @@ void mac_reset(void)
> >                    macintosh_config->adb_type == MAC_ADB_CUDA) {
> >                 cuda_restart();
> >  #endif
> > -#ifdef CONFIG_ADB_PMU68K
> > -       } else if (macintosh_config->adb_type == MAC_ADB_PB1
> > -               || macintosh_config->adb_type == MAC_ADB_PB2) {
> > +#ifdef CONFIG_ADB_PMU
> > +       } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
> >                 pmu_restart();
> >  #endif
> >         } else if (CPU_IS_030) {
> 

The stability problem here is bigger than just pmu_restart() and 
pmu_shutdown(), so those hunks are insufficient. You need to prevent the 
via-pmu68k driver from loading in the first place and to drop all the 
PMU_68K_V1 cases.

But splitting this patch in that way creates potential merge conflicts, 
which is a hassle. E.g. this hunk:

-       ....
-       switch (macintosh_config->adb_type) {
-       case MAC_ADB_PB1:
-               pmu_kind = PMU_68K_V1;
-               break;
-       case MAC_ADB_PB2:
-               pmu_kind = PMU_68K_V2;
-               break;
-       default:
-               pmu_kind = PMU_UNKNOWN;
-               return -ENODEV;
-       }
-       ...

would get split over two patches.

The way I see it, having no PMU driver for PMU_68K_V1 machines is a bug. 
And loading any of the existing PMU drivers on that hardware is also a 
bug. These bugs are equivalent in that either one means you can't use the 
keyboard, trackpad etc. So there's no value in cherry-picking the other 
bug.

If you are using v4.17 on a PMU_68K_V1 machine, you probably already have 
CONFIG_ADB_PMU68K=n. With that config, here's nothing to be gained from 
bisecting these changes.

Does that make sense? Did I overlook other possible reason(s) to split up 
this patch?

-- 

^ permalink raw reply

* Re: [PATCH 08/11] macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver
From: Geert Uytterhoeven @ 2018-06-06  7:15 UTC (permalink / raw)
  To: Finn Thain
  Cc: Benjamin Herrenschmidt, Michael Schmitz, linuxppc-dev, linux-m68k,
	Linux Kernel Mailing List
In-Reply-To: <alpine.LNX.2.21.1806061619240.8@nippy.intranet>

Hi Finn,

On Wed, Jun 6, 2018 at 8:57 AM, Finn Thain <fthain@telegraphics.com.au> wrote:
> On Mon, 4 Jun 2018, Geert Uytterhoeven wrote:
>> > Don't call pmu_shutdown() or pmu_restart() on early PowerBooks: the
>> > PMU device found in these PowerBooks isn't supported.
>>
>> Shouldn't that be a separate patch?
>>
>> > --- a/arch/m68k/mac/misc.c
>> > +++ b/arch/m68k/mac/misc.c
>>
>> > @@ -477,9 +445,8 @@ void mac_poweroff(void)
>> >                    macintosh_config->adb_type == MAC_ADB_CUDA) {
>> >                 cuda_shutdown();
>> >  #endif
>> > -#ifdef CONFIG_ADB_PMU68K
>> > -       } else if (macintosh_config->adb_type == MAC_ADB_PB1
>> > -               || macintosh_config->adb_type == MAC_ADB_PB2) {
>> > +#ifdef CONFIG_ADB_PMU
>> > +       } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
>> >                 pmu_shutdown();
>> >  #endif
>> >         }
>> > @@ -519,9 +486,8 @@ void mac_reset(void)
>> >                    macintosh_config->adb_type == MAC_ADB_CUDA) {
>> >                 cuda_restart();
>> >  #endif
>> > -#ifdef CONFIG_ADB_PMU68K
>> > -       } else if (macintosh_config->adb_type == MAC_ADB_PB1
>> > -               || macintosh_config->adb_type == MAC_ADB_PB2) {
>> > +#ifdef CONFIG_ADB_PMU
>> > +       } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
>> >                 pmu_restart();
>> >  #endif
>> >         } else if (CPU_IS_030) {
>>
>
> The stability problem here is bigger than just pmu_restart() and
> pmu_shutdown(), so those hunks are insufficient. You need to prevent the
> via-pmu68k driver from loading in the first place and to drop all the
> PMU_68K_V1 cases.
>
> But splitting this patch in that way creates potential merge conflicts,
> which is a hassle. E.g. this hunk:
>
> -       ....
> -       switch (macintosh_config->adb_type) {
> -       case MAC_ADB_PB1:
> -               pmu_kind = PMU_68K_V1;
> -               break;
> -       case MAC_ADB_PB2:
> -               pmu_kind = PMU_68K_V2;
> -               break;
> -       default:
> -               pmu_kind = PMU_UNKNOWN;
> -               return -ENODEV;
> -       }
> -       ...
>
> would get split over two patches.
>
> The way I see it, having no PMU driver for PMU_68K_V1 machines is a bug.
> And loading any of the existing PMU drivers on that hardware is also a
> bug. These bugs are equivalent in that either one means you can't use the
> keyboard, trackpad etc. So there's no value in cherry-picking the other
> bug.
>
> If you are using v4.17 on a PMU_68K_V1 machine, you probably already have
> CONFIG_ADB_PMU68K=n. With that config, here's nothing to be gained from
> bisecting these changes.
>
> Does that make sense? Did I overlook other possible reason(s) to split up
> this patch?

So 4.17 mac_defconfig won't work on PMU_68K_V1 machines?
Perhaps that should be fixed first.

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH 2/2] powerpc: Add support for function error injection
From: Naveen N. Rao @ 2018-06-06 10:38 UTC (permalink / raw)
  To: Josef Bacik, Masami Hiramatsu, Ingo Molnar, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <1527793526.g9zv2oo8k4.naveen@linux.ibm.com>

Naveen N. Rao wrote:
> Michael Ellerman wrote:
>>=20
>> I guess if it doesn't already apply to tip you should rebase it. You've
>> probably missed 4.18 anyway.
>=20
> Oh ok. I just tried and it seems to apply just fine. I'll post v2 after=20
> giving this a quick test.

I didn't post a v2 since I have decided against using this approach. The=20
reason for that is Masami's series to remove jprobes. The discussion=20
there reminded me that we can't easily override functions with kprobes=20
on powerpc. Though it works for this particular scenario, we would just=20
be setting a bad example.

As such, I won't be changing generic code, but will simply make the=20
necessary changes in powerpc code.

Sorry for the noise.

- Naveen

=

^ permalink raw reply

* [PATCH v2] powerpc: Add support for function error injection
From: Naveen N. Rao @ 2018-06-06 10:40 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

We implement regs_set_return_value() and override_function_with_return() 
for this purpose.

On powerpc, a return from a function (blr) just branches to the location
contained in the link register. So, we can just update pt_regs rather
than redirecting execution to a dummy function that returns.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/powerpc/Kconfig                       |  1 +
 arch/powerpc/include/asm/error-injection.h | 13 +++++++++++++
 arch/powerpc/include/asm/ptrace.h          |  5 +++++
 arch/powerpc/lib/Makefile                  |  2 ++
 arch/powerpc/lib/error-inject.c            | 12 ++++++++++++
 5 files changed, 33 insertions(+)
 create mode 100644 arch/powerpc/include/asm/error-injection.h
 create mode 100644 arch/powerpc/lib/error-inject.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f674006dea2f..8e61bc0f25cb 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -186,6 +186,7 @@ config PPC
 	select HAVE_EBPF_JIT			if PPC64
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS	if !(CPU_LITTLE_ENDIAN && POWER7_CPU)
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GCC_PLUGINS
diff --git a/arch/powerpc/include/asm/error-injection.h b/arch/powerpc/include/asm/error-injection.h
new file mode 100644
index 000000000000..740c3075bdf4
--- /dev/null
+++ b/arch/powerpc/include/asm/error-injection.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#ifndef _ASM_ERROR_INJECTION_H
+#define _ASM_ERROR_INJECTION_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm-generic/error-injection.h>
+
+void override_function_with_return(struct pt_regs *regs);
+
+#endif /* _ASM_ERROR_INJECTION_H */
diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h
index e4923686e43a..c0705296c2f0 100644
--- a/arch/powerpc/include/asm/ptrace.h
+++ b/arch/powerpc/include/asm/ptrace.h
@@ -101,6 +101,11 @@ static inline long regs_return_value(struct pt_regs *regs)
 		return -regs->gpr[3];
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->gpr[3] = rc;
+}
+
 #ifdef __powerpc64__
 #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1)
 #else
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..e0ed195eaa4b 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -14,6 +14,8 @@ obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
 obj-$(CONFIG_PPC32)	+= div64.o copy_32.o crtsavres.o
 
+obj-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
+
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
 # so it is only needed for modules, and only for older linkers which
diff --git a/arch/powerpc/lib/error-inject.c b/arch/powerpc/lib/error-inject.c
new file mode 100644
index 000000000000..6d3d57f68024
--- /dev/null
+++ b/arch/powerpc/lib/error-inject.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+	/* Emulate 'blr' instruction */
+	regs->nip = regs->link;
+}
+NOKPROBE_SYMBOL(override_function_with_return);
-- 
2.17.0

^ permalink raw reply related

* RE: [PATCH 09/10] dpaa_eth: add support for hardware timestamping
From: Y.b. Lu @ 2018-06-06 11:48 UTC (permalink / raw)
  To: Richard Cochran
  Cc: netdev@vger.kernel.org, Madalin-cristian Bucur, Rob Herring,
	Shawn Guo, David S . Miller, devicetree@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180605135748.mlarwiyzf2oe27ax@localhost>

SGkgUmljaGFyZCwNCg0KPiAtLS0tLU9yaWdpbmFsIE1lc3NhZ2UtLS0tLQ0KPiBGcm9tOiBSaWNo
YXJkIENvY2hyYW4gW21haWx0bzpyaWNoYXJkY29jaHJhbkBnbWFpbC5jb21dDQo+IFNlbnQ6IFR1
ZXNkYXksIEp1bmUgNSwgMjAxOCA5OjU4IFBNDQo+IFRvOiBZLmIuIEx1IDx5YW5nYm8ubHVAbnhw
LmNvbT4NCj4gQ2M6IG5ldGRldkB2Z2VyLmtlcm5lbC5vcmc7IE1hZGFsaW4tY3Jpc3RpYW4gQnVj
dXINCj4gPG1hZGFsaW4uYnVjdXJAbnhwLmNvbT47IFJvYiBIZXJyaW5nIDxyb2JoK2R0QGtlcm5l
bC5vcmc+OyBTaGF3biBHdW8NCj4gPHNoYXduZ3VvQGtlcm5lbC5vcmc+OyBEYXZpZCBTIC4gTWls
bGVyIDxkYXZlbUBkYXZlbWxvZnQubmV0PjsNCj4gZGV2aWNldHJlZUB2Z2VyLmtlcm5lbC5vcmc7
IGxpbnV4cHBjLWRldkBsaXN0cy5vemxhYnMub3JnOw0KPiBsaW51eC1hcm0ta2VybmVsQGxpc3Rz
LmluZnJhZGVhZC5vcmc7IGxpbnV4LWtlcm5lbEB2Z2VyLmtlcm5lbC5vcmcNCj4gU3ViamVjdDog
UmU6IFtQQVRDSCAwOS8xMF0gZHBhYV9ldGg6IGFkZCBzdXBwb3J0IGZvciBoYXJkd2FyZSB0aW1l
c3RhbXBpbmcNCj4gDQo+IE9uIFR1ZSwgSnVuIDA1LCAyMDE4IGF0IDAzOjM1OjI4QU0gKzAwMDAs
IFkuYi4gTHUgd3JvdGU6DQo+ID4gW1kuYi4gTHVdIEFjdHVhbGx5IHRoZXNlIHRpbWVzdGFtcGlu
ZyBjb2RlcyBhZmZlY3RlZCBEUEFBIG5ldHdvcmtpbmcNCj4gcGVyZm9ybWFuY2UgaW4gb3VyIHBy
ZXZpb3VzIHBlcmZvcm1hbmNlIHRlc3QuDQo+ID4gVGhhdCdzIHdoeSB3ZSB1c2VkIGlmZGVmIGZv
ciBpdC4NCj4gDQo+IEhvdyBtdWNoIGRvZXMgdGltZSBzdGFtcGluZyBodXJ0IHBlcmZvcm1hbmNl
Pw0KPiANCj4gSWYgdGhlIHRpbWUgc3RhbXBpbmcgaXMgY29tcGlsZWQgaW4gYnV0IG5vdCBlbmFi
bGVkIGF0IHJ1biB0aW1lLCBkb2VzIGl0IHN0aWxsDQo+IGFmZmVjdCBwZXJmb3JtYWNlPw0KDQpb
WS5iLiBMdV0gSSBjYW4ndCByZW1lbWJlciBhbmQgZmluZCB0aGUgb2xkIGRhdGEgc2luY2UgaXQg
aGFkIGJlZW4gYSBsb25nIHRpbWUuDQpJIGp1c3QgZGlkIHRoZSBpcGVyZiB0ZXN0IHRvZGF5IGJl
dHdlZW4gdHdvIDEwRyBwb3J0cy4gSSBkaWRu4oCZdCBzZWUgYW55IHBlcmZvcm1hbmNlIGNoYW5n
ZXMgd2l0aCB0aW1lc3RhbXBpbmcgY29kZSDwn5iKDQpTbywgbGV0J3MgbWUgcmVtb3ZlIHRoZSBp
ZmRlZiBpbiBuZXh0IHZlcnNpb24uDQpUaGFua3MgYSBsb3QuDQoNCg0KPiANCj4gVGhhbmtzLA0K
PiBSaWNoYXJkDQo=

^ permalink raw reply

* [PATCH v4 1/2] powerpc/time: Only set CONFIG_ARCH_HAS_SCALED_CPUTIME on PPC64
From: Christophe Leroy @ 2018-06-06 14:21 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Frederic Weisbecker, Nicholas Piggin
  Cc: linux-kernel, linuxppc-dev

scaled cputime is only meaningfull when the processor has
SPURR and/or PURR, which means only on PPC64.

Removing it on PPC32 significantly reduces the size of
vtime_account_system() and vtime_account_idle() on an 8xx:

Before:
00000000 l     F .text	000000a8 vtime_delta
00000280 g     F .text	0000010c vtime_account_system
0000038c g     F .text	00000048 vtime_account_idle

After:
(vtime_delta gets inlined in the two functions)
000001d8 g     F .text	000000a0 vtime_account_system
00000278 g     F .text	00000038 vtime_account_idle

In terms of performance, we also get approximatly 5% improvement on task switch:
The following small benchmark app is run with perf stat:

void *thread(void *arg)
{
	int i;

	for (i = 0; i < atoi((char*)arg); i++)
		pthread_yield();
}

int main(int argc, char **argv)
{
	pthread_t th1, th2;

	pthread_create(&th1, NULL, thread, argv[1]);
	pthread_create(&th2, NULL, thread, argv[1]);
	pthread_join(th1, NULL);
	pthread_join(th2, NULL);

	return 0;
}

Before the patch:

~# perf stat chrt -f 98 ./sched 100000

 Performance counter stats for 'chrt -f 98 ./sched 100000':

       8622.166272      task-clock (msec)         #    0.955 CPUs utilized
            200027      context-switches          #    0.023 M/sec

After the patch:

~# perf stat chrt -f 98 ./sched 100000

 Performance counter stats for 'chrt -f 98 ./sched 100000':

       8207.090048      task-clock (msec)         #    0.958 CPUs utilized
            200025      context-switches          #    0.024 M/sec

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v4:
  - Using the correct symbol CONFIG_ARCH_HAS_SCALED_CPUTIME instead of ARCH_HAS_SCALED_CPUTIME
  - Grouped CONFIG_ARCH_HAS_SCALED_CPUTIME related code in dedicated functions to reduce the number of #ifdefs
  - Integrated read_spurr() directly into the related function.
 v3: Rebased following modifications in xmon.c
 v2: added ifdefs in xmon to fix compilation error

 arch/powerpc/Kconfig                  |   2 +-
 arch/powerpc/include/asm/accounting.h |   4 ++
 arch/powerpc/include/asm/cputime.h    |   1 -
 arch/powerpc/kernel/time.c            | 111 +++++++++++++++++++++-------------
 arch/powerpc/xmon/xmon.c              |   4 ++
 5 files changed, 77 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b62a16e2c7cc..735398fd390d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -142,7 +142,7 @@ config PPC
 	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PMEM_API                if PPC64
 	select ARCH_HAS_MEMBARRIER_CALLBACKS
-	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
+	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_STRICT_KERNEL_RWX	if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h
index 3abcf98ed2e0..c607c5d835cc 100644
--- a/arch/powerpc/include/asm/accounting.h
+++ b/arch/powerpc/include/asm/accounting.h
@@ -15,8 +15,10 @@ struct cpu_accounting_data {
 	/* Accumulated cputime values to flush on ticks*/
 	unsigned long utime;
 	unsigned long stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	unsigned long utime_scaled;
 	unsigned long stime_scaled;
+#endif
 	unsigned long gtime;
 	unsigned long hardirq_time;
 	unsigned long softirq_time;
@@ -25,8 +27,10 @@ struct cpu_accounting_data {
 	/* Internal counters */
 	unsigned long starttime;	/* TB value snapshot */
 	unsigned long starttime_user;	/* TB value on exit to usermode */
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	unsigned long startspurr;	/* SPURR value snapshot */
 	unsigned long utime_sspurr;	/* ->user_time when ->startspurr set */
+#endif
 };
 
 #endif
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index bc4903badb3f..a48c7b5e5cf9 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -62,7 +62,6 @@ static inline void arch_vtime_task_switch(struct task_struct *prev)
 	struct cpu_accounting_data *acct0 = get_accounting(prev);
 
 	acct->starttime = acct0->starttime;
-	acct->startspurr = acct0->startspurr;
 }
 #endif
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 70f145e02487..7a9f4e2f22c8 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -171,19 +171,6 @@ static void calc_cputime_factors(void)
 	__cputime_usec_factor = res.result_low;
 }
 
-/*
- * Read the SPURR on systems that have it, otherwise the PURR,
- * or if that doesn't exist return the timebase value passed in.
- */
-static unsigned long read_spurr(unsigned long tb)
-{
-	if (cpu_has_feature(CPU_FTR_SPURR))
-		return mfspr(SPRN_SPURR);
-	if (cpu_has_feature(CPU_FTR_PURR))
-		return mfspr(SPRN_PURR);
-	return tb;
-}
-
 #ifdef CONFIG_PPC_SPLPAR
 
 /*
@@ -277,30 +264,27 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
 
 #endif /* CONFIG_PPC_SPLPAR */
 
-/*
- * Account time for a transition between system, hard irq
- * or soft irq state.
- */
-static unsigned long vtime_delta(struct task_struct *tsk,
-				 unsigned long *stime_scaled,
-				 unsigned long *steal_time)
+static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct,
+					unsigned long now, unsigned long stime)
 {
-	unsigned long now, nowscaled, deltascaled;
-	unsigned long stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+	unsigned long nowscaled, deltascaled;
 	unsigned long utime, utime_scaled;
-	struct cpu_accounting_data *acct = get_accounting(tsk);
+	unsigned long stime_scaled;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	/*
+	 * Read the SPURR on systems that have it, otherwise the PURR,
+	 * or if that doesn't exist user the timebase value passed in.
+	 */
+	if (cpu_has_feature(CPU_FTR_SPURR))
+		nowscaled = mfspr(SPRN_SPURR);
+	else if (cpu_has_feature(CPU_FTR_PURR))
+		nowscaled = mfspr(SPRN_PURR);
+	else
+		nowscaled = now;
 
-	now = mftb();
-	nowscaled = read_spurr(now);
-	stime = now - acct->starttime;
-	acct->starttime = now;
 	deltascaled = nowscaled - acct->startspurr;
 	acct->startspurr = nowscaled;
-
-	*steal_time = calculate_stolen_time(now);
-
 	utime = acct->utime - acct->utime_sspurr;
 	acct->utime_sspurr = acct->utime;
 
@@ -314,18 +298,46 @@ static unsigned long vtime_delta(struct task_struct *tsk,
 	 * the user ticks get saved up in paca->user_time_scaled to be
 	 * used by account_process_tick.
 	 */
-	*stime_scaled = stime;
+	stime_scaled = stime;
 	utime_scaled = utime;
 	if (deltascaled != stime + utime) {
 		if (utime) {
-			*stime_scaled = deltascaled * stime / (stime + utime);
-			utime_scaled = deltascaled - *stime_scaled;
+			stime_scaled = deltascaled * stime / (stime + utime);
+			utime_scaled = deltascaled - stime_scaled;
 		} else {
-			*stime_scaled = deltascaled;
+			stime_scaled = deltascaled;
 		}
 	}
 	acct->utime_scaled += utime_scaled;
 
+	return stime_scaled;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Account time for a transition between system, hard irq
+ * or soft irq state.
+ */
+static unsigned long vtime_delta(struct task_struct *tsk,
+				 unsigned long *stime_scaled,
+				 unsigned long *steal_time)
+{
+	unsigned long now;
+	unsigned long stime;
+	struct cpu_accounting_data *acct = get_accounting(tsk);
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	now = mftb();
+	stime = now - acct->starttime;
+	acct->starttime = now;
+
+	*stime_scaled = vtime_delta_scaled(acct, now, stime);
+
+	*steal_time = calculate_stolen_time(now);
+
 	return stime;
 }
 
@@ -341,7 +353,9 @@ void vtime_account_system(struct task_struct *tsk)
 
 	if ((tsk->flags & PF_VCPU) && !irq_count()) {
 		acct->gtime += stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 		acct->utime_scaled += stime_scaled;
+#endif
 	} else {
 		if (hardirq_count())
 			acct->hardirq_time += stime;
@@ -350,7 +364,9 @@ void vtime_account_system(struct task_struct *tsk)
 		else
 			acct->stime += stime;
 
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 		acct->stime_scaled += stime_scaled;
+#endif
 	}
 }
 EXPORT_SYMBOL_GPL(vtime_account_system);
@@ -364,6 +380,21 @@ void vtime_account_idle(struct task_struct *tsk)
 	acct->idle_time += stime + steal_time;
 }
 
+static void vtime_flush_scaled(struct task_struct *tsk,
+			       struct cpu_accounting_data *acct)
+{
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+	if (acct->utime_scaled)
+		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
+	if (acct->stime_scaled)
+		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
+
+	acct->utime_scaled = 0;
+	acct->utime_sspurr = 0;
+	acct->stime_scaled = 0;
+#endif
+}
+
 /*
  * Account the whole cputime accumulated in the paca
  * Must be called with interrupts disabled.
@@ -378,9 +409,6 @@ void vtime_flush(struct task_struct *tsk)
 	if (acct->utime)
 		account_user_time(tsk, cputime_to_nsecs(acct->utime));
 
-	if (acct->utime_scaled)
-		tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
-
 	if (acct->gtime)
 		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
 
@@ -393,8 +421,6 @@ void vtime_flush(struct task_struct *tsk)
 	if (acct->stime)
 		account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
 					  CPUTIME_SYSTEM);
-	if (acct->stime_scaled)
-		tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
 
 	if (acct->hardirq_time)
 		account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
@@ -404,15 +430,14 @@ void vtime_flush(struct task_struct *tsk)
 					  CPUTIME_SOFTIRQ);
 
 	acct->utime = 0;
-	acct->utime_scaled = 0;
-	acct->utime_sspurr = 0;
 	acct->gtime = 0;
 	acct->steal_time = 0;
 	acct->idle_time = 0;
 	acct->stime = 0;
-	acct->stime_scaled = 0;
 	acct->hardirq_time = 0;
 	acct->softirq_time = 0;
+
+	vtime_flush_scaled(tsk, acct);
 }
 
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 47166ad2a669..b1e551d40ee1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2443,11 +2443,15 @@ static void dump_one_paca(int cpu)
 
 	DUMP(p, accounting.utime, "%#-*lx");
 	DUMP(p, accounting.stime, "%#-*lx");
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	DUMP(p, accounting.utime_scaled, "%#-*lx");
+#endif
 	DUMP(p, accounting.starttime, "%#-*lx");
 	DUMP(p, accounting.starttime_user, "%#-*lx");
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	DUMP(p, accounting.startspurr, "%#-*lx");
 	DUMP(p, accounting.utime_sspurr, "%#-*lx");
+#endif
 	DUMP(p, accounting.steal_time, "%#-*lx");
 #undef DUMP
 
-- 
2.13.3

^ permalink raw reply related

* [PATCH v4 2/2] powerpc/time: no steal_time when CONFIG_PPC_SPLPAR is not selected
From: Christophe Leroy @ 2018-06-06 14:21 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Frederic Weisbecker, Nicholas Piggin
  Cc: linux-kernel, linuxppc-dev
In-Reply-To: <d9ac8da98f53debb4758b98d0227979aca9196f7.1528292284.git.christophe.leroy@c-s.fr>

If CONFIG_PPC_SPLPAR is not selected, steal_time will always
be NUL, so accounting it is pointless

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 v4: removed the check in vtime_account_system(), the compiler removes the code regardless.
 v3: new

 arch/powerpc/kernel/time.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a9f4e2f22c8..eda78b1ed7d3 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -412,9 +412,6 @@ void vtime_flush(struct task_struct *tsk)
 	if (acct->gtime)
 		account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
 
-	if (acct->steal_time)
-		account_steal_time(cputime_to_nsecs(acct->steal_time));
-
 	if (acct->idle_time)
 		account_idle_time(cputime_to_nsecs(acct->idle_time));
 
@@ -431,13 +428,17 @@ void vtime_flush(struct task_struct *tsk)
 
 	acct->utime = 0;
 	acct->gtime = 0;
-	acct->steal_time = 0;
 	acct->idle_time = 0;
 	acct->stime = 0;
 	acct->hardirq_time = 0;
 	acct->softirq_time = 0;
 
 	vtime_flush_scaled(tsk, acct);
+
+	if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) {
+		account_steal_time(cputime_to_nsecs(acct->steal_time));
+		acct->steal_time = 0;
+	}
 }
 
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-- 
2.13.3

^ permalink raw reply related

* Re: [PATCH v6 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: Segher Boessenkool @ 2018-06-06 20:00 UTC (permalink / raw)
  To: Simon Guo; +Cc: linuxppc-dev, Naveen N.  Rao, Cyril Bur
In-Reply-To: <20180606064227.GB7342@simonLocalRHEL7.x64>

On Wed, Jun 06, 2018 at 02:42:27PM +0800, Simon Guo wrote:
> I now felt unformatable to use mcrf like:
> mcrf    7,0
> 
> since I cannot 100% confident that compiler will not use CR7 or other
> CR# in exit_vmx_ops().

It wasn't clear to me this macro boils down to a function call.

You can use CR2,CR3,CR4, but you'll need to save and restore those at
the start and end of function then, which is just as nasty.

Better is to restructure some code so you don't need that CR field
there anymore.

> Can we switch back to mfocrf/mtocrf with correct CR0 value?
>        mfocrf  r5,128
>         ...
>        mtocrf  128,r5

Sure, I'm not your boss ;-)  It seems a shame to me to have this 12 or
whatever cycle delay here, since the whole point of the patch is to
make things faster, that's all (but it still is faster, right, you
tested it).


Segher

^ permalink raw reply

* Re: powerpc/64s/radix: Fix missing ptesync in flush_cache_vmap
From: Michael Ellerman @ 2018-06-07  0:17 UTC (permalink / raw)
  To: Nicholas Piggin, linuxppc-dev; +Cc: Nicholas Piggin
In-Reply-To: <20180606014008.20363-1-npiggin@gmail.com>

On Wed, 2018-06-06 at 01:40:08 UTC, Nicholas Piggin wrote:
> There is a typo in f1cb8f9beb ("powerpc/64s/radix: avoid ptesync after
> set_pte and ptep_set_access_flags") config ifdef, which results in the
> necessary ptesync not being issued after vmalloc.
> 
> This causes random kernel faults in module load, bpf load, anywhere
> that vmalloc mappings are used.
> 
> After correcting the code, this survives a guest kernel booting
> hundreds of times where previously there would be a crash every few
> boots (I haven't noticed the crash on host, perhaps due to different
> TLB and page table walking behaviour in hardware).
> 
> A memory clobber is also added to the flush, just to be sure it won't
> be reordered with the pte set or the subsequent mapping access.
> 
> Fixes: f1cb8f9beb ("powerpc/64s/radix: avoid ptesync after set_pte and ptep_set_access_flags")
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/ff5bc793e47b537bf3e904fada585e

cheers

^ permalink raw reply

* Re: [PATCH v4 1/2] powerpc/time: Only set CONFIG_ARCH_HAS_SCALED_CPUTIME on PPC64
From: Nicholas Piggin @ 2018-06-07  1:43 UTC (permalink / raw)
  To: Christophe Leroy
  Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Frederic Weisbecker, linux-kernel, linuxppc-dev
In-Reply-To: <d9ac8da98f53debb4758b98d0227979aca9196f7.1528292284.git.christophe.leroy@c-s.fr>

On Wed,  6 Jun 2018 14:21:08 +0000 (UTC)
Christophe Leroy <christophe.leroy@c-s.fr> wrote:

> scaled cputime is only meaningfull when the processor has
> SPURR and/or PURR, which means only on PPC64.
> 
> Removing it on PPC32 significantly reduces the size of
> vtime_account_system() and vtime_account_idle() on an 8xx:
> 
> Before:
> 00000000 l     F .text	000000a8 vtime_delta
> 00000280 g     F .text	0000010c vtime_account_system
> 0000038c g     F .text	00000048 vtime_account_idle
> 
> After:
> (vtime_delta gets inlined in the two functions)
> 000001d8 g     F .text	000000a0 vtime_account_system
> 00000278 g     F .text	00000038 vtime_account_idle
> 
> In terms of performance, we also get approximatly 5% improvement on task switch:
> The following small benchmark app is run with perf stat:
> 
> void *thread(void *arg)
> {
> 	int i;
> 
> 	for (i = 0; i < atoi((char*)arg); i++)
> 		pthread_yield();
> }
> 
> int main(int argc, char **argv)
> {
> 	pthread_t th1, th2;
> 
> 	pthread_create(&th1, NULL, thread, argv[1]);
> 	pthread_create(&th2, NULL, thread, argv[1]);
> 	pthread_join(th1, NULL);
> 	pthread_join(th2, NULL);
> 
> 	return 0;
> }
> 
> Before the patch:
> 
> ~# perf stat chrt -f 98 ./sched 100000
> 
>  Performance counter stats for 'chrt -f 98 ./sched 100000':
> 
>        8622.166272      task-clock (msec)         #    0.955 CPUs utilized
>             200027      context-switches          #    0.023 M/sec
> 
> After the patch:
> 
> ~# perf stat chrt -f 98 ./sched 100000
> 
>  Performance counter stats for 'chrt -f 98 ./sched 100000':
> 
>        8207.090048      task-clock (msec)         #    0.958 CPUs utilized
>             200025      context-switches          #    0.024 M/sec
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>

This looks okay to me. Nice numbers.

> ---
>  v4:
>   - Using the correct symbol CONFIG_ARCH_HAS_SCALED_CPUTIME instead of ARCH_HAS_SCALED_CPUTIME
>   - Grouped CONFIG_ARCH_HAS_SCALED_CPUTIME related code in dedicated functions to reduce the number of #ifdefs
>   - Integrated read_spurr() directly into the related function.
>  v3: Rebased following modifications in xmon.c
>  v2: added ifdefs in xmon to fix compilation error
> 
>  arch/powerpc/Kconfig                  |   2 +-
>  arch/powerpc/include/asm/accounting.h |   4 ++
>  arch/powerpc/include/asm/cputime.h    |   1 -
>  arch/powerpc/kernel/time.c            | 111 +++++++++++++++++++++-------------
>  arch/powerpc/xmon/xmon.c              |   4 ++
>  5 files changed, 77 insertions(+), 45 deletions(-)
> 
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index b62a16e2c7cc..735398fd390d 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -142,7 +142,7 @@ config PPC
>  	select ARCH_HAS_PHYS_TO_DMA
>  	select ARCH_HAS_PMEM_API                if PPC64
>  	select ARCH_HAS_MEMBARRIER_CALLBACKS
> -	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
> +	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE && PPC64

I wonder if we could make this depend on PPC_PSERIES or even
PPC_SPLPAR as well? (That would be for a later patch)

Thanks,
Nick

^ permalink raw reply

* [PATCH v8 0/5] powerpc/64: memcmp() optimization
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo

From: Simon Guo <wei.guo.simon@gmail.com>

There is some room to optimize memcmp() in powerpc 64 bits version for
following 2 cases:
(1) Even src/dst addresses are not aligned with 8 bytes at the beginning,
memcmp() can align them and go with .Llong comparision mode without
fallback to .Lshort comparision mode do compare buffer byte by byte.
(2) VMX instructions can be used to speed up for large size comparision,
currently the threshold is set for 4K bytes. Notes the VMX instructions
will lead to VMX regs save/load penalty. This patch set includes a
patch to add a 32 bytes pre-checking to minimize the penalty.

It did the similar with glibc commit dec4a7105e (powerpc: Improve memcmp 
performance for POWER8). Thanks Cyril Bur's information.
This patch set also updates memcmp selftest case to make it compiled and
incorporate large size comparison case.

v7 -> v8:
- define memcmp with _GLOBAL_TOC() instead of _GLOBAL() to fix TOC issue.
add _GLOBAL_TOC() definition into selftest so that it can be compiled.
- use mfocrf/mtocrf instead of mcrf to save/restore CR0

v6 -> v7:
- add vcmpequd/vcmpequdb .long macro
- add CPU_FTR pair so that Power7 won't invoke Altivec instrs.
- rework some instructions for higher performance or more readable.

v5 -> v6:
- correct some comments/commit messsage.
- rename VMX_OPS_THRES to VMX_THRESH

v4 -> v5:
- Expand 32 bytes prechk to src/dst different offset case, and remove
KSM specific label/comment.

v3 -> v4:
- Add 32 bytes pre-checking before using VMX instructions.

v2 -> v3:
- add optimization for src/dst with different offset against 8 bytes
boundary.
- renamed some label names.
- reworked some comments from Cyril Bur, such as fill the pipeline, 
and use VMX when size == 4K.
- fix a bug of enter/exit_vmx_ops pairness issue. And revised test 
case to test whether enter/exit_vmx_ops are paired.

v1 -> v2:
- update 8bytes unaligned bytes comparison method.
- fix a VMX comparision bug.
- enhanced the original memcmp() selftest.
- add powerpc/64 to subject/commit message.


Simon Guo (5):
  powerpc/64: Align bytes before fall back to .Lshort in powerpc64
    memcmp()
  powerpc: add vcmpequd/vcmpequb ppc instruction macro
  powerpc/64: enhance memcmp() with VMX instruction for long bytes
    comparision
  powerpc/64: add 32 bytes prechecking before using VMX optimization on
    memcmp()
  powerpc:selftest update memcmp_64 selftest for VMX implementation

 arch/powerpc/include/asm/asm-prototypes.h          |   4 +-
 arch/powerpc/include/asm/ppc-opcode.h              |  11 +
 arch/powerpc/lib/copypage_power7.S                 |   4 +-
 arch/powerpc/lib/memcmp_64.S                       | 414 ++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S                   |   6 +-
 arch/powerpc/lib/vmx-helper.c                      |   4 +-
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |   4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h |  39 ++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    |  25 ++
 .../testing/selftests/powerpc/stringloops/memcmp.c |  98 +++--
 10 files changed, 568 insertions(+), 41 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

-- 
1.8.3.1

^ permalink raw reply

* [PATCH v8 1/5] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1528336675-10879-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently memcmp() 64bytes version in powerpc will fall back to .Lshort
(compare per byte mode) if either src or dst address is not 8 bytes aligned.
It can be opmitized in 2 situations:

1) if both addresses are with the same offset with 8 bytes boundary:
memcmp() can compare the unaligned bytes within 8 bytes boundary firstly
and then compare the rest 8-bytes-aligned content with .Llong mode.

2)  If src/dst addrs are not with the same offset of 8 bytes boundary:
memcmp() can align src addr with 8 bytes, increment dst addr accordingly,
 then load src with aligned mode and load dst with unaligned mode.

This patch optmizes memcmp() behavior in the above 2 situations.

Tested with both little/big endian. Performance result below is based on
little endian.

Following is the test result with src/dst having the same offset case:
(a similar result was observed when src/dst having different offset):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
	29.773018302 seconds time elapsed                                          ( +- 0.09% )
- with patch
	16.485568173 seconds time elapsed                                          ( +-  0.02% )
		-> There is ~+80% percent improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
	0.244746482 seconds time elapsed                                          ( +-  0.36%)
- with patch
	0.215069477 seconds time elapsed                                          ( +-  0.51%)
		-> There is ~+13% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
       1.845642503 seconds time elapsed                                          ( +- 0.12% )
- With patch
       1.849767135 seconds time elapsed                                          ( +- 0.26% )
		-> They are nearly the same. (-0.2%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 140 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 133 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..5776f91 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ * are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ * are named like .Ldiffoffset_xxxx
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	rlwinm  r6,r3,3,26,28
+	beq     .Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi   cr5,r5,7
+	srdi    r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi  r5,r5,61
+	mtctr   r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi   r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	slwi	r6,r6,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +323,39 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	rlwinm  r6,r3,3,26,28
+	beq     .Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4  /* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic  r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+	cmpdi   cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v8 2/5] powerpc: add vcmpequd/vcmpequb ppc instruction macro
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1528336675-10879-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Some old tool chains don't know about instructions like vcmpequd.

This patch adds .long macro for vcmpequd and vcmpequb, which is
a preparation to optimize ppc64 memcmp with VMX instructions.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/ppc-opcode.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 18883b8..1866a97 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -366,6 +366,8 @@
 #define PPC_INST_STFDX			0x7c0005ae
 #define PPC_INST_LVX			0x7c0000ce
 #define PPC_INST_STVX			0x7c0001ce
+#define PPC_INST_VCMPEQUD		0x100000c7
+#define PPC_INST_VCMPEQUB		0x10000006
 
 /* macros to insert fields into opcodes */
 #define ___PPC_RA(a)	(((a) & 0x1f) << 16)
@@ -396,6 +398,7 @@
 #define __PPC_BI(s)	(((s) & 0x1f) << 16)
 #define __PPC_CT(t)	(((t) & 0x0f) << 21)
 #define __PPC_SPR(r)	((((r) & 0x1f) << 16) | ((((r) >> 5) & 0x1f) << 11))
+#define __PPC_RC21	(0x1 << 10)
 
 /*
  * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
@@ -567,4 +570,12 @@
 				       ((IH & 0x7) << 21))
 #define PPC_INVALIDATE_ERAT	PPC_SLBIA(7)
 
+#define VCMPEQUD_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUD | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUB | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v8 3/5] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1528336675-10879-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch add VMX primitives to do memcmp() in case the compare size
is equal or greater than 4K bytes. KSM feature can benefit from this.

Test result with following test program(replace the "^>" with ""):
------
># cat tools/testing/selftests/powerpc/stringloops/memcmp.c
>#include <malloc.h>
>#include <stdlib.h>
>#include <string.h>
>#include <time.h>
>#include "utils.h"
>#define SIZE (1024 * 1024 * 900)
>#define ITERATIONS 40

int test_memcmp(const void *s1, const void *s2, size_t n);

static int testcase(void)
{
        char *s1;
        char *s2;
        unsigned long i;

        s1 = memalign(128, SIZE);
        if (!s1) {
                perror("memalign");
                exit(1);
        }

        s2 = memalign(128, SIZE);
        if (!s2) {
                perror("memalign");
                exit(1);
        }

        for (i = 0; i < SIZE; i++)  {
                s1[i] = i & 0xff;
                s2[i] = i & 0xff;
        }
        for (i = 0; i < ITERATIONS; i++) {
		int ret = test_memcmp(s1, s2, SIZE);

		if (ret) {
			printf("return %d at[%ld]! should have returned zero\n", ret, i);
			abort();
		}
	}

        return 0;
}

int main(void)
{
        return test_harness(testcase, "memcmp");
}
------
Without this patch (but with the first patch "powerpc/64: Align bytes
before fall back to .Lshort in powerpc64 memcmp()." in the series):
	4.726728762 seconds time elapsed                                          ( +-  3.54%)
With VMX patch:
	4.234335473 seconds time elapsed                                          ( +-  2.63%)
		There is ~+10% improvement.

Testing with unaligned and different offset version (make s1 and s2 shift
random offset within 16 bytes) can archieve higher improvement than 10%..

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |   4 +-
 arch/powerpc/lib/copypage_power7.S        |   4 +-
 arch/powerpc/lib/memcmp_64.S              | 241 +++++++++++++++++++++++++++++-
 arch/powerpc/lib/memcpy_power7.S          |   6 +-
 arch/powerpc/lib/vmx-helper.c             |   4 +-
 5 files changed, 248 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 5776f91..be2f792 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -9,6 +9,7 @@
  */
 #include <asm/ppc_asm.h>
 #include <asm/export.h>
+#include <asm/ppc-opcode.h>
 
 #define off8	r6
 #define off16	r7
@@ -27,12 +28,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -40,7 +102,7 @@
  * 2) src/dst has different offset to the 8 bytes boundary. The handlers
  * are named like .Ldiffoffset_xxxx
  */
-_GLOBAL(memcmp)
+_GLOBAL_TOC(memcmp)
 	cmpdi	cr1,r5,0
 
 	/* Use the short loop if the src/dst addresses are not
@@ -132,7 +194,7 @@ _GLOBAL(memcmp)
 	bgt	cr6,.Llong
 
 .Lcmp_lt32bytes:
-	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
 	cmpdi   cr5,r5,7
 	srdi    r0,r5,3
 	ble	cr5,.Lcmp_rest_lt8bytes
@@ -173,6 +235,15 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* Try to use vmx loop if length is equal or greater than 4K */
+	cmpldi  cr6,r5,VMX_THRESH
+	bge	cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -330,7 +401,97 @@ _GLOBAL(memcmp)
 	li	r3,-1
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf  r5,128
+	EXIT_VMX_OPS
+	mtocrf  128,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi  r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl	cr6,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	rlwinm  r6,r3,3,26,28
 	beq     .Ldiffoffset_align_s1_8bytes
@@ -356,6 +517,82 @@ _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq     cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.   r6,r3,0xf
+	li	off16,16
+	beq     .Ldiffoffset_vmx_s1_16bytes_align
 
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx     v5,0,r3
+	lvx     v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	VCMPEQUB_RC(v7,v9,v10)
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic  r6,r6,16
+	subf    r5,r6,r5
+	add     r3,r3,r6
+	add     r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx     v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi  r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v8 4/5] powerpc/64: add 32 bytes prechecking before using VMX optimization on memcmp()
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1528336675-10879-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch is based on the previous VMX patch on memcmp().

To optimize ppc64 memcmp() with VMX instruction, we need to think about
the VMX penalty brought with: If kernel uses VMX instruction, it needs
to save/restore current thread's VMX registers. There are 32 x 128 bits
VMX registers in PPC, which means 32 x 16 = 512 bytes for load and store.

The major concern regarding the memcmp() performance in kernel is KSM,
who will use memcmp() frequently to merge identical pages. So it will
make sense to take some measures/enhancement on KSM to see whether any
improvement can be done here.  Cyril Bur indicates that the memcmp() for
KSM has a higher possibility to fail (unmatch) early in previous bytes
in following mail.
	https://patchwork.ozlabs.org/patch/817322/#1773629
And I am taking a follow-up on this with this patch.

Per some testing, it shows KSM memcmp() will fail early at previous 32
bytes.  More specifically:
    - 76% cases will fail/unmatch before 16 bytes;
    - 83% cases will fail/unmatch before 32 bytes;
    - 84% cases will fail/unmatch before 64 bytes;
So 32 bytes looks a better choice than other bytes for pre-checking.

The early failure is also true for memcmp() for non-KSM case. With a
non-typical call load, it shows ~73% cases fail before first 32 bytes.

This patch adds a 32 bytes pre-checking firstly before jumping into VMX
operations, to avoid the unnecessary VMX penalty. It is not limited to
KSM case. And the testing shows ~20% improvement on memcmp() average
execution time with this patch.

And note the 32B pre-checking is only performed when the compare size
is long enough (>=4K currently) to allow VMX operation.

The detail data and analysis is at:
https://github.com/justdoitqd/publicFiles/blob/master/memcmp/README.md

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 57 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index be2f792..844d8e7 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -404,8 +404,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_ALTIVEC
 .Lsameoffset_vmx_cmp:
 	/* Enter with src/dst addrs has the same offset with 8 bytes
-	 * align boundary
+	 * align boundary.
+	 *
+	 * There is an optimization based on following fact: memcmp()
+	 * prones to fail early at the first 32 bytes.
+	 * Before applying VMX instructions which will lead to 32x128bits
+	 * VMX regs load/restore penalty, we compare the first 32 bytes
+	 * so that we can catch the ~80% fail cases.
 	 */
+
+	li	r0,4
+	mtctr	r0
+.Lsameoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Lsameoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Llong_novmx_cmp
 
@@ -482,16 +501,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif
 
 .Ldiffoffset_8bytes_make_align_start:
-#ifdef CONFIG_ALTIVEC
-BEGIN_FTR_SECTION
-	/* only do vmx ops when the size equal or greater than 4K bytes */
-	cmpdi	cr5,r5,VMX_THRESH
-	bge	cr5,.Ldiffoffset_vmx_cmp
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-
-.Ldiffoffset_novmx_cmp:
-#endif
-
 	/* now try to align s1 with 8 bytes */
 	rlwinm  r6,r3,3,26,28
 	beq     .Ldiffoffset_align_s1_8bytes
@@ -515,6 +524,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Ldiffoffset_align_s1_8bytes:
 	/* now s1 is aligned with 8 bytes. */
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
+
 	cmpdi   cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
 
@@ -526,6 +546,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 #ifdef CONFIG_ALTIVEC
 .Ldiffoffset_vmx_cmp:
+	/* perform a 32 bytes pre-checking before
+	 * enable VMX operations.
+	 */
+	li	r0,4
+	mtctr	r0
+.Ldiffoffset_prechk_32B_loop:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne     cr0,.LcmpAB_lightweight
+	addi	r5,r5,-8
+	bdnz	.Ldiffoffset_prechk_32B_loop
+
 	ENTER_VMX_OPS
 	beq     cr1,.Ldiffoffset_novmx_cmp
 
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH v8 5/5] powerpc:selftest update memcmp_64 selftest for VMX implementation
From: wei.guo.simon @ 2018-06-07  1:57 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Paul Mackerras, Michael Ellerman, Naveen N.  Rao, Cyril Bur,
	Simon Guo
In-Reply-To: <1528336675-10879-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch reworked selftest memcmp_64 so that memcmp selftest can
cover more test cases.

It adds testcases for:
- memcmp over 4K bytes size.
- s1/s2 with different/random offset on 16 bytes boundary.
- enter/exit_vmx_ops pairness.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 .../selftests/powerpc/copyloops/asm/ppc_asm.h      |  4 +-
 .../selftests/powerpc/stringloops/asm/ppc-opcode.h | 39 +++++++++
 .../selftests/powerpc/stringloops/asm/ppc_asm.h    | 25 ++++++
 .../testing/selftests/powerpc/stringloops/memcmp.c | 98 +++++++++++++++++-----
 4 files changed, 142 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
index 5ffe04d..dfce161 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -36,11 +36,11 @@
 	li	r3,0
 	blr
 
-FUNC_START(enter_vmx_copy)
+FUNC_START(enter_vmx_ops)
 	li	r3,1
 	blr
 
-FUNC_START(exit_vmx_copy)
+FUNC_START(exit_vmx_ops)
 	blr
 
 FUNC_START(memcpy_power7)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
new file mode 100644
index 0000000..9de413c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * provides masks and opcode images for use by code generation, emulation
+ * and for instructions that older assemblers might not know about
+ */
+#ifndef _ASM_POWERPC_PPC_OPCODE_H
+#define _ASM_POWERPC_PPC_OPCODE_H
+
+
+#  define stringify_in_c(...)	__VA_ARGS__
+#  define ASM_CONST(x)		x
+
+
+#define PPC_INST_VCMPEQUD_RC		0x100000c7
+#define PPC_INST_VCMPEQUB_RC		0x10000006
+
+#define __PPC_RC21     (0x1 << 10)
+
+/* macros to insert fields into opcodes */
+#define ___PPC_RA(a)	(((a) & 0x1f) << 16)
+#define ___PPC_RB(b)	(((b) & 0x1f) << 11)
+#define ___PPC_RS(s)	(((s) & 0x1f) << 21)
+#define ___PPC_RT(t)	___PPC_RS(t)
+
+#define VCMPEQUD_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUD_RC | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb)	stringify_in_c(.long PPC_INST_VCMPEQUB_RC | \
+			      ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+			      ___PPC_RB(vrb) | __PPC_RC21)
+
+#endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
index 136242e..d2c0a91 100644
--- a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -1,4 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define __PPC_ASM_H
 #include <ppc-asm.h>
 
 #ifndef r1
@@ -6,3 +8,26 @@
 #endif
 
 #define _GLOBAL(A) FUNC_START(test_ ## A)
+#define _GLOBAL_TOC(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE	256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
+#define BEGIN_FTR_SECTION
+#define END_FTR_SECTION_IFSET(val)
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index 8250db2..b5cf717 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,20 +2,40 @@
 #include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
+#include <time.h>
 #include "utils.h"
 
 #define SIZE 256
 #define ITERATIONS 10000
 
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+int vmx_count;
+int enter_vmx_ops(void)
+{
+	vmx_count++;
+	return 1;
+}
+
+void exit_vmx_ops(void)
+{
+	vmx_count--;
+}
 int test_memcmp(const void *s1, const void *s2, size_t n);
 
 /* test all offsets and lengths */
-static void test_one(char *s1, char *s2)
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+		unsigned long size_start, unsigned long max_size)
 {
 	unsigned long offset, size;
 
-	for (offset = 0; offset < SIZE; offset++) {
-		for (size = 0; size < (SIZE-offset); size++) {
+	for (offset = 0; offset < max_offset; offset++) {
+		for (size = size_start; size < (max_size - offset); size++) {
 			int x, y;
 			unsigned long i;
 
@@ -35,70 +55,104 @@ static void test_one(char *s1, char *s2)
 				printf("\n");
 				abort();
 			}
+
+			if (vmx_count != 0) {
+				printf("vmx enter/exit not paired.(offset:%ld size:%ld s1:%p s2:%p vc:%d\n",
+					offset, size, s1, s2, vmx_count);
+				printf("\n");
+				abort();
+			}
 		}
 	}
 }
 
-static int testcase(void)
+static int testcase(bool islarge)
 {
 	char *s1;
 	char *s2;
 	unsigned long i;
 
-	s1 = memalign(128, SIZE);
+	unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
+	unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+	int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+	s1 = memalign(128, alloc_size);
 	if (!s1) {
 		perror("memalign");
 		exit(1);
 	}
 
-	s2 = memalign(128, SIZE);
+	s2 = memalign(128, alloc_size);
 	if (!s2) {
 		perror("memalign");
 		exit(1);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change one byte */
-		change = random() % SIZE;
-		s2[change] = random() & 0xff;
-
-		test_one(s1, s2);
+		change = random() % comp_size;
+		rand_s2[change] = random() & 0xff;
+
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
-	srandom(1);
+	srandom(time(0));
 
-	for (i = 0; i < ITERATIONS; i++) {
+	for (i = 0; i < iterations; i++) {
 		unsigned long j;
 		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
 
-		for (j = 0; j < SIZE; j++)
+		for (j = 0; j < alloc_size; j++)
 			s1[j] = random();
 
-		memcpy(s2, s1, SIZE);
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
 
 		/* change multiple bytes, 1/8 of total */
-		for (j = 0; j < SIZE / 8; j++) {
-			change = random() % SIZE;
+		for (j = 0; j < comp_size / 8; j++) {
+			change = random() % comp_size;
 			s2[change] = random() & 0xff;
 		}
 
-		test_one(s1, s2);
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+					LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
 	}
 
 	return 0;
 }
 
+static int testcases(void)
+{
+	testcase(0);
+	testcase(1);
+	return 0;
+}
+
 int main(void)
 {
-	return test_harness(testcase, "memcmp");
+	return test_harness(testcases, "memcmp");
 }
-- 
1.8.3.1

^ permalink raw reply related

* [v2, 00/10] Support DPAA PTP clock and timestamping
From: Yangbo Lu @ 2018-06-07  3:22 UTC (permalink / raw)
  To: netdev, madalin.bucur, Richard Cochran, Rob Herring, Shawn Guo,
	David S . Miller
  Cc: devicetree, linuxppc-dev, linux-arm-kernel, linux-kernel,
	Yangbo Lu

This patchset is to support DPAA FMAN PTP clock and HW timestamping.
- The patch #1 to patch #5 are to support DPAA FMAN 1588 timer in
  ptp_qoriq driver.
- The patch #6 to patch #10 are to add HW timestamping support in
  DPAA ethernet driver.

Yangbo Lu (10):
  fsl/fman: share the event interrupt
  ptp: support DPAA FMan 1588 timer in ptp_qoriq
  dt-binding: ptp_qoriq: add DPAA FMan support
  powerpc/mpc85xx: move ptp timer out of fman in dts
  arm64: dts: fsl: move ptp timer out of fman
  fsl/fman: add set_tstamp interface
  fsl/fman_port: support getting timestamp field
  fsl/fman: define frame description command UPD
  dpaa_eth: add support for hardware timestamping
  dpaa_eth: add the get_ts_info interface for ethtool

 Documentation/devicetree/bindings/net/fsl-fman.txt |   25 +-----
 .../devicetree/bindings/ptp/ptp-qoriq.txt          |   15 +++-
 arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi   |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi        |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi        |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi       |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi       |   14 ++-
 arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi      |   14 ++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c     |  101 ++++++++++++++++++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.h     |    3 +
 drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c |   39 ++++++++
 drivers/net/ethernet/freescale/fman/fman.c         |    3 +-
 drivers/net/ethernet/freescale/fman/fman.h         |    1 +
 drivers/net/ethernet/freescale/fman/fman_dtsec.c   |   27 +++++
 drivers/net/ethernet/freescale/fman/fman_dtsec.h   |    1 +
 drivers/net/ethernet/freescale/fman/fman_memac.c   |    5 +
 drivers/net/ethernet/freescale/fman/fman_memac.h   |    1 +
 drivers/net/ethernet/freescale/fman/fman_port.c    |   12 +++
 drivers/net/ethernet/freescale/fman/fman_port.h    |    3 +
 drivers/net/ethernet/freescale/fman/fman_tgec.c    |   21 ++++
 drivers/net/ethernet/freescale/fman/fman_tgec.h    |    1 +
 drivers/net/ethernet/freescale/fman/mac.c          |    3 +
 drivers/net/ethernet/freescale/fman/mac.h          |    1 +
 drivers/ptp/Kconfig                                |    2 +-
 drivers/ptp/ptp_qoriq.c                            |  104 ++++++++++++-------
 include/linux/fsl/ptp_qoriq.h                      |   38 ++++++--
 26 files changed, 375 insertions(+), 115 deletions(-)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox