[RFC PATCH 0/3] kernel: add support for 256-bit IO access

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 0/3] kernel: add support for 256-bit IO access
@ 2018-03-19 14:20 Rahul Lakkireddy
  2018-03-19 14:20 ` [RFC PATCH 1/3] include/linux: add 256-bit IO accessors Rahul Lakkireddy
                   ` (4 more replies)
  0 siblings, 5 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-19 14:20 UTC (permalink / raw)
  To: x86, linux-kernel, netdev
  Cc: tglx, mingo, hpa, davem, akpm, torvalds, ganeshgr, nirranjan,
	indranil, Rahul Lakkireddy

This series of patches add support for 256-bit IO read and write.
The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
and write 256-bits at a time from IO, respectively.

Patch 1 adds u256 type and adds necessary non-atomic accessors.  Also
adds byteorder conversion APIs.

Patch 2 adds 256-bit read and write to x86 via VMOVDQU AVX CPU
instructions.

Patch 3 updates cxgb4 driver to use the readqq API to speed up
reading on-chip memory 256-bits at a time.

Feedback and suggestions will be much appreciated.

Thanks,
Rahul

Rahul Lakkireddy (3):
  include/linux: add 256-bit IO accessors
  x86/io: implement 256-bit IO read and write
  cxgb4: read on-chip memory 256-bits at a time

 arch/x86/include/asm/io.h                      | 57 ++++++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 16 +++----
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h     |  6 +++
 include/linux/byteorder/generic.h              | 48 +++++++++++++++++++++
 include/linux/io-64-nonatomic-hi-lo.h          | 59 ++++++++++++++++++++++++++
 include/linux/io-64-nonatomic-lo-hi.h          | 59 ++++++++++++++++++++++++++
 include/linux/types.h                          |  7 +++
 7 files changed, 243 insertions(+), 9 deletions(-)

-- 
2.14.1

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [RFC PATCH 1/3] include/linux: add 256-bit IO accessors
  2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
@ 2018-03-19 14:20 ` Rahul Lakkireddy
  2018-03-19 14:20 ` [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write Rahul Lakkireddy
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-19 14:20 UTC (permalink / raw)
  To: x86, linux-kernel, netdev
  Cc: tglx, mingo, hpa, davem, akpm, torvalds, ganeshgr, nirranjan,
	indranil, Rahul Lakkireddy

Add readqq and writeqq (quad quadword - 4 x 64) IO non-atomic
accessors.  Also add byteorder conversions for 256-bit.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 include/linux/byteorder/generic.h     | 48 ++++++++++++++++++++++++++++
 include/linux/io-64-nonatomic-hi-lo.h | 59 +++++++++++++++++++++++++++++++++++
 include/linux/io-64-nonatomic-lo-hi.h | 59 +++++++++++++++++++++++++++++++++++
 include/linux/types.h                 |  7 +++++
 4 files changed, 173 insertions(+)

diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h
index 451aaa0786ae..c6c936818a3a 100644
--- a/include/linux/byteorder/generic.h
+++ b/include/linux/byteorder/generic.h
@@ -187,4 +187,52 @@ static inline void be32_to_cpu_array(u32 *dst, const __be32 *src, size_t len)
 		dst[i] = be32_to_cpu(src[i]);
 }
 
+static inline __le256 cpu_to_le256(u256 val)
+{
+	__le256 ret;
+
+	ret.a = cpu_to_le64(val.a);
+	ret.b = cpu_to_le64(val.b);
+	ret.c = cpu_to_le64(val.c);
+	ret.d = cpu_to_le64(val.d);
+
+	return ret;
+}
+
+static inline u256 le256_to_cpu(__le256 val)
+{
+	u256 ret;
+
+	ret.a = le64_to_cpu(val.a);
+	ret.b = le64_to_cpu(val.b);
+	ret.c = le64_to_cpu(val.c);
+	ret.d = le64_to_cpu(val.d);
+
+	return ret;
+}
+
+static inline __be256 cpu_to_be256(u256 val)
+{
+	__be256 ret;
+
+	ret.a = cpu_to_be64(val.d);
+	ret.b = cpu_to_be64(val.c);
+	ret.c = cpu_to_be64(val.b);
+	ret.d = cpu_to_be64(val.a);
+
+	return ret;
+}
+
+static inline u256 be256_to_cpu(__be256 val)
+{
+	u256 ret;
+
+	ret.a = be64_to_cpu(val.d);
+	ret.b = be64_to_cpu(val.c);
+	ret.c = be64_to_cpu(val.b);
+	ret.d = be64_to_cpu(val.a);
+
+	return ret;
+}
+
 #endif /* _LINUX_BYTEORDER_GENERIC_H */
diff --git a/include/linux/io-64-nonatomic-hi-lo.h b/include/linux/io-64-nonatomic-hi-lo.h
index 862d786a904f..9bdc42132020 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -55,4 +55,63 @@ static inline void hi_lo_writeq_relaxed(__u64 val, volatile void __iomem *addr)
 #define writeq_relaxed hi_lo_writeq_relaxed
 #endif
 
+static inline __u256 hi_lo_readqq(const volatile void __iomem *addr)
+{
+	const volatile u64 __iomem *p = addr;
+	__u256 ret;
+
+	ret.d = readq(p + 3);
+	ret.c = readq(p + 2);
+	ret.b = readq(p + 1);
+	ret.a = readq(p);
+
+	return ret;
+}
+
+static inline void hi_lo_writeqq(__u256 val, volatile void __iomem *addr)
+{
+	writeq(val.d, addr + 24);
+	writeq(val.c, addr + 16);
+	writeq(val.b, addr + 8);
+	writeq(val.a, addr);
+}
+
+static inline __u256 hi_lo_readqq_relaxed(const volatile void __iomem *addr)
+{
+	const volatile u64 __iomem *p = addr;
+	__u256 ret;
+
+	ret.d = readq_relaxed(p + 3);
+	ret.c = readq_relaxed(p + 2);
+	ret.b = readq_relaxed(p + 1);
+	ret.a = readq_relaxed(p);
+
+	return ret;
+}
+
+static inline void hi_lo_writeqq_relaxed(__u256 val,
+					 volatile void __iomem *addr)
+{
+	writeq_relaxed(val.d, addr + 24);
+	writeq_relaxed(val.c, addr + 16);
+	writeq_relaxed(val.b, addr + 8);
+	writeq_relaxed(val.a, addr);
+}
+
+#ifndef readqq
+#define readqq hi_lo_readqq
+#endif
+
+#ifndef writeqq
+#define writeqq hi_lo_writeqq
+#endif
+
+#ifndef readqq_relaxed
+#define readqq_relaxed hi_lo_readqq_relaxed
+#endif
+
+#ifndef writeqq_relaxed
+#define writeqq_relaxed hi_lo_writeqq_relaxed
+#endif
+
 #endif	/* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h b/include/linux/io-64-nonatomic-lo-hi.h
index d042e7bb5adb..8aaf734e1d51 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -55,4 +55,63 @@ static inline void lo_hi_writeq_relaxed(__u64 val, volatile void __iomem *addr)
 #define writeq_relaxed lo_hi_writeq_relaxed
 #endif
 
+static inline __u256 lo_hi_readqq(const volatile void __iomem *addr)
+{
+	const volatile u64 __iomem *p = addr;
+	__u256 ret;
+
+	ret.a = readq(p);
+	ret.b = readq(p + 1);
+	ret.c = readq(p + 2);
+	ret.d = readq(p + 3);
+
+	return ret;
+}
+
+static inline void lo_hi_writeqq(__u256 val, volatile void __iomem *addr)
+{
+	writeq(val.a, addr);
+	writeq(val.b, addr + 8);
+	writeq(val.c, addr + 16);
+	writeq(val.d, addr + 24);
+}
+
+static inline __u256 lo_hi_readqq_relaxed(const volatile void __iomem *addr)
+{
+	const volatile u64 __iomem *p = addr;
+	__u256 ret;
+
+	ret.a = readq_relaxed(p);
+	ret.b = readq_relaxed(p + 1);
+	ret.c = readq_relaxed(p + 2);
+	ret.d = readq_relaxed(p + 3);
+
+	return ret;
+}
+
+static inline void lo_hi_writeqq_relaxed(__u256 val,
+					 volatile void __iomem *addr)
+{
+	writeq_relaxed(val.a, addr);
+	writeq_relaxed(val.b, addr + 8);
+	writeq_relaxed(val.c, addr + 16);
+	writeq_relaxed(val.d, addr + 24);
+}
+
+#ifndef readqq
+#define readqq lo_hi_readqq
+#endif
+
+#ifndef writeqq
+#define writeqq lo_hi_writeqq
+#endif
+
+#ifndef readqq_relaxed
+#define readqq_relaxed lo_hi_readqq_relaxed
+#endif
+
+#ifndef writeqq_relaxed
+#define writeqq_relaxed lo_hi_writeqq_relaxed
+#endif
+
 #endif	/* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
diff --git a/include/linux/types.h b/include/linux/types.h
index c94d59ef96cc..f4ee2857d253 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -114,6 +114,13 @@ typedef		__u64		u_int64_t;
 typedef		__s64		int64_t;
 #endif
 
+typedef struct {
+	__u64 a, b, c, d;
+} __u256;
+typedef __u256 u256;
+typedef __u256 __le256;
+typedef __u256 __be256;
+
 /* this is a special 64bit data type that is 8-byte aligned */
 #define aligned_u64 __u64 __attribute__((aligned(8)))
 #define aligned_be64 __be64 __attribute__((aligned(8)))
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
  2018-03-19 14:20 ` [RFC PATCH 1/3] include/linux: add 256-bit IO accessors Rahul Lakkireddy
@ 2018-03-19 14:20 ` Rahul Lakkireddy
  2018-03-19 14:43   ` Thomas Gleixner
  2018-03-19 14:20 ` [RFC PATCH 3/3] cxgb4: read on-chip memory 256-bits at a time Rahul Lakkireddy
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-19 14:20 UTC (permalink / raw)
  To: x86, linux-kernel, netdev
  Cc: tglx, mingo, hpa, davem, akpm, torvalds, ganeshgr, nirranjan,
	indranil, Rahul Lakkireddy

Use VMOVDQU AVX CPU instruction when available to do 256-bit
IO read and write.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 arch/x86/include/asm/io.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 95e948627fd0..b04f417b3374 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -109,7 +109,62 @@ build_mmio_write(__writeq, "q", unsigned long, "r", )
 #define readq			readq
 #define writeq			writeq
 
-#endif
+#ifdef CONFIG_AS_AVX
+#include <asm/fpu/api.h>
+
+static inline u256 __readqq(const volatile void __iomem *addr)
+{
+	u256 ret;
+
+	kernel_fpu_begin();
+	asm volatile("vmovdqu %0, %%ymm0" :
+		     : "m" (*(volatile u256 __force *)addr));
+	asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
+	kernel_fpu_end();
+	return ret;
+}
+
+static inline u256 readqq(const volatile void __iomem *addr)
+{
+	u256 ret;
+
+	kernel_fpu_begin();
+	asm volatile("vmovdqu %0, %%ymm0" :
+		     : "m" (*(volatile u256 __force *)addr));
+	asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret) : : "memory");
+	kernel_fpu_end();
+	return ret;
+}
+
+#define __raw_readqq __readqq
+#define readqq_relaxed(a) __readqq(a)
+#define readqq readqq
+
+static inline void __writeqq(u256 val, volatile void __iomem *addr)
+{
+	kernel_fpu_begin();
+	asm volatile("vmovdqu %0, %%ymm0" : : "m" (val));
+	asm volatile("vmovdqu %%ymm0, %0"
+		     : "=m" (*(volatile u256 __force *)addr));
+	kernel_fpu_end();
+}
+
+static inline void writeqq(u256 val, volatile void __iomem *addr)
+{
+	kernel_fpu_begin();
+	asm volatile("vmovdqu %0, %%ymm0" : : "m" (val));
+	asm volatile("vmovdqu %%ymm0, %0"
+		     : "=m" (*(volatile u256 __force *)addr)
+		     : : "memory");
+	kernel_fpu_end();
+}
+
+#define __raw_writeqq __writeqq
+#define writeqq_relaxed(a) __writeqq(a)
+#define writeqq writeqq
+#endif /* CONFIG_AS_AVX */
+
+#endif /* CONFIG_X86_64 */
 
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
 extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-19 14:20 ` [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write Rahul Lakkireddy
@ 2018-03-19 14:43   ` Thomas Gleixner
  2018-03-20 13:32     ` Rahul Lakkireddy
  0 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2018-03-19 14:43 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: x86, linux-kernel, netdev, mingo, hpa, davem, akpm, torvalds,
	ganeshgr, nirranjan, indranil

On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:

> Use VMOVDQU AVX CPU instruction when available to do 256-bit
> IO read and write.

That's not what the patch does. See below.

> Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>

That Signed-off-by chain is wrong....

> +#ifdef CONFIG_AS_AVX
> +#include <asm/fpu/api.h>
> +
> +static inline u256 __readqq(const volatile void __iomem *addr)
> +{
> +	u256 ret;
> +
> +	kernel_fpu_begin();
> +	asm volatile("vmovdqu %0, %%ymm0" :
> +		     : "m" (*(volatile u256 __force *)addr));
> +	asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> +	kernel_fpu_end();
> +	return ret;

You _cannot_ assume that the instruction is available just because
CONFIG_AS_AVX is set. The availability is determined by the runtime
evaluated CPU feature flags, i.e. X86_FEATURE_AVX.

Aside of that I very much doubt that this is faster than 4 consecutive
64bit reads/writes as you have the full overhead of
kernel_fpu_begin()/end() for each access.

You did not provide any numbers for this so its even harder to
determine.

As far as I can tell the code where you are using this is a debug
facility. What's the point? Debug is hardly a performance critical problem.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-19 14:43   ` Thomas Gleixner
@ 2018-03-20 13:32     ` Rahul Lakkireddy
  2018-03-20 13:44       ` Andy Shevchenko
                         ` (2 more replies)
  0 siblings, 3 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-20 13:32 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
> 
> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
> > IO read and write.
> 
> That's not what the patch does. See below.
> 
> > Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> > Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> 
> That Signed-off-by chain is wrong....
> 
> > +#ifdef CONFIG_AS_AVX
> > +#include <asm/fpu/api.h>
> > +
> > +static inline u256 __readqq(const volatile void __iomem *addr)
> > +{
> > +	u256 ret;
> > +
> > +	kernel_fpu_begin();
> > +	asm volatile("vmovdqu %0, %%ymm0" :
> > +		     : "m" (*(volatile u256 __force *)addr));
> > +	asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> > +	kernel_fpu_end();
> > +	return ret;
> 
> You _cannot_ assume that the instruction is available just because
> CONFIG_AS_AVX is set. The availability is determined by the runtime
> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
> 

Ok.  Will add boot_cpu_has(X86_FEATURE_AVX) check as well.

> Aside of that I very much doubt that this is faster than 4 consecutive
> 64bit reads/writes as you have the full overhead of
> kernel_fpu_begin()/end() for each access.
> 
> You did not provide any numbers for this so its even harder to
> determine.
> 

Sorry about that.  Here are the numbers with and without this series.

When reading up to 2 GB on-chip memory via MMIO, the time taken:

Without Series        With Series
(64-bit read)         (256-bit read)

52 seconds            26 seconds

As can be seen, we see good improvement with doing 256-bits at a
time.

> As far as I can tell the code where you are using this is a debug
> facility. What's the point? Debug is hardly a performance critical problem.
> 

On High Availability Server, the logs of the failing system must be
collected as quickly as possible.  So, we're concerned with the amount
of time taken to collect our large on-chip memory.  We see improvement
in doing 256-bit reads at a time.

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 13:32     ` Rahul Lakkireddy
@ 2018-03-20 13:44       ` Andy Shevchenko
  2018-03-21 12:27         ` Rahul Lakkireddy
  2018-03-20 14:40       ` David Laight
  2018-03-20 14:42       ` Alexander Duyck
  2 siblings, 1 reply; 47+ messages in thread
From: Andy Shevchenko @ 2018-03-20 13:44 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: Thomas Gleixner, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tue, Mar 20, 2018 at 3:32 PM, Rahul Lakkireddy
<rahul.lakkireddy@chelsio.com> wrote:
> On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
>> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:

>> Aside of that I very much doubt that this is faster than 4 consecutive
>> 64bit reads/writes as you have the full overhead of
>> kernel_fpu_begin()/end() for each access.
>>
>> You did not provide any numbers for this so its even harder to
>> determine.
>>
>
> Sorry about that.  Here are the numbers with and without this series.
>
> When reading up to 2 GB on-chip memory via MMIO, the time taken:
>
> Without Series        With Series
> (64-bit read)         (256-bit read)
>
> 52 seconds            26 seconds
>
> As can be seen, we see good improvement with doing 256-bits at a
> time.

But this is kinda synthetic test, right?
If you run in a normal use case where kernel not only collecting logs,
but doing something else, especially with frequent userspace
interaction, would be trend the same?

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 13:44       ` Andy Shevchenko
@ 2018-03-21 12:27         ` Rahul Lakkireddy
  0 siblings, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-21 12:27 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Thomas Gleixner, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tuesday, March 03/20/18, 2018 at 19:14:46 +0530, Andy Shevchenko wrote:
> On Tue, Mar 20, 2018 at 3:32 PM, Rahul Lakkireddy
> <rahul.lakkireddy@chelsio.com> wrote:
> > On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> >> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
> 
> >> Aside of that I very much doubt that this is faster than 4 consecutive
> >> 64bit reads/writes as you have the full overhead of
> >> kernel_fpu_begin()/end() for each access.
> >>
> >> You did not provide any numbers for this so its even harder to
> >> determine.
> >>
> >
> > Sorry about that.  Here are the numbers with and without this series.
> >
> > When reading up to 2 GB on-chip memory via MMIO, the time taken:
> >
> > Without Series        With Series
> > (64-bit read)         (256-bit read)
> >
> > 52 seconds            26 seconds
> >
> > As can be seen, we see good improvement with doing 256-bits at a
> > time.
> 
> But this is kinda synthetic test, right?
> If you run in a normal use case where kernel not only collecting logs,
> but doing something else, especially with frequent userspace
> interaction, would be trend the same?
> 

We see same improvement when collecting logs while running
heavy IO with iozone.

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 13:32     ` Rahul Lakkireddy
  2018-03-20 13:44       ` Andy Shevchenko
@ 2018-03-20 14:40       ` David Laight
  2018-03-21 12:28         ` Rahul Lakkireddy
  2018-03-20 14:42       ` Alexander Duyck
  2 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-20 14:40 UTC (permalink / raw)
  To: 'Rahul Lakkireddy', Thomas Gleixner
  Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

From: Rahul Lakkireddy
> Sent: 20 March 2018 13:32
...
> On High Availability Server, the logs of the failing system must be
> collected as quickly as possible.  So, we're concerned with the amount
> of time taken to collect our large on-chip memory.  We see improvement
> in doing 256-bit reads at a time.

Two other options:

1) Get the device to DMA into host memory.

2) Use mmap() (and vm_iomap_memory() in your driver) to get direct
   userspace access to the (I assume) PCIe memory space.
   You can then use whatever copy instructions the cpu has.
   (Just don't use memcpy().)

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 14:40       ` David Laight
@ 2018-03-21 12:28         ` Rahul Lakkireddy
  0 siblings, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-21 12:28 UTC (permalink / raw)
  To: David Laight
  Cc: Thomas Gleixner, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tuesday, March 03/20/18, 2018 at 20:10:19 +0530, David Laight wrote:
> From: Rahul Lakkireddy
> > Sent: 20 March 2018 13:32
> ...
> > On High Availability Server, the logs of the failing system must be
> > collected as quickly as possible.  So, we're concerned with the amount
> > of time taken to collect our large on-chip memory.  We see improvement
> > in doing 256-bit reads at a time.
> 
> Two other options:
> 
> 1) Get the device to DMA into host memory.
> 

Unfortunately, our device doesn't support doing DMA of on-chip memory.

> 2) Use mmap() (and vm_iomap_memory() in your driver) to get direct
>    userspace access to the (I assume) PCIe memory space.
>    You can then use whatever copy instructions the cpu has.
>    (Just don't use memcpy().)
> 

We also need to collect this in kernel space i.e. from crash recovery
kernel.

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 13:32     ` Rahul Lakkireddy
  2018-03-20 13:44       ` Andy Shevchenko
  2018-03-20 14:40       ` David Laight
@ 2018-03-20 14:42       ` Alexander Duyck
  2018-03-21 12:28         ` Rahul Lakkireddy
  2018-03-22  1:26         ` Linus Torvalds
  2 siblings, 2 replies; 47+ messages in thread
From: Alexander Duyck @ 2018-03-20 14:42 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: Thomas Gleixner, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tue, Mar 20, 2018 at 6:32 AM, Rahul Lakkireddy
<rahul.lakkireddy@chelsio.com> wrote:
> On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
>> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
>>
>> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
>> > IO read and write.
>>
>> That's not what the patch does. See below.
>>
>> > Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
>> > Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
>>
>> That Signed-off-by chain is wrong....
>>
>> > +#ifdef CONFIG_AS_AVX
>> > +#include <asm/fpu/api.h>
>> > +
>> > +static inline u256 __readqq(const volatile void __iomem *addr)
>> > +{
>> > +   u256 ret;
>> > +
>> > +   kernel_fpu_begin();
>> > +   asm volatile("vmovdqu %0, %%ymm0" :
>> > +                : "m" (*(volatile u256 __force *)addr));
>> > +   asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
>> > +   kernel_fpu_end();
>> > +   return ret;
>>
>> You _cannot_ assume that the instruction is available just because
>> CONFIG_AS_AVX is set. The availability is determined by the runtime
>> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
>>
>
> Ok.  Will add boot_cpu_has(X86_FEATURE_AVX) check as well.
>
>> Aside of that I very much doubt that this is faster than 4 consecutive
>> 64bit reads/writes as you have the full overhead of
>> kernel_fpu_begin()/end() for each access.
>>
>> You did not provide any numbers for this so its even harder to
>> determine.
>>
>
> Sorry about that.  Here are the numbers with and without this series.
>
> When reading up to 2 GB on-chip memory via MMIO, the time taken:
>
> Without Series        With Series
> (64-bit read)         (256-bit read)
>
> 52 seconds            26 seconds
>
> As can be seen, we see good improvement with doing 256-bits at a
> time.

Instead of framing this as an enhanced version of the read/write ops
why not look at replacing or extending something like the
memcpy_fromio or memcpy_toio operations? It would probably be more
comparable to what you are doing if you are wanting to move large
chunks of memory from one region to another, and it should translate
into something like AVX instructions once the CPU optimizations kick
in for a memcpy.

- Alex

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 14:42       ` Alexander Duyck
@ 2018-03-21 12:28         ` Rahul Lakkireddy
  2018-03-22  1:26         ` Linus Torvalds
  1 sibling, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-21 12:28 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Thomas Gleixner, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tuesday, March 03/20/18, 2018 at 20:12:15 +0530, Alexander Duyck wrote:
> On Tue, Mar 20, 2018 at 6:32 AM, Rahul Lakkireddy
> <rahul.lakkireddy@chelsio.com> wrote:
> > On Monday, March 03/19/18, 2018 at 20:13:10 +0530, Thomas Gleixner wrote:
> >> On Mon, 19 Mar 2018, Rahul Lakkireddy wrote:
> >>
> >> > Use VMOVDQU AVX CPU instruction when available to do 256-bit
> >> > IO read and write.
> >>
> >> That's not what the patch does. See below.
> >>
> >> > Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> >> > Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> >>
> >> That Signed-off-by chain is wrong....
> >>
> >> > +#ifdef CONFIG_AS_AVX
> >> > +#include <asm/fpu/api.h>
> >> > +
> >> > +static inline u256 __readqq(const volatile void __iomem *addr)
> >> > +{
> >> > +   u256 ret;
> >> > +
> >> > +   kernel_fpu_begin();
> >> > +   asm volatile("vmovdqu %0, %%ymm0" :
> >> > +                : "m" (*(volatile u256 __force *)addr));
> >> > +   asm volatile("vmovdqu %%ymm0, %0" : "=m" (ret));
> >> > +   kernel_fpu_end();
> >> > +   return ret;
> >>
> >> You _cannot_ assume that the instruction is available just because
> >> CONFIG_AS_AVX is set. The availability is determined by the runtime
> >> evaluated CPU feature flags, i.e. X86_FEATURE_AVX.
> >>
> >
> > Ok.  Will add boot_cpu_has(X86_FEATURE_AVX) check as well.
> >
> >> Aside of that I very much doubt that this is faster than 4 consecutive
> >> 64bit reads/writes as you have the full overhead of
> >> kernel_fpu_begin()/end() for each access.
> >>
> >> You did not provide any numbers for this so its even harder to
> >> determine.
> >>
> >
> > Sorry about that.  Here are the numbers with and without this series.
> >
> > When reading up to 2 GB on-chip memory via MMIO, the time taken:
> >
> > Without Series        With Series
> > (64-bit read)         (256-bit read)
> >
> > 52 seconds            26 seconds
> >
> > As can be seen, we see good improvement with doing 256-bits at a
> > time.
> 
> Instead of framing this as an enhanced version of the read/write ops
> why not look at replacing or extending something like the
> memcpy_fromio or memcpy_toio operations? It would probably be more
> comparable to what you are doing if you are wanting to move large
> chunks of memory from one region to another, and it should translate
> into something like AVX instructions once the CPU optimizations kick
> in for a memcpy.
> 

Ok. Will look into this approach.

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-20 14:42       ` Alexander Duyck
  2018-03-21 12:28         ` Rahul Lakkireddy
@ 2018-03-22  1:26         ` Linus Torvalds
  2018-03-22 10:48           ` David Laight
  1 sibling, 1 reply; 47+ messages in thread
From: Linus Torvalds @ 2018-03-22  1:26 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Rahul Lakkireddy, Thomas Gleixner, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Tue, Mar 20, 2018 at 7:42 AM, Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> Instead of framing this as an enhanced version of the read/write ops
> why not look at replacing or extending something like the
> memcpy_fromio or memcpy_toio operations?

Yes, doing something like "memcpy_fromio_avx()" is much more
palatable, in that it works like the crypto functions do - if you do
big chunks, the "kernel_fpu_begin/end()" isn't nearly the issue it can
be otherwise.

Note that we definitely have seen hardware that *depends* on the
regular memcpy_fromio()" not doing big reads. I don't know how
hardware people screw it up, but it's clearly possible.

So it really needs to be an explicitly named function that basically a
driver can use to say "my hardware really likes big aligned accesses"
and explicitly ask for some AVX version if possible.

                    Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-22  1:26         ` Linus Torvalds
@ 2018-03-22 10:48           ` David Laight
  2018-03-22 17:16             ` Linus Torvalds
  0 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-22 10:48 UTC (permalink / raw)
  To: 'Linus Torvalds', Alexander Duyck
  Cc: Rahul Lakkireddy, Thomas Gleixner, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

From: Linus Torvalds
> Sent: 22 March 2018 01:27
> On Tue, Mar 20, 2018 at 7:42 AM, Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
> >
> > Instead of framing this as an enhanced version of the read/write ops
> > why not look at replacing or extending something like the
> > memcpy_fromio or memcpy_toio operations?
> 
> Yes, doing something like "memcpy_fromio_avx()" is much more
> palatable, in that it works like the crypto functions do - if you do
> big chunks, the "kernel_fpu_begin/end()" isn't nearly the issue it can
> be otherwise.
> 
> Note that we definitely have seen hardware that *depends* on the
> regular memcpy_fromio()" not doing big reads. I don't know how
> hardware people screw it up, but it's clearly possible.

I wonder if that hardware works with the current kernel on recent cpus?
I bet it doesn't like the byte accesses that get generated either.

> So it really needs to be an explicitly named function that basically a
> driver can use to say "my hardware really likes big aligned accesses"
> and explicitly ask for some AVX version if possible.

For x86 being able to request a copy done as 'rep movsx' (for each x)
would be useful.
For io copies the cost of the memory access is probably much smaller
that the io access, so really fancy copies are unlikely make much
difference unless the width of the io access changes.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write
  2018-03-22 10:48           ` David Laight
@ 2018-03-22 17:16             ` Linus Torvalds
  0 siblings, 0 replies; 47+ messages in thread
From: Linus Torvalds @ 2018-03-22 17:16 UTC (permalink / raw)
  To: David Laight
  Cc: Alexander Duyck, Rahul Lakkireddy, Thomas Gleixner,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org, Ganesh GR,
	Nirranjan Kirubaharan, Indranil Choudhury

On Thu, Mar 22, 2018 at 3:48 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Linus Torvalds
>>
>> Note that we definitely have seen hardware that *depends* on the
>> regular memcpy_fromio()" not doing big reads. I don't know how
>> hardware people screw it up, but it's clearly possible.

[ That should have been "big writes" ]

> I wonder if that hardware works with the current kernel on recent cpus?
> I bet it doesn't like the byte accesses that get generated either.

The one case we knew about we just fixed to use the special "16-bit
word at a time" scr_memcpyw().

If I recall correctly, it was some "enterprise graphics console".

If it's something I've found over the years, is that "enterprise"
hardware is absolutely the dregs. It's the low-volume stuff that
almost nobody uses and where the customer is used to the whole notion
of boutique hardware, so they're used to having to do special things
for special hardware.

And I very much use the word "special" in the "my mom told me I was
special" sense. It's not the *good* kind of "special". It's the "short
bus" kind of special.

> For x86 being able to request a copy done as 'rep movsx' (for each x)
> would be useful.

The portability issues are horrendous. Particularly if the memory area
(source or dest) might be unaligned.

The rule of thumb should be: don't use PIO, and if you *do* use PIO,
don't be picky about what you get.

And most *definitely* don't complain about performance to software
people. Blame the hardware people. Get a competent piece of hardware,
or a competent hardware designer.

Let's face it, PIO is stupid. Use it for command and status words. Not
for data transfer.

                 Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [RFC PATCH 3/3] cxgb4: read on-chip memory 256-bits at a time
  2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
  2018-03-19 14:20 ` [RFC PATCH 1/3] include/linux: add 256-bit IO accessors Rahul Lakkireddy
  2018-03-19 14:20 ` [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write Rahul Lakkireddy
@ 2018-03-19 14:20 ` Rahul Lakkireddy
  2018-03-19 14:53 ` [RFC PATCH 0/3] kernel: add support for 256-bit IO access David Laight
  2018-03-19 15:27 ` Christoph Hellwig
  4 siblings, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-19 14:20 UTC (permalink / raw)
  To: x86, linux-kernel, netdev
  Cc: tglx, mingo, hpa, davem, akpm, torvalds, ganeshgr, nirranjan,
	indranil, Rahul Lakkireddy

Use readqq() to read on-chip memory 256-bits at a time.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 16 ++++++++--------
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h     |  6 ++++++
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
index 9da6f57901a9..3a319b16c8a3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
@@ -885,7 +885,7 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,
 	struct adapter *adap = pdbg_init->adap;
 	u32 pos, offset, resid;
 	u32 *res_buf;
-	u64 *buf;
+	u256 *buf;
 	int ret;
 
 	/* Argument sanity checks ...
@@ -893,10 +893,10 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,
 	if (addr & 0x3 || (uintptr_t)hbuf & 0x3)
 		return -EINVAL;
 
-	buf = (u64 *)hbuf;
+	buf = (u256 *)hbuf;
 
-	/* Try to do 64-bit reads.  Residual will be handled later. */
-	resid = len & 0x7;
+	/* Try to do 256-bit reads.  Residual will be handled later. */
+	resid = len & 0x1f;
 	len -= resid;
 
 	ret = t4_memory_rw_init(adap, win, mtype, &memoffset, &mem_base,
@@ -917,10 +917,10 @@ static int cudbg_memory_read(struct cudbg_init *pdbg_init, int win,
 
 	/* Transfer data from the adapter */
 	while (len > 0) {
-		*buf++ = le64_to_cpu((__force __le64)
-				     t4_read_reg64(adap, mem_base + offset));
-		offset += sizeof(u64);
-		len -= sizeof(u64);
+		*buf++ = le256_to_cpu((__force __le256)
+				      t4_read_reg256(adap, mem_base + offset));
+		offset += sizeof(u256);
+		len -= sizeof(u256);
 
 		/* If we've reached the end of our current window aperture,
 		 * move the PCI-E Memory Window on to the next.
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index a5c0a649f3c7..0035ed0a2ec9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -51,6 +51,7 @@
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_classify.h>
 #include <asm/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include "t4_chip_type.h"
 #include "cxgb4_uld.h"
 
@@ -1234,6 +1235,11 @@ static inline u64 t4_read_reg64(struct adapter *adap, u32 reg_addr)
 	return readq(adap->regs + reg_addr);
 }
 
+static inline u256 t4_read_reg256(struct adapter *adap, u32 reg_addr)
+{
+	return readqq(adap->regs + reg_addr);
+}
+
 static inline void t4_write_reg64(struct adapter *adap, u32 reg_addr, u64 val)
 {
 	writeq(val, adap->regs + reg_addr);
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
                   ` (2 preceding siblings ...)
  2018-03-19 14:20 ` [RFC PATCH 3/3] cxgb4: read on-chip memory 256-bits at a time Rahul Lakkireddy
@ 2018-03-19 14:53 ` David Laight
  2018-03-19 15:05   ` Thomas Gleixner
  2018-03-19 15:27 ` Christoph Hellwig
  4 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-19 14:53 UTC (permalink / raw)
  To: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org
  Cc: tglx@linutronix.de, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com

From: Rahul Lakkireddy
> Sent: 19 March 2018 14:21
> 
> This series of patches add support for 256-bit IO read and write.
> The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> and write 256-bits at a time from IO, respectively.

Why not use the AVX2 registers to get 512bit accesses.

> Patch 1 adds u256 type and adds necessary non-atomic accessors.  Also
> adds byteorder conversion APIs.
> 
> Patch 2 adds 256-bit read and write to x86 via VMOVDQU AVX CPU
> instructions.
> 
> Patch 3 updates cxgb4 driver to use the readqq API to speed up
> reading on-chip memory 256-bits at a time.

Calling kernel_fpu_begin() is likely to be slow.
I doubt you want to do it every time around a loop of accesses.

In principle it ought to be possible to get access to one or two
(eg) AVX registers by saving them to stack and telling the fpu
save code where you've put them.
Then the IPI fp save code could then copy the saved values over
the current values if asked to save the fp state for a process.
This should be reasonable cheap - especially if there isn't an
fp save IPI.

OTOH, for x86, if the code always runs in process context (eg from a
system call) then, since the ABI defines them all as caller-saved
the AVX(2) registers, it is only necessary to ensure that the current
FPU registers belong to the current process once.
The registers can be set to zero by an 'invalidate' instruction on
system call entry (hope this is done) and after use.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 14:53 ` [RFC PATCH 0/3] kernel: add support for 256-bit IO access David Laight
@ 2018-03-19 15:05   ` Thomas Gleixner
  2018-03-19 15:19     ` David Laight
  0 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2018-03-19 15:05 UTC (permalink / raw)
  To: David Laight
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com

On Mon, 19 Mar 2018, David Laight wrote:
> From: Rahul Lakkireddy
> In principle it ought to be possible to get access to one or two
> (eg) AVX registers by saving them to stack and telling the fpu
> save code where you've put them.

No. We have functions for this and we are not adding new ad hoc magic.

> OTOH, for x86, if the code always runs in process context (eg from a
> system call) then, since the ABI defines them all as caller-saved
> the AVX(2) registers, it is only necessary to ensure that the current
> FPU registers belong to the current process once.
> The registers can be set to zero by an 'invalidate' instruction on
> system call entry (hope this is done) and after use.

Why would a system call touch the FPU registers? The kernel normally does
not use FPU instructions and the code which explicitely does has to take
care of save/restore. It would be performance madness to fiddle with the
FPU stuff unconditionally if nothing uses it.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:05   ` Thomas Gleixner
@ 2018-03-19 15:19     ` David Laight
  2018-03-19 15:37       ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-19 15:19 UTC (permalink / raw)
  To: 'Thomas Gleixner'
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com

From: Thomas Gleixner
> Sent: 19 March 2018 15:05
> 
> On Mon, 19 Mar 2018, David Laight wrote:
> > From: Rahul Lakkireddy
> > In principle it ought to be possible to get access to one or two
> > (eg) AVX registers by saving them to stack and telling the fpu
> > save code where you've put them.
> 
> No. We have functions for this and we are not adding new ad hoc magic.

I was thinking that a real API might do this...
Useful also for code that needs AVX-like registers to do things like CRCs.

> > OTOH, for x86, if the code always runs in process context (eg from a
> > system call) then, since the ABI defines them all as caller-saved
> > the AVX(2) registers, it is only necessary to ensure that the current
> > FPU registers belong to the current process once.
> > The registers can be set to zero by an 'invalidate' instruction on
> > system call entry (hope this is done) and after use.
> 
> Why would a system call touch the FPU registers? The kernel normally does
> not use FPU instructions and the code which explicitely does has to take
> care of save/restore. It would be performance madness to fiddle with the
> FPU stuff unconditionally if nothing uses it.

If system call entry reset the AVX registers then any FP save/restore
would be faster because the AVX registers wouldn't need to be saved
(and the cpu won't save them).
I believe the instruction to reset the AVX registers is fast.
The AVX registers only ever need saving if the process enters the
kernel through an interrupt.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:19     ` David Laight
@ 2018-03-19 15:37       ` Thomas Gleixner
  2018-03-19 15:53         ` David Laight
  2018-03-20  8:26         ` Ingo Molnar
  0 siblings, 2 replies; 47+ messages in thread
From: Thomas Gleixner @ 2018-03-19 15:37 UTC (permalink / raw)
  To: David Laight
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com

On Mon, 19 Mar 2018, David Laight wrote:
> From: Thomas Gleixner
> > Sent: 19 March 2018 15:05
> > 
> > On Mon, 19 Mar 2018, David Laight wrote:
> > > From: Rahul Lakkireddy
> > > In principle it ought to be possible to get access to one or two
> > > (eg) AVX registers by saving them to stack and telling the fpu
> > > save code where you've put them.
> > 
> > No. We have functions for this and we are not adding new ad hoc magic.
> 
> I was thinking that a real API might do this...

We have a real API and that's good enough for the stuff we have using AVX
in the kernel.

> Useful also for code that needs AVX-like registers to do things like CRCs.

x86/crypto/ has a lot of AVX optimized code.

> > > OTOH, for x86, if the code always runs in process context (eg from a
> > > system call) then, since the ABI defines them all as caller-saved
> > > the AVX(2) registers, it is only necessary to ensure that the current
> > > FPU registers belong to the current process once.
> > > The registers can be set to zero by an 'invalidate' instruction on
> > > system call entry (hope this is done) and after use.
> > 
> > Why would a system call touch the FPU registers? The kernel normally does
> > not use FPU instructions and the code which explicitely does has to take
> > care of save/restore. It would be performance madness to fiddle with the
> > FPU stuff unconditionally if nothing uses it.
> 
> If system call entry reset the AVX registers then any FP save/restore
> would be faster because the AVX registers wouldn't need to be saved
> (and the cpu won't save them).
> I believe the instruction to reset the AVX registers is fast.
> The AVX registers only ever need saving if the process enters the
> kernel through an interrupt.

Wrong. The x8664 ABI clearly states:

   Linux Kernel code is not allowed to change the x87 and SSE units. If
   those are changed by kernel code, they have to be restored properly
   before sleeping or leav- ing the kernel.

That means the syscall interface relies on FPU state being not changed by
the kernel. So if you want to clear AVX on syscall entry you need to save
it first and then restore before returning. That would be a huge
performance hit.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:37       ` Thomas Gleixner
@ 2018-03-19 15:53         ` David Laight
  2018-03-19 16:29           ` Linus Torvalds
  2018-03-20  8:26         ` Ingo Molnar
  1 sibling, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-19 15:53 UTC (permalink / raw)
  To: 'Thomas Gleixner'
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com

From: Thomas Gleixner
> Sent: 19 March 2018 15:37
...
> > If system call entry reset the AVX registers then any FP save/restore
> > would be faster because the AVX registers wouldn't need to be saved
> > (and the cpu won't save them).
> > I believe the instruction to reset the AVX registers is fast.
> > The AVX registers only ever need saving if the process enters the
> > kernel through an interrupt.
> 
> Wrong. The x8664 ABI clearly states:
> 
>    Linux Kernel code is not allowed to change the x87 and SSE units. If
>    those are changed by kernel code, they have to be restored properly
>    before sleeping or leav- ing the kernel.
> 
> That means the syscall interface relies on FPU state being not changed by
> the kernel. So if you want to clear AVX on syscall entry you need to save
> it first and then restore before returning. That would be a huge
> performance hit.

The x87 and SSE registers can't be changed - they can contain callee-saved
registers.
But (IIRC) the AVX and AVX2 registers are all caller-saved.
So the system call entry stub functions are allowed to change them.
Which means that the syscall entry code can also change them.
Of course it must not leak kernel values back to userspace.

It is a few years since I looked at the AVX and fpu save code.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:53         ` David Laight
@ 2018-03-19 16:29           ` Linus Torvalds
  0 siblings, 0 replies; 47+ messages in thread
From: Linus Torvalds @ 2018-03-19 16:29 UTC (permalink / raw)
  To: David Laight
  Cc: Thomas Gleixner, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com

On Mon, Mar 19, 2018 at 8:53 AM, David Laight <David.Laight@aculab.com> wrote:
>
> The x87 and SSE registers can't be changed - they can contain callee-saved
> registers.
> But (IIRC) the AVX and AVX2 registers are all caller-saved.

No.

The kernel entry is not the usual function call.

On kernel entry, *all* registers are callee-saved.

Of course, some may be return values, and I have a slight hope that I
can trash %eflags. But basically, a system call is simply not a
function call. We have different calling conventions on the argument
side too. In fact, the arguments are in different registers depending
on just *which* system call you take.

                  Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:37       ` Thomas Gleixner
  2018-03-19 15:53         ` David Laight
@ 2018-03-20  8:26         ` Ingo Molnar
  2018-03-20  8:38           ` Thomas Gleixner
                             ` (2 more replies)
  1 sibling, 3 replies; 47+ messages in thread
From: Ingo Molnar @ 2018-03-20  8:26 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: David Laight, 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Thomas Gleixner, Fenghua Yu,
	Eric Biggers

* Thomas Gleixner <tglx@linutronix.de> wrote:

> > Useful also for code that needs AVX-like registers to do things like CRCs.
> 
> x86/crypto/ has a lot of AVX optimized code.

Yeah, that's true, but the crypto code is processing fundamentally bigger blocks 
of data, which amortizes the cost of using kernel_fpu_begin()/_end().

kernel_fpu_begin()/_end() is a pretty heavy operation because it does a full FPU 
save/restore via the XSAVE[S] and XRSTOR[S] instructions, which can easily copy a 
thousand bytes around! So kernel_fpu_begin()/_end() is probably a non-starter for 
something small, like a single 256-bit or 512-bit word access.

But there's actually a new thing in modern kernels: we got rid of (most of) lazy 
save/restore FPU code, our new x86 FPU model is very "direct" with no FPU faults 
taken normally.

So assuming the target driver will only load on modern FPUs I *think* it should 
actually be possible to do something like (pseudocode):

	vmovdqa %ymm0, 40(%rsp)
	vmovdqa %ymm1, 80(%rsp)

	...
	# use ymm0 and ymm1
	...

	vmovdqa 80(%rsp), %ymm1
	vmovdqa 40(%rsp), %ymm0

... without using the heavy XSAVE/XRSTOR instructions.

Note that preemption probably still needs to be disabled and possibly there are 
other details as well, but there should be no 'heavy' FPU operations.

I think this should still preserve all user-space FPU state and shouldn't muck up 
any 'weird' user-space FPU state (such as pending exceptions, legacy x87 running 
code, NaN registers or weird FPU control word settings) we might have interrupted 
either.

But I could be wrong, it should be checked whether this sequence is safe. 
Worst-case we might have to save/restore the FPU control and tag words - but those 
operations should still be much faster than a full XSAVE/XRSTOR pair.

So I do think we could do more in this area to improve driver performance, if the 
code is correct and if there's actual benchmarks that are showing real benefits.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  8:26         ` Ingo Molnar
@ 2018-03-20  8:38           ` Thomas Gleixner
  2018-03-20  9:08             ` Ingo Molnar
  2018-03-20 14:57           ` Andy Lutomirski
  2018-03-20 18:01           ` Linus Torvalds
  2 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2018-03-20  8:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Laight, 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers

On Tue, 20 Mar 2018, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > > Useful also for code that needs AVX-like registers to do things like CRCs.
> > 
> > x86/crypto/ has a lot of AVX optimized code.
> 
> Yeah, that's true, but the crypto code is processing fundamentally bigger blocks 
> of data, which amortizes the cost of using kernel_fpu_begin()/_end().

Correct.

> So assuming the target driver will only load on modern FPUs I *think* it should 
> actually be possible to do something like (pseudocode):
> 
> 	vmovdqa %ymm0, 40(%rsp)
> 	vmovdqa %ymm1, 80(%rsp)
> 
> 	...
> 	# use ymm0 and ymm1
> 	...
> 
> 	vmovdqa 80(%rsp), %ymm1
> 	vmovdqa 40(%rsp), %ymm0
> 
> ... without using the heavy XSAVE/XRSTOR instructions.
> 
> Note that preemption probably still needs to be disabled and possibly there are 
> other details as well, but there should be no 'heavy' FPU operations.

Emphasis on should :)

> I think this should still preserve all user-space FPU state and shouldn't muck up 
> any 'weird' user-space FPU state (such as pending exceptions, legacy x87 running 
> code, NaN registers or weird FPU control word settings) we might have interrupted 
> either.
> 
> But I could be wrong, it should be checked whether this sequence is safe. 
> Worst-case we might have to save/restore the FPU control and tag words - but those 
> operations should still be much faster than a full XSAVE/XRSTOR pair.

Fair enough.

> So I do think we could do more in this area to improve driver performance, if the 
> code is correct and if there's actual benchmarks that are showing real benefits.

If it's about hotpath performance I'm all for it, but the use case here is
a debug facility...

And if we go down that road then we want a AVX based memcpy()
implementation which is runtime conditional on the feature bit(s) and
length dependent. Just slapping a readqq() at it and use it in a loop does
not make any sense.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  8:38           ` Thomas Gleixner
@ 2018-03-20  9:08             ` Ingo Molnar
  2018-03-20  9:41               ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: Ingo Molnar @ 2018-03-20  9:08 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: David Laight, 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers


* Thomas Gleixner <tglx@linutronix.de> wrote:

> > So I do think we could do more in this area to improve driver performance, if the 
> > code is correct and if there's actual benchmarks that are showing real benefits.
> 
> If it's about hotpath performance I'm all for it, but the use case here is
> a debug facility...
> 
> And if we go down that road then we want a AVX based memcpy()
> implementation which is runtime conditional on the feature bit(s) and
> length dependent. Just slapping a readqq() at it and use it in a loop does
> not make any sense.

Yeah, so generic memcpy() replacement is only feasible I think if the most 
optimistic implementation is actually correct:

 - if no preempt disable()/enable() is required

 - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
   any fashion

 - if direct access to the AVX[2] registers cannot raise weird exceptions or have
   weird behavior if the FPU control word is modified to non-standard values by 
   untrusted user-space

If we have to touch the FPU tag or control words then it's probably only good for 
a specialized API.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  9:08             ` Ingo Molnar
@ 2018-03-20  9:41               ` Thomas Gleixner
  2018-03-20  9:59                 ` David Laight
  2018-03-20 10:54                 ` Ingo Molnar
  0 siblings, 2 replies; 47+ messages in thread
From: Thomas Gleixner @ 2018-03-20  9:41 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Laight, 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers

On Tue, 20 Mar 2018, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > > So I do think we could do more in this area to improve driver performance, if the 
> > > code is correct and if there's actual benchmarks that are showing real benefits.
> > 
> > If it's about hotpath performance I'm all for it, but the use case here is
> > a debug facility...
> > 
> > And if we go down that road then we want a AVX based memcpy()
> > implementation which is runtime conditional on the feature bit(s) and
> > length dependent. Just slapping a readqq() at it and use it in a loop does
> > not make any sense.
> 
> Yeah, so generic memcpy() replacement is only feasible I think if the most 
> optimistic implementation is actually correct:
> 
>  - if no preempt disable()/enable() is required
> 
>  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
>    any fashion
> 
>  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
>    weird behavior if the FPU control word is modified to non-standard values by 
>    untrusted user-space
> 
> If we have to touch the FPU tag or control words then it's probably only good for 
> a specialized API.

I did not mean to have a general memcpy replacement. Rather something like
magic_memcpy() which falls back to memcpy when AVX is not usable or the
length does not justify the AVX stuff at all.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  9:41               ` Thomas Gleixner
@ 2018-03-20  9:59                 ` David Laight
  2018-03-20 10:54                 ` Ingo Molnar
  1 sibling, 0 replies; 47+ messages in thread
From: David Laight @ 2018-03-20  9:59 UTC (permalink / raw)
  To: 'Thomas Gleixner', Ingo Molnar
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers

From: Thomas Gleixner
> Sent: 20 March 2018 09:41
> On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
...
> > > And if we go down that road then we want a AVX based memcpy()
> > > implementation which is runtime conditional on the feature bit(s) and
> > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > not make any sense.
> >
> > Yeah, so generic memcpy() replacement is only feasible I think if the most
> > optimistic implementation is actually correct:
> >
> >  - if no preempt disable()/enable() is required
> >
> >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in
> >    any fashion
> >
> >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> >    weird behavior if the FPU control word is modified to non-standard values by
> >    untrusted user-space
> >
> > If we have to touch the FPU tag or control words then it's probably only good for
> > a specialized API.
> 
> I did not mean to have a general memcpy replacement. Rather something like
> magic_memcpy() which falls back to memcpy when AVX is not usable or the
> length does not justify the AVX stuff at all.

There is probably no point for memcpy().

Where it would make a big difference is memcpy_fromio() for PCIe devices
(where longer TLP make a big difference).
But any code belongs in its implementation not in every driver.
The implementation of memcpy_toio() is nothing like as critical.

If might be the code would need to fallback to 64bit accesses
if the AVX(2) registers can't currently be accessed - maybe some
obscure state....

However memcpy_to/fromio() are both horrid at the moment because
they result in byte copies!

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  9:41               ` Thomas Gleixner
  2018-03-20  9:59                 ` David Laight
@ 2018-03-20 10:54                 ` Ingo Molnar
  2018-03-20 13:30                   ` David Laight
  2018-04-03  8:49                   ` Pavel Machek
  1 sibling, 2 replies; 47+ messages in thread
From: Ingo Molnar @ 2018-03-20 10:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: David Laight, 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > 
> > > > So I do think we could do more in this area to improve driver performance, if the 
> > > > code is correct and if there's actual benchmarks that are showing real benefits.
> > > 
> > > If it's about hotpath performance I'm all for it, but the use case here is
> > > a debug facility...
> > > 
> > > And if we go down that road then we want a AVX based memcpy()
> > > implementation which is runtime conditional on the feature bit(s) and
> > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > not make any sense.
> > 
> > Yeah, so generic memcpy() replacement is only feasible I think if the most 
> > optimistic implementation is actually correct:
> > 
> >  - if no preempt disable()/enable() is required
> > 
> >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
> >    any fashion
> > 
> >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> >    weird behavior if the FPU control word is modified to non-standard values by 
> >    untrusted user-space
> > 
> > If we have to touch the FPU tag or control words then it's probably only good for 
> > a specialized API.
> 
> I did not mean to have a general memcpy replacement. Rather something like
> magic_memcpy() which falls back to memcpy when AVX is not usable or the
> length does not justify the AVX stuff at all.

OK, fair enough.

Note that a generic version might still be worth trying out, if and only if it's 
safe to access those vector registers directly: modern x86 CPUs will do their 
non-constant memcpy()s via the common memcpy_erms() function - which could in 
theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if 
size (and alignment, perhaps) is a multiple of 32 bytes or so.

Assuming it's correct with arbitrary user-space FPU state and if it results in any 
measurable speedups, which might not be the case: ERMS is supposed to be very 
fast.

So even if it's possible (which it might not be), it could end up being slower 
than the ERMS version.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 10:54                 ` Ingo Molnar
@ 2018-03-20 13:30                   ` David Laight
  2018-04-03  8:49                   ` Pavel Machek
  1 sibling, 0 replies; 47+ messages in thread
From: David Laight @ 2018-03-20 13:30 UTC (permalink / raw)
  To: 'Ingo Molnar', Thomas Gleixner
  Cc: 'Rahul Lakkireddy', x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers

From: Ingo Molnar
> Sent: 20 March 2018 10:54
...
> Note that a generic version might still be worth trying out, if and only if it's
> safe to access those vector registers directly: modern x86 CPUs will do their
> non-constant memcpy()s via the common memcpy_erms() function - which could in
> theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if
> size (and alignment, perhaps) is a multiple of 32 bytes or so.
> 
> Assuming it's correct with arbitrary user-space FPU state and if it results in any
> measurable speedups, which might not be the case: ERMS is supposed to be very
> fast.
> 
> So even if it's possible (which it might not be), it could end up being slower
> than the ERMS version.

Last I checked memcpy() was implemented as 'rep movsb' on the latest Intel cpus.
Since memcpy_to/fromio() get aliased to memcpy() this generates byte copies.
The previous 'fastest' version of memcpy() was ok for uncached locations.

For PCIe I suspect that the actual instructions don't make a massive difference.
I'm not even sure interleaving two transfers makes any difference.
What makes a huge difference for memcpy_fromio() is the size of the register.
The time taken for a read will be largely independent of the width of the
register used.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 10:54                 ` Ingo Molnar
  2018-03-20 13:30                   ` David Laight
@ 2018-04-03  8:49                   ` Pavel Machek
  2018-04-03 10:36                     ` Ingo Molnar
  1 sibling, 1 reply; 47+ messages in thread
From: Pavel Machek @ 2018-04-03  8:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, David Laight, 'Rahul Lakkireddy',
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

[-- Attachment #1: Type: text/plain, Size: 2512 bytes --]

Hi!

> > On Tue, 20 Mar 2018, Ingo Molnar wrote:
> > > * Thomas Gleixner <tglx@linutronix.de> wrote:
> > > 
> > > > > So I do think we could do more in this area to improve driver performance, if the 
> > > > > code is correct and if there's actual benchmarks that are showing real benefits.
> > > > 
> > > > If it's about hotpath performance I'm all for it, but the use case here is
> > > > a debug facility...
> > > > 
> > > > And if we go down that road then we want a AVX based memcpy()
> > > > implementation which is runtime conditional on the feature bit(s) and
> > > > length dependent. Just slapping a readqq() at it and use it in a loop does
> > > > not make any sense.
> > > 
> > > Yeah, so generic memcpy() replacement is only feasible I think if the most 
> > > optimistic implementation is actually correct:
> > > 
> > >  - if no preempt disable()/enable() is required
> > > 
> > >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
> > >    any fashion
> > > 
> > >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > >    weird behavior if the FPU control word is modified to non-standard values by 
> > >    untrusted user-space
> > > 
> > > If we have to touch the FPU tag or control words then it's probably only good for 
> > > a specialized API.
> > 
> > I did not mean to have a general memcpy replacement. Rather something like
> > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > length does not justify the AVX stuff at all.
> 
> OK, fair enough.
> 
> Note that a generic version might still be worth trying out, if and only if it's 
> safe to access those vector registers directly: modern x86 CPUs will do their 
> non-constant memcpy()s via the common memcpy_erms() function - which could in 
> theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if 
> size (and alignment, perhaps) is a multiple of 32 bytes or so.

How is AVX2 supposed to help the memcpy speed?

If the copy is small, constant overhead will dominate, and I don't
think AVX2 is going to be win there.

If the copy is big, well, the copy loop will likely run out of L1 and
maybe even out of L2, and at that point speed of the loop does not
matter because memory is slow...?

Best regards,
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-04-03  8:49                   ` Pavel Machek
@ 2018-04-03 10:36                     ` Ingo Molnar
  0 siblings, 0 replies; 47+ messages in thread
From: Ingo Molnar @ 2018-04-03 10:36 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Thomas Gleixner, David Laight, 'Rahul Lakkireddy',
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers


* Pavel Machek <pavel@ucw.cz> wrote:

> > > > Yeah, so generic memcpy() replacement is only feasible I think if the most 
> > > > optimistic implementation is actually correct:
> > > > 
> > > >  - if no preempt disable()/enable() is required
> > > > 
> > > >  - if direct access to the AVX[2] registers does not disturb legacy FPU state in 
> > > >    any fashion
> > > > 
> > > >  - if direct access to the AVX[2] registers cannot raise weird exceptions or have
> > > >    weird behavior if the FPU control word is modified to non-standard values by 
> > > >    untrusted user-space
> > > > 
> > > > If we have to touch the FPU tag or control words then it's probably only good for 
> > > > a specialized API.
> > > 
> > > I did not mean to have a general memcpy replacement. Rather something like
> > > magic_memcpy() which falls back to memcpy when AVX is not usable or the
> > > length does not justify the AVX stuff at all.
> > 
> > OK, fair enough.
> > 
> > Note that a generic version might still be worth trying out, if and only if it's 
> > safe to access those vector registers directly: modern x86 CPUs will do their 
> > non-constant memcpy()s via the common memcpy_erms() function - which could in 
> > theory be an easy common point to be (cpufeatures-) patched to an AVX2 variant, if 
> > size (and alignment, perhaps) is a multiple of 32 bytes or so.
> 
> How is AVX2 supposed to help the memcpy speed?
> 
> If the copy is small, constant overhead will dominate, and I don't
> think AVX2 is going to be win there.

There are several advantages:

1)

"REP; MOVS" (also called ERMS) has a significant constant "setup cost".

In the scheme I suggested (and if it's possible) then single-register AVX2 access 
on the other hand has a setup cost on the "few cycles" order of magnitude.

2)

AVX2 have various non-temporary load and store behavioral variants - while "REP; 
MOVS" doesn't (or rather, any such caching optimizations, to the extent they 
exist, are hidden in the microcode).

> If the copy is big, well, the copy loop will likely run out of L1 and maybe even 
> out of L2, and at that point speed of the loop does not matter because memory is 
> slow...?

In many cases "memory" will be something very fast, such as another level of 
cache. Also, on NUMA "memory" can also be something locally wired to the CPU - 
again accessible at ridiculous bandwidths.

Nevertheless ERMS is probably wins for the regular bulk memcpy by a few percentage 
points, so I don't think AVX2 is a win in the generic large-memcpy case, as long 
as continued caching of both the loads and the stores is beneficial.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  8:26         ` Ingo Molnar
  2018-03-20  8:38           ` Thomas Gleixner
@ 2018-03-20 14:57           ` Andy Lutomirski
  2018-03-20 15:10             ` David Laight
  2018-03-20 18:01           ` Linus Torvalds
  2 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2018-03-20 14:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers,
	Rik van Riel

On Tue, Mar 20, 2018 at 8:26 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Thomas Gleixner <tglx@linutronix.de> wrote:
>
>> > Useful also for code that needs AVX-like registers to do things like CRCs.
>>
>> x86/crypto/ has a lot of AVX optimized code.
>
> Yeah, that's true, but the crypto code is processing fundamentally bigger blocks
> of data, which amortizes the cost of using kernel_fpu_begin()/_end().
>
> kernel_fpu_begin()/_end() is a pretty heavy operation because it does a full FPU
> save/restore via the XSAVE[S] and XRSTOR[S] instructions, which can easily copy a
> thousand bytes around! So kernel_fpu_begin()/_end() is probably a non-starter for
> something small, like a single 256-bit or 512-bit word access.
>
> But there's actually a new thing in modern kernels: we got rid of (most of) lazy
> save/restore FPU code, our new x86 FPU model is very "direct" with no FPU faults
> taken normally.
>
> So assuming the target driver will only load on modern FPUs I *think* it should
> actually be possible to do something like (pseudocode):
>
>         vmovdqa %ymm0, 40(%rsp)
>         vmovdqa %ymm1, 80(%rsp)
>
>         ...
>         # use ymm0 and ymm1
>         ...
>
>         vmovdqa 80(%rsp), %ymm1
>         vmovdqa 40(%rsp), %ymm0
>

I think this kinda sorts works, but only kinda sorta:

 - I'm a bit worried about newer CPUs where, say, a 256-bit vector
operation will implicitly clear the high 768 bits of the regs.  (IIRC
that's how it works for the new VEX stuff.)

 - This code will cause XINUSE to be set, which is user-visible and
may have performance and correctness effects.  I think the kernel may
already have some but where it occasionally sets XINUSE on its own,
and this caused problems for rr in the past.  This might not be a
total showstopper, but it's odd.

I'd rather see us finally finish the work that Rik started to rework
this differently.  I'd like kernel_fpu_begin() to look like:

if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
  return; // we're already okay.  maybe we need to check
in_interrupt() or something, though?
} else {
  XSAVES/XSAVEOPT/XSAVE;
  set_thread_flag(TIF_NEED_FPU_RESTORE):
}

and kernel_fpu_end() does nothing at all.

We take the full performance hit for a *single* kernel_fpu_begin() on
an otherwise short syscall or interrupt, but there's no additional
cost for more of them or for long-enough-running things that we
schedule in the middle.

As I remember, the main hangup was that this interacts a bit oddly
with PKRU, but that's manageable.

--Andy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 14:57           ` Andy Lutomirski
@ 2018-03-20 15:10             ` David Laight
  2018-03-21  0:39               ` Andy Lutomirski
  0 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-20 15:10 UTC (permalink / raw)
  To: 'Andy Lutomirski', Ingo Molnar
  Cc: Thomas Gleixner, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, torvalds@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Peter Zijlstra, Fenghua Yu, Eric Biggers, Rik van Riel

From: Andy Lutomirski
> Sent: 20 March 2018 14:57
...
> I'd rather see us finally finish the work that Rik started to rework
> this differently.  I'd like kernel_fpu_begin() to look like:
> 
> if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
>   return; // we're already okay.  maybe we need to check
> in_interrupt() or something, though?
> } else {
>   XSAVES/XSAVEOPT/XSAVE;
>   set_thread_flag(TIF_NEED_FPU_RESTORE):
> }
> 
> and kernel_fpu_end() does nothing at all.

I guess it might need to set (clear?) the CFLAGS bit for a process
that isn't using the fpu at all - which seems a sensible feature.
 
> We take the full performance hit for a *single* kernel_fpu_begin() on
> an otherwise short syscall or interrupt, but there's no additional
> cost for more of them or for long-enough-running things that we
> schedule in the middle.

It might be worth adding a parameter to kernel_fpu_begin() to indicate
which registers are needed, and a return value to say what has been
granted.
Then a driver could request AVX2 (for example) and use a fallback path
if the register set isn't available (for any reason).
A call from an ISR could always fail.

> As I remember, the main hangup was that this interacts a bit oddly
> with PKRU, but that's manageable.

WTF PKRU ??

	Dvaid

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 15:10             ` David Laight
@ 2018-03-21  0:39               ` Andy Lutomirski
  0 siblings, 0 replies; 47+ messages in thread
From: Andy Lutomirski @ 2018-03-21  0:39 UTC (permalink / raw)
  To: David Laight
  Cc: Andy Lutomirski, Ingo Molnar, Thomas Gleixner, Rahul Lakkireddy,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Peter Zijlstra,
	Fenghua Yu, Eric Biggers, Rik van Riel

On Tue, Mar 20, 2018 at 3:10 PM, David Laight <David.Laight@aculab.com> wrote:
> From: Andy Lutomirski
>> Sent: 20 March 2018 14:57
> ...
>> I'd rather see us finally finish the work that Rik started to rework
>> this differently.  I'd like kernel_fpu_begin() to look like:
>>
>> if (test_thread_flag(TIF_NEED_FPU_RESTORE)) {
>>   return; // we're already okay.  maybe we need to check
>> in_interrupt() or something, though?
>> } else {
>>   XSAVES/XSAVEOPT/XSAVE;
>>   set_thread_flag(TIF_NEED_FPU_RESTORE):
>> }
>>
>> and kernel_fpu_end() does nothing at all.
>
> I guess it might need to set (clear?) the CFLAGS bit for a process
> that isn't using the fpu at all - which seems a sensible feature.

What do you mean "CFLAGS"?

But we no longer have any concept of "isn't using the fpu at all" --
we got rid of that.

>
>> We take the full performance hit for a *single* kernel_fpu_begin() on
>> an otherwise short syscall or interrupt, but there's no additional
>> cost for more of them or for long-enough-running things that we
>> schedule in the middle.
>
> It might be worth adding a parameter to kernel_fpu_begin() to indicate
> which registers are needed, and a return value to say what has been
> granted.
> Then a driver could request AVX2 (for example) and use a fallback path
> if the register set isn't available (for any reason).
> A call from an ISR could always fail.

Last time I benchmarked it, XSAVEC on none of the state wasn't a whole
lot faster than XSAVEC for all of it.

>
>> As I remember, the main hangup was that this interacts a bit oddly
>> with PKRU, but that's manageable.
>
> WTF PKRU ??

PKRU is uniquely demented.  All the rest of the XSAVEC state only
affects code that explicitly references that state.  PKRU affects
every single access to user pages, so we need PKRU to match the
current task at all times in the kernel.  This means that, if we start
deferring XRSTORS until prepare_exit_to_usermode(), we need to start
restoring PKRU using WRPKRU in __switch_to().  Of course, *that*
interacts a bit oddly with XINUSE, but maybe we don't care.

Phooey on you, Intel, for putting PKRU into xstate and not giving a
fast instruction to control XINUSE.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20  8:26         ` Ingo Molnar
  2018-03-20  8:38           ` Thomas Gleixner
  2018-03-20 14:57           ` Andy Lutomirski
@ 2018-03-20 18:01           ` Linus Torvalds
  2018-03-21  6:32             ` Ingo Molnar
  2018-03-21  7:46             ` Ingo Molnar
  2 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2018-03-20 18:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

On Tue, Mar 20, 2018 at 1:26 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> So assuming the target driver will only load on modern FPUs I *think* it should
> actually be possible to do something like (pseudocode):
>
>         vmovdqa %ymm0, 40(%rsp)
>         vmovdqa %ymm1, 80(%rsp)
>
>         ...
>         # use ymm0 and ymm1
>         ...
>
>         vmovdqa 80(%rsp), %ymm1
>         vmovdqa 40(%rsp), %ymm0
>
> ... without using the heavy XSAVE/XRSTOR instructions.

No. The above is buggy. It may *work*, but it won't work in the long run.

Pretty much every single vector extension has traditionally meant that
touching "old" registers breaks any new register use. Even if you save
the registers carefully like in your example code, it will do magic
and random things to the "future extended" version.

So I absolutely *refuse* to have anything to do with the vector unit.
You can only touch it in the kernel if you own it entirely (ie that
"kernel_fpu_begin()/_end()" thing). Anything else is absolutely
guaranteed to cause problems down the line.

And even if you ignore that "maintenance problems down the line" issue
("we can fix them when they happen") I don't want to see games like
this, because I'm pretty sure it breaks the optimized xsave by tagging
the state as being dirty.

So no. Don't use vector stuff in the kernel. It's not worth the pain.

The *only* valid use is pretty much crypto, and even there it has had
issues. Benchmarks use big arrays and/or dense working sets etc to
"prove" how good the vector version is, and then you end up in
situations where it's used once per fairly small packet for an
interrupt, and it's actually much worse than doing it by hand.

               Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 18:01           ` Linus Torvalds
@ 2018-03-21  6:32             ` Ingo Molnar
  2018-03-21 15:45               ` Andy Lutomirski
  2018-03-21  7:46             ` Ingo Molnar
  1 sibling, 1 reply; 47+ messages in thread
From: Ingo Molnar @ 2018-03-21  6:32 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> And even if you ignore that "maintenance problems down the line" issue
> ("we can fix them when they happen") I don't want to see games like
> this, because I'm pretty sure it breaks the optimized xsave by tagging
> the state as being dirty.

That's true - and it would penalize the context switch cost of the affected task 
for the rest of its lifetime, as I don't think there's much that clears XINUSE 
other than a FINIT, which is rarely done by user-space.

> So no. Don't use vector stuff in the kernel. It's not worth the pain.

I agree, but:

> The *only* valid use is pretty much crypto, and even there it has had issues. 
> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the 
> vector version is, and then you end up in situations where it's used once per 
> fairly small packet for an interrupt, and it's actually much worse than doing it 
> by hand.

That's mainly because the XSAVE/XRESTOR done by kernel_fpu_begin()/end() is so 
expensive, so this argument is somewhat circular.

IFF it was safe to just use the vector unit then vector unit based crypto would be 
very fast for small buffer as well, and would be even faster for larger buffer 
sizes as well. Saving and restoring up to ~1.5K of context is not cheap.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-21  6:32             ` Ingo Molnar
@ 2018-03-21 15:45               ` Andy Lutomirski
  2018-03-22  9:36                 ` Ingo Molnar
  0 siblings, 1 reply; 47+ messages in thread
From: Andy Lutomirski @ 2018-03-21 15:45 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, David Laight, Rahul Lakkireddy,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers

On Wed, Mar 21, 2018 at 6:32 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
>> And even if you ignore that "maintenance problems down the line" issue
>> ("we can fix them when they happen") I don't want to see games like
>> this, because I'm pretty sure it breaks the optimized xsave by tagging
>> the state as being dirty.
>
> That's true - and it would penalize the context switch cost of the affected task
> for the rest of its lifetime, as I don't think there's much that clears XINUSE
> other than a FINIT, which is rarely done by user-space.
>
>> So no. Don't use vector stuff in the kernel. It's not worth the pain.
>
> I agree, but:
>
>> The *only* valid use is pretty much crypto, and even there it has had issues.
>> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the
>> vector version is, and then you end up in situations where it's used once per
>> fairly small packet for an interrupt, and it's actually much worse than doing it
>> by hand.
>
> That's mainly because the XSAVE/XRESTOR done by kernel_fpu_begin()/end() is so
> expensive, so this argument is somewhat circular.

If we do the deferred restore, then the XSAVE/XRSTOR happens at most
once per kernel entry, which isn't so bad IMO.  Also, with PTI, kernel
entries are already so slow that this will be mostly in the noise :(

--Andy

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-21 15:45               ` Andy Lutomirski
@ 2018-03-22  9:36                 ` Ingo Molnar
  0 siblings, 0 replies; 47+ messages in thread
From: Ingo Molnar @ 2018-03-22  9:36 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Linus Torvalds, Thomas Gleixner, David Laight, Rahul Lakkireddy,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Peter Zijlstra, Fenghua Yu, Eric Biggers


* Andy Lutomirski <luto@kernel.org> wrote:

> On Wed, Mar 21, 2018 at 6:32 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> >
> >> And even if you ignore that "maintenance problems down the line" issue
> >> ("we can fix them when they happen") I don't want to see games like
> >> this, because I'm pretty sure it breaks the optimized xsave by tagging
> >> the state as being dirty.
> >
> > That's true - and it would penalize the context switch cost of the affected task
> > for the rest of its lifetime, as I don't think there's much that clears XINUSE
> > other than a FINIT, which is rarely done by user-space.
> >
> >> So no. Don't use vector stuff in the kernel. It's not worth the pain.
> >
> > I agree, but:
> >
> >> The *only* valid use is pretty much crypto, and even there it has had issues.
> >> Benchmarks use big arrays and/or dense working sets etc to "prove" how good the
> >> vector version is, and then you end up in situations where it's used once per
> >> fairly small packet for an interrupt, and it's actually much worse than doing it
> >> by hand.
> >
> > That's mainly because the XSAVE/XRESTOR done by kernel_fpu_begin()/end() is so
> > expensive, so this argument is somewhat circular.
> 
> If we do the deferred restore, then the XSAVE/XRSTOR happens at most
> once per kernel entry, which isn't so bad IMO.  Also, with PTI, kernel
> entries are already so slow that this will be mostly in the noise :(

For performance/scalability work we should just ignore the PTI overhead: it 
doesn't exist on AMD CPUs and Intel has announced Meltdown-fixed CPUs, to be 
released later this year:

   https://www.anandtech.com/show/12533/intel-spectre-meltdown

By the time any kernel changes we are talking about today get to distros and users 
the newest hardware won't have the Meltdown bug.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-20 18:01           ` Linus Torvalds
  2018-03-21  6:32             ` Ingo Molnar
@ 2018-03-21  7:46             ` Ingo Molnar
  2018-03-21 18:15               ` Linus Torvalds
  1 sibling, 1 reply; 47+ messages in thread
From: Ingo Molnar @ 2018-03-21  7:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

So I poked around a bit and I'm having second thoughts:

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Tue, Mar 20, 2018 at 1:26 AM, Ingo Molnar <mingo@kernel.org> wrote:
> >
> > So assuming the target driver will only load on modern FPUs I *think* it should
> > actually be possible to do something like (pseudocode):
> >
> >         vmovdqa %ymm0, 40(%rsp)
> >         vmovdqa %ymm1, 80(%rsp)
> >
> >         ...
> >         # use ymm0 and ymm1
> >         ...
> >
> >         vmovdqa 80(%rsp), %ymm1
> >         vmovdqa 40(%rsp), %ymm0
> >
> > ... without using the heavy XSAVE/XRSTOR instructions.
> 
> No. The above is buggy. It may *work*, but it won't work in the long run.
> 
> Pretty much every single vector extension has traditionally meant that
> touching "old" registers breaks any new register use. Even if you save
> the registers carefully like in your example code, it will do magic
> and random things to the "future extended" version.

This should be relatively straightforward to solve via a proper CPU features 
check: for example by only patching in the AVX routines for 'known compatible' 
fpu_kernel_xstate_size values. Future extensions of register width will extend
the XSAVE area.

It's not fool-proof: in theory there could be semantic extensions to the vector 
unit that does not increase the size of the context - but the normal pattern is to 
increase the number of XINUSE bits and bump up the maximum context area size.

If that's a worry then an even safer compatibility check would be to explicitly 
list CPU models - we do track them pretty accurately anyway these days, mostly due 
to perf PMU support defaulting to a safe but dumb variant if a CPU model is not 
specifically listed.

That method, although more maintenance-intense, should be pretty fool-proof 
AFAICS.

> So I absolutely *refuse* to have anything to do with the vector unit.
> You can only touch it in the kernel if you own it entirely (ie that
> "kernel_fpu_begin()/_end()" thing). Anything else is absolutely
> guaranteed to cause problems down the line.
> 
> And even if you ignore that "maintenance problems down the line" issue
> ("we can fix them when they happen") I don't want to see games like
> this, because I'm pretty sure it breaks the optimized xsave by tagging
> the state as being dirty.

So I added a bit of instrumentation and the current state of things is that on 
64-bit x86 every single task has an initialized FPU, every task has the exact 
same, fully filled in xfeatures (XINUSE) value:

 [root@galatea ~]# grep -h fpu /proc/*/task/*/fpu | sort | uniq -c
    504 x86/fpu: initialized                             :                    1
    504 x86/fpu: xfeatures_mask                          :                    7

So our latest FPU model is *really* simple and user-space should not be able to 
observe any changes in the XINUSE bits of the XSAVE header, because (at least for 
the basic vector CPU features) all bits are maxed out all the time.

Note that this is with an AVX (128-bit) supporting CPU:

[    0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[    0.000000] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[    0.000000] x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, using 'standard' format.

But note that it probably wouldn't make sense to make use of XINUSE optimizations 
on most systems for the AVX space, as glibc will use the highest-bitness vector 
ops for its regular memcpy(), and every user task makes use of memcpy.

It does make sense for some of the more optional XSAVE based features such as 
pkeys. But I don't have any newer Intel system with a wider xsave feature set to 
check.

> So no. Don't use vector stuff in the kernel. It's not worth the pain.

That might still be true, but still I'm torn:

 - Broad areas of user-space has seemlessly integrated vector ops and is using 
   them all the time they can find an excuse to use them.

 - The vector registers are fundamentally callee-saved, so in most synchronous 
   calls the vector unit registers are unused. Asynchronous interruptions of 
   context (interrupts, faults, preemption, etc.) can still use them as well, as 
   long as they save/restore register contents.

So other than Intel not making it particularly easy to make a forwards compatible 
vector register granular save/restore pattern (but see above for how we could 
handle that) for asynchronous contexts, I don't see too many other complications.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-21  7:46             ` Ingo Molnar
@ 2018-03-21 18:15               ` Linus Torvalds
  2018-03-22  9:33                 ` Ingo Molnar
  2018-03-22 10:35                 ` David Laight
  0 siblings, 2 replies; 47+ messages in thread
From: Linus Torvalds @ 2018-03-21 18:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

On Wed, Mar 21, 2018 at 12:46 AM, Ingo Molnar <mingo@kernel.org> wrote:
>
> So I added a bit of instrumentation and the current state of things is that on
> 64-bit x86 every single task has an initialized FPU, every task has the exact
> same, fully filled in xfeatures (XINUSE) value:

Bah. Your CPU is apparently some old crud that barely has AVX. Of
course all those features are enabled.

> Note that this is with an AVX (128-bit) supporting CPU:

That's weak even by modern standards.

I have MPX bounds and MPX CSR on my old Skylake.

And the real worry is things like AVX-512 etc, which is exactly when
things like "save and restore one ymm register" will quite likely
clear the upper bits of the zmm register.

And yes, we can have some statically patched code that takes that into
account, and saves the whole zmm register when AVX512 is on, but the
whole *point* of the dynamic XSAVES thing is actually that Intel wants
to be able enable new user-space features without having to wait for
OS support. Literally. That's why and how it was designed.

And saving a couple of zmm registers is actually pretty hard. They're
big. Do you want to allocate 128 bytes of stack space, preferably
64-byte aligned, for a save area? No. So now it needs to be some kind
of per-thread (or maybe per-CPU, if we're willing to continue to not
preempt) special save area too.

And even then, it doesn't solve the real worry of "maybe there will be
odd interactions with future extensions that we don't even know of".

All this to do a 32-byte PIO access, with absolutely zero data right
now on what the win is?

Yes, yes, I can find an Intel white-paper that talks about setting WC
and then using xmm and ymm instructions to write a single 64-byte
burst over PCIe, and I assume that is where the code and idea came
from. But I don't actually see any reason why a burst of 8 regular
quad-word bytes wouldn't cause a 64-byte burst write too.

So right now this is for _one_ odd rdma controller, with absolutely
_zero_ performance numbers, and a very high likelihood that it does
not matter in the least.

And if there are some atomicity concerns ("we need to guarantee a
single atomic access for race conditions with the hardware"), they are
probably bogus and misplaced anyway, since

 (a) we can't guarantee that AVX2 exists in the first place

 (b) we can't guarantee that %ymm register write will show up on any
bus as a single write transaction anyway

So as far as I can tell, there are basically *zero* upsides, and a lot
of potential downsides.

                Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-21 18:15               ` Linus Torvalds
@ 2018-03-22  9:33                 ` Ingo Molnar
  2018-03-22 17:40                   ` Alexei Starovoitov
  2018-03-22 10:35                 ` David Laight
  1 sibling, 1 reply; 47+ messages in thread
From: Ingo Molnar @ 2018-03-22  9:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Thomas Gleixner, David Laight, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> And the real worry is things like AVX-512 etc, which is exactly when
> things like "save and restore one ymm register" will quite likely
> clear the upper bits of the zmm register.

Yeah, I think the only valid save/restore pattern is to 100% correctly enumerate 
the width of the vector registers, and use full width instructions.

Using partial registers, even though it's possible in some cases is probably a bad 
idea not just due to most instructions auto-zeroing the upper portion to reduce 
false dependencies, but also because 'mixed' use of partial and full register 
access is known to result in penalties on a wide range of Intel CPUs, at least 
according to the Agner PDFs. On AMD CPUs there's no penalty.

So what I think could be done at best is to define a full register save/restore 
API, which falls back to XSAVE*/XRSTOR* if we don't have the routines for the 
native vector register width. (I.e. if old kernel is used on very new CPU.)

Note that the actual AVX code could still use partial width, it's the save/restore 
primitives that has to handle full width registers.

> And yes, we can have some statically patched code that takes that into account, 
> and saves the whole zmm register when AVX512 is on, but the whole *point* of the 
> dynamic XSAVES thing is actually that Intel wants to be able enable new 
> user-space features without having to wait for OS support. Literally. That's why 
> and how it was designed.

This aspect wouldn't be hurt AFAICS: to me it appears that due to glibc using 
vector instructions in its memset() the AVX bits get used early on and to the 
maximum, so the XINUSE for them is set for every task.

The optionality of other XSAVE based features like MPX wouldn't be hurt if the 
kernel only uses vector registers.

> And saving a couple of zmm registers is actually pretty hard. They're big. Do 
> you want to allocate 128 bytes of stack space, preferably 64-byte aligned, for a 
> save area? No. So now it needs to be some kind of per-thread (or maybe per-CPU, 
> if we're willing to continue to not preempt) special save area too.

Hm, that's indeed a nasty complication:

 - While a single 128 bytes slot might work - in practice at least two vector
   registers are needed to have enough parallelism and hide latencies.

 - &current->thread.fpu.state.xsave is available almost all the time: with our 
   current 'direct' FPU context switching code the only time there's live data in
   &current->thread.fpu is when the task is not running. But it's not IRQ-safe.

We could probably allow irq save/restore sections to use it, as 
local_irq_save()/restore() is still *much* faster than a 1-1.5K FPU context 
save/restore pattern.

But I was hoping for a less restrictive model ... :-/

To have a better model and avoid the local_irq_save()/restore we could perhaps 
change the IRQ model to have a per IRQ 'current' value (we have separate IRQ 
stacks already), but that's quite a bit of work to transform all code that 
operates on the interrupted task (scheduler and timer code).

But it's work that would be useful for other reasons as well.

With such a separation in place &current->thread.fpu.state.xsave would become a 
generic, natural vector register save area.

> And even then, it doesn't solve the real worry of "maybe there will be odd 
> interactions with future extensions that we don't even know of".

Yes, that's true, but I think we could avoid these dangers by using CPU model 
based enumeration. The cost would be that vector ops would only be available on 
new CPU models after an explicit opt-in. In many cases it will be a single new 
constant to an existing switch() statement, easily backported as well.

> All this to do a 32-byte PIO access, with absolutely zero data right
> now on what the win is?

Ok, so that's not what I'd use it for, I'd use it:

 - Speed up existing AVX (crypto, RAID) routines for smaller buffer sizes.
   Right now the XSAVE*+XRSTOR* cost is significant:

     x86/fpu: Cost of: XSAVE   insn:   104 cycles
     x86/fpu: Cost of: XRSTOR  insn:    80 cycles

   ... and that's with just 128-bit AVX and a ~0.8K XSAVE area. The Agner PDF 
   lists Skylake XSAVE+XRSTOR costs at 107+122 cycles, plus there's probably a
   significant amount of L1 cache churn caused by XSAVE/XRSTOR.

   Most of the relevant vector instructions have a single cycle cost
   on the other hand.

 - To use vector ops in bulk, well-aligned memcpy(), which in many workloads
   is a fair chunk of all memset() activity. A usage profile on a typical system:

            galatea:~> cat /proc/sched_debug  | grep hist | grep -E '[[:digit:]]{4,}$' | grep '0\]'
            hist[0x0000]:    1514272
            hist[0x0010]:    1905248
            hist[0x0020]:      99471
            hist[0x0030]:     343309
            hist[0x0040]:     177874
            hist[0x0080]:     190052
            hist[0x00a0]:       5258
            hist[0x00b0]:       2387
            hist[0x00c0]:       6975
            hist[0x00d0]:       5872
            hist[0x0100]:       3229
            hist[0x0140]:       4813
            hist[0x0160]:       9323
            hist[0x0200]:      12540
            hist[0x0230]:      37488
            hist[0x1000]:      17136
            hist[0x1d80]:     225199

   First column is length of the area copied, the column is usage count.

   To do this I wouldn't complicate the main memset() interface in any way to 
   branch it off to vector ops, I'd isolate specific memcpy()'s and memset()s
   (such as page table copying and page clearing) and use the simpler
   vector register based primitives there.

   For example we have clear_page() which is used by GFP_ZERO and by other places
   is implemented on modern x86 CPUs as:

     ENTRY(clear_page_erms)
        movl $4096,%ecx
        xorl %eax,%eax
        rep stosb
        ret

   While for such buffer sizes the enhanced-REP string instructions are supposed 
   to be slightly faster than 128-bit AVX ops, for such exact page granular ops 
   I'm pretty sure 256-bit (and 512-bit) vector ops are faster.

 - For page granular memset/memcpy it would also be interesting to investigate 
   whether non-temporal, cache-preserving vector ops for such known-large bulk 
   ops, such as VMOVNTQA, are beneficial in certain circumstances.

   On x86 there's only a single non-temporal instruction to GP registers: 
   MOVNTI, and for stores only.

   The vector instructions space is a lot richer in that regard, allowing 
   non-temporal loads as well which utilize fill buffers to move chunks of memory 
   into vector registers.

   Random example: in do_cow_fault() we use copy_user_highpage() to copy the page, 
   which uses copy_user_page() -> copy_page(), which uses:

     ENTRY(copy_page)
        ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
        movl    $4096/8, %ecx
        rep     movsq
        ret

   But in this COW copy case it's pretty obvious that we shouldn't keep the 
   _source_ page in cache. So we could use non-temporal load, which appear to make 
   a difference on more recent uarchs even on write-back memory ranges:

      https://stackoverflow.com/questions/40096894/do-current-x86-architectures-support-non-temporal-loads-from-normal-memory

   See the final graph in that entry and now the 'NT load' variant results in the 
   best execution time in the 4K case - and this is a limited benchmark that
   doesn't measure the lower cache eviction pressure by NT loads.

   ( The store part is probably better done into the cache, not just due to the
     SFENCE cost (which is relatively low at 40 cycles), but because it's probably
     beneficial to prime the cache with a freshly COW-ed page, it's going to get
     used in the near future once we return from the fault. )

   etc.

 - But more broadly, if we open up vector ops for smaller buffer sizes as well
   then I think other kernel code would start using them as well:

 - I think the BPF JIT, whose byte code machine languge is used by an
   increasing number of kernel subsystems, could benefit from having vector ops.
   It would possibly allow the handling of floating point types.

 - We could consider implementing vector ops based copy-to-user and copy-from-user 
   primitives as well, for cases where we know that the dominant usage pattern is 
   for larger, well-aligned chunks of memory.

 - Maybe we could introduce a floating point library (which falls back to a C 
   implementation) and simplify scheduler math. We go to ridiculous lengths to 
   maintain precision across a wide range of parameters, essentially implementing 
   128-bit fixed point math. Even 32-bit floating point math would possibly be 
   better than that, even if it was done via APIs.

etc.: I think the large vector processor available in modern x86 CPUs could be 
utilized by the kernel as well for various purposes.

But I think that's only worth doing if vector registers and their save areas are 
easily accessibly and the accesses are fundamentally IRQ safe.

> Yes, yes, I can find an Intel white-paper that talks about setting WC and then 
> using xmm and ymm instructions to write a single 64-byte burst over PCIe, and I 
> assume that is where the code and idea came from. But I don't actually see any 
> reason why a burst of 8 regular quad-word bytes wouldn't cause a 64-byte burst 
> write too.

Yeah, I'm not too convinced about the wide readq/writeq usecase either, I just 
used the opportunity to outline these (very vague) plans about utilizing vector 
instructions more broadly within the kernel.

> So as far as I can tell, there are basically *zero* upsides, and a lot of 
> potential downsides.

I agree about the potential downsides and I think most of them can be addressed 
adequately - and I think my list of upsides above is potentially significant, 
especially once we have lightweight APIs to utilize individual vector registers 
without having to do a full save/restore of ~1K large vector register context.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-22  9:33                 ` Ingo Molnar
@ 2018-03-22 17:40                   ` Alexei Starovoitov
  2018-03-22 17:44                     ` Andy Lutomirski
  0 siblings, 1 reply; 47+ messages in thread
From: Alexei Starovoitov @ 2018-03-22 17:40 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, David Laight, Rahul Lakkireddy,
	x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers,
	Daniel Borkmann

On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
> 
>  - I think the BPF JIT, whose byte code machine languge is used by an
>    increasing number of kernel subsystems, could benefit from having vector ops.
>    It would possibly allow the handling of floating point types.

this is on our todo list already.
To process certain traffic inside BPF in XDP we'd like to have access to
floating point. The current workaround is to precompute the math and do
bpf map lookup instead.
Since XDP processing of packets is batched (typically up to napi budget
of 64 packets at a time), we can, in theory, wrap the loop with
kernel_fpu_begin/end and it will be cleaner and faster,
but the work hasn't started yet.
The microbenchmark numbers you quoted for xsave/xrestore look promising,
so we probably will focus on it soon.

Another use case for vector insns is to accelerate fib/lpm lookups
which is likely beneficial for kernel overall regardless of bpf usage.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-22 17:40                   ` Alexei Starovoitov
@ 2018-03-22 17:44                     ` Andy Lutomirski
  0 siblings, 0 replies; 47+ messages in thread
From: Andy Lutomirski @ 2018-03-22 17:44 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Ingo Molnar, Linus Torvalds, Thomas Gleixner, David Laight,
	Rahul Lakkireddy, x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, mingo@redhat.com, hpa@zytor.com,
	davem@davemloft.net, akpm@linux-foundation.org,
	ganeshgr@chelsio.com, nirranjan@chelsio.com, indranil@chelsio.com,
	Andy Lutomirski, Peter Zijlstra, Fenghua Yu, Eric Biggers,
	Daniel Borkmann

On Thu, Mar 22, 2018 at 5:40 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Mar 22, 2018 at 10:33:43AM +0100, Ingo Molnar wrote:
>>
>>  - I think the BPF JIT, whose byte code machine languge is used by an
>>    increasing number of kernel subsystems, could benefit from having vector ops.
>>    It would possibly allow the handling of floating point types.
>
> this is on our todo list already.
> To process certain traffic inside BPF in XDP we'd like to have access to
> floating point. The current workaround is to precompute the math and do
> bpf map lookup instead.
> Since XDP processing of packets is batched (typically up to napi budget
> of 64 packets at a time), we can, in theory, wrap the loop with
> kernel_fpu_begin/end and it will be cleaner and faster,
> but the work hasn't started yet.
> The microbenchmark numbers you quoted for xsave/xrestore look promising,
> so we probably will focus on it soon.
>
> Another use case for vector insns is to accelerate fib/lpm lookups
> which is likely beneficial for kernel overall regardless of bpf usage.
>

This is a further argument for the deferred restore approach IMO.
With deferred restore, kernel_fpu_begin() + kernel_fpu_end() has very
low amortized cost as long as we do lots of work in the kernel before
re-entering userspace.  For XDP workloads, this could be a pretty big
win, I imagine.

Someone just needs to do the nasty x86 work.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-21 18:15               ` Linus Torvalds
  2018-03-22  9:33                 ` Ingo Molnar
@ 2018-03-22 10:35                 ` David Laight
  2018-03-22 12:48                   ` David Laight
  1 sibling, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-22 10:35 UTC (permalink / raw)
  To: 'Linus Torvalds', Ingo Molnar
  Cc: Thomas Gleixner, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

From: Sent: 21 March 2018 18:16
> To: Ingo Molnar
...
> All this to do a 32-byte PIO access, with absolutely zero data right
> now on what the win is?
> 
> Yes, yes, I can find an Intel white-paper that talks about setting WC
> and then using xmm and ymm instructions to write a single 64-byte
> burst over PCIe, and I assume that is where the code and idea came
> from. But I don't actually see any reason why a burst of 8 regular
> quad-word bytes wouldn't cause a 64-byte burst write too.

The big gain is from wide PCIe reads, not writes.
Writes to uncached locations (without WC) are almost certainly required
to generate the 'obvious' PCIe TLP, otherwise things are likely to break.

> So right now this is for _one_ odd rdma controller, with absolutely
> _zero_ performance numbers, and a very high likelihood that it does
> not matter in the least.
> 
> And if there are some atomicity concerns ("we need to guarantee a
> single atomic access for race conditions with the hardware"), they are
> probably bogus and misplaced anyway, since
> 
>  (a) we can't guarantee that AVX2 exists in the first place

Any code would need to be in memcpy_fromio(), not in every driver that
might benefit.
Then fallback code can be used if the registers aren't available.

>  (b) we can't guarantee that %ymm register write will show up on any
> bus as a single write transaction anyway

Misaligned 8 byte accesses generate a single PCIe TLP.
I can look at what happens for AVX2 transfers later.
I've got code that mmap()s PCIe addresses into user space, and can
look at the TLP (indirectly through tracing on an fpga target).
Just need to set something up that uses AVX copies.

> So as far as I can tell, there are basically *zero* upsides, and a lot
> of potential downsides.

There are some upsides.
I'll do a performance measurement for reads.

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* RE: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-22 10:35                 ` David Laight
@ 2018-03-22 12:48                   ` David Laight
  2018-03-22 17:07                     ` Linus Torvalds
  0 siblings, 1 reply; 47+ messages in thread
From: David Laight @ 2018-03-22 12:48 UTC (permalink / raw)
  To: 'Linus Torvalds', 'Ingo Molnar'
  Cc: 'Thomas Gleixner', 'Rahul Lakkireddy',
	'x86@kernel.org', 'linux-kernel@vger.kernel.org',
	'netdev@vger.kernel.org', 'mingo@redhat.com',
	'hpa@zytor.com', 'davem@davemloft.net',
	'akpm@linux-foundation.org',
	'ganeshgr@chelsio.com', 'nirranjan@chelsio.com',
	'indranil@chelsio.com', 'Andy Lutomirski',
	'Peter Zijlstra', 'Fenghua Yu',
	'Eric Biggers'

From: David Laight
> Sent: 22 March 2018 10:36
...
> Any code would need to be in memcpy_fromio(), not in every driver that
> might benefit.
> Then fallback code can be used if the registers aren't available.
> 
> >  (b) we can't guarantee that %ymm register write will show up on any
> > bus as a single write transaction anyway
> 
> Misaligned 8 byte accesses generate a single PCIe TLP.
> I can look at what happens for AVX2 transfers later.
> I've got code that mmap()s PCIe addresses into user space, and can
> look at the TLP (indirectly through tracing on an fpga target).
> Just need to set something up that uses AVX copies.

On my i7-7700 reads into xmm registers generate 16 byte TLP and
reads into ymm registers 32 byte TLP.
I don't think I've a system that supports avx-512

With my lethargic fpga slave 32 bytes reads happen every 144 clocks
and 16 byte ones every 126 (+/- the odd clock).
Single bytes ones happen every 108, 8 byte 117.
So I have (about) 110 clock overhead on every read cycle.
These clocks are the 62.5MHz clock on the fpga.

So if we needed to do PIO reads using the AVX2 (or better AVX-512)
registers would make a significant difference.
Fortunately we can 'dma' most of the data we need to transfer.

I've traced writes before, they are a lot faster and are limited
by things in the fpga fabric (they appear back to back).

	David

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-22 12:48                   ` David Laight
@ 2018-03-22 17:07                     ` Linus Torvalds
  0 siblings, 0 replies; 47+ messages in thread
From: Linus Torvalds @ 2018-03-22 17:07 UTC (permalink / raw)
  To: David Laight
  Cc: Ingo Molnar, Thomas Gleixner, Rahul Lakkireddy, x86@kernel.org,
	linux-kernel@vger.kernel.org, netdev@vger.kernel.org,
	mingo@redhat.com, hpa@zytor.com, davem@davemloft.net,
	akpm@linux-foundation.org, ganeshgr@chelsio.com,
	nirranjan@chelsio.com, indranil@chelsio.com, Andy Lutomirski,
	Peter Zijlstra, Fenghua Yu, Eric Biggers

On Thu, Mar 22, 2018 at 5:48 AM, David Laight <David.Laight@aculab.com> wrote:
>
> So if we needed to do PIO reads using the AVX2 (or better AVX-512)
> registers would make a significant difference.
> Fortunately we can 'dma' most of the data we need to transfer.

I think this is the really fundamental issue.

A device that expects PIO to do some kind of high-performance
transaction is a *broken* device.

It really is that simple. We don't bend over for misdesigned hardware
crap unless it is really common.

> I've traced writes before, they are a lot faster and are limited
> by things in the fpga fabric (they appear back to back).

The write combine buffer really should be much more effective than any
AVX or similar can ever be.

               Linus

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
                   ` (3 preceding siblings ...)
  2018-03-19 14:53 ` [RFC PATCH 0/3] kernel: add support for 256-bit IO access David Laight
@ 2018-03-19 15:27 ` Christoph Hellwig
  2018-03-20 13:45   ` Rahul Lakkireddy
  4 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2018-03-19 15:27 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: x86, linux-kernel, netdev, tglx, mingo, hpa, davem, akpm,
	torvalds, ganeshgr, nirranjan, indranil

On Mon, Mar 19, 2018 at 07:50:33PM +0530, Rahul Lakkireddy wrote:
> This series of patches add support for 256-bit IO read and write.
> The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> and write 256-bits at a time from IO, respectively.

What a horrible name.  please encode the actual number of bits instead.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [RFC PATCH 0/3] kernel: add support for 256-bit IO access
  2018-03-19 15:27 ` Christoph Hellwig
@ 2018-03-20 13:45   ` Rahul Lakkireddy
  0 siblings, 0 replies; 47+ messages in thread
From: Rahul Lakkireddy @ 2018-03-20 13:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, tglx@linutronix.de, mingo@redhat.com,
	hpa@zytor.com, davem@davemloft.net, akpm@linux-foundation.org,
	torvalds@linux-foundation.org, Ganesh GR, Nirranjan Kirubaharan,
	Indranil Choudhury

On Monday, March 03/19/18, 2018 at 20:57:22 +0530, Christoph Hellwig wrote:
> On Mon, Mar 19, 2018 at 07:50:33PM +0530, Rahul Lakkireddy wrote:
> > This series of patches add support for 256-bit IO read and write.
> > The APIs are readqq and writeqq (quad quadword - 4 x 64), that read
> > and write 256-bits at a time from IO, respectively.
> 
> What a horrible name.  please encode the actual number of bits instead.

Ok.  Will change it to read256() and write256().

Thanks,
Rahul

^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, other threads:[~2018-04-03 10:37 UTC | newest]

Thread overview: 47+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-03-19 14:20 [RFC PATCH 0/3] kernel: add support for 256-bit IO access Rahul Lakkireddy
2018-03-19 14:20 ` [RFC PATCH 1/3] include/linux: add 256-bit IO accessors Rahul Lakkireddy
2018-03-19 14:20 ` [RFC PATCH 2/3] x86/io: implement 256-bit IO read and write Rahul Lakkireddy
2018-03-19 14:43   ` Thomas Gleixner
2018-03-20 13:32     ` Rahul Lakkireddy
2018-03-20 13:44       ` Andy Shevchenko
2018-03-21 12:27         ` Rahul Lakkireddy
2018-03-20 14:40       ` David Laight
2018-03-21 12:28         ` Rahul Lakkireddy
2018-03-20 14:42       ` Alexander Duyck
2018-03-21 12:28         ` Rahul Lakkireddy
2018-03-22  1:26         ` Linus Torvalds
2018-03-22 10:48           ` David Laight
2018-03-22 17:16             ` Linus Torvalds
2018-03-19 14:20 ` [RFC PATCH 3/3] cxgb4: read on-chip memory 256-bits at a time Rahul Lakkireddy
2018-03-19 14:53 ` [RFC PATCH 0/3] kernel: add support for 256-bit IO access David Laight
2018-03-19 15:05   ` Thomas Gleixner
2018-03-19 15:19     ` David Laight
2018-03-19 15:37       ` Thomas Gleixner
2018-03-19 15:53         ` David Laight
2018-03-19 16:29           ` Linus Torvalds
2018-03-20  8:26         ` Ingo Molnar
2018-03-20  8:38           ` Thomas Gleixner
2018-03-20  9:08             ` Ingo Molnar
2018-03-20  9:41               ` Thomas Gleixner
2018-03-20  9:59                 ` David Laight
2018-03-20 10:54                 ` Ingo Molnar
2018-03-20 13:30                   ` David Laight
2018-04-03  8:49                   ` Pavel Machek
2018-04-03 10:36                     ` Ingo Molnar
2018-03-20 14:57           ` Andy Lutomirski
2018-03-20 15:10             ` David Laight
2018-03-21  0:39               ` Andy Lutomirski
2018-03-20 18:01           ` Linus Torvalds
2018-03-21  6:32             ` Ingo Molnar
2018-03-21 15:45               ` Andy Lutomirski
2018-03-22  9:36                 ` Ingo Molnar
2018-03-21  7:46             ` Ingo Molnar
2018-03-21 18:15               ` Linus Torvalds
2018-03-22  9:33                 ` Ingo Molnar
2018-03-22 17:40                   ` Alexei Starovoitov
2018-03-22 17:44                     ` Andy Lutomirski
2018-03-22 10:35                 ` David Laight
2018-03-22 12:48                   ` David Laight
2018-03-22 17:07                     ` Linus Torvalds
2018-03-19 15:27 ` Christoph Hellwig
2018-03-20 13:45   ` Rahul Lakkireddy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).