* [PATCH RFC 0/4] Introduce 128-bit IO access
@ 2025-11-12 1:58 Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 1/4] UAPI: Introduce 128-bit types and byteswap operations Chenghai Huang
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Chenghai Huang @ 2025-11-12 1:58 UTC (permalink / raw)
To: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
These patches introduce 128-bit IO access functionality. The reason
is that the current HiSilicon cryptographic devices need to
maintain atomic operations when accessing 128-bit MMIO across
physical and virtual functions.
Currently, 128-bit atomic writes have already been implemented in
the device driver, and the driver also depends on a 128-bit atomic
read access interface. Therefore, we have introduced a generic
128-bit IO access interface to replace the implementation of
128-bit read and write IO interfaces using instructions in the
device driver. When the architecture does not support 128-bit
atomic operations, non-atomic 128-bit read and write interfaces can
be used to make the driver functional.
Weili Qian (4):
UAPI: Introduce 128-bit types and byteswap operations
asm-generic/io.h: add io{read,write}128 accessors
io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo}
arm64/io: Add {__raw_read|__raw_write}128 support
arch/arm64/include/asm/io.h | 21 +++++++++
include/asm-generic/io.h | 48 ++++++++++++++++++++
include/linux/io-128-nonatomic-hi-lo.h | 35 ++++++++++++++
include/linux/io-128-nonatomic-lo-hi.h | 34 ++++++++++++++
include/uapi/linux/byteorder/big_endian.h | 6 +++
include/uapi/linux/byteorder/little_endian.h | 6 +++
include/uapi/linux/swab.h | 10 ++++
include/uapi/linux/types.h | 3 ++
8 files changed, 163 insertions(+)
create mode 100644 include/linux/io-128-nonatomic-hi-lo.h
create mode 100644 include/linux/io-128-nonatomic-lo-hi.h
--
2.33.0
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH RFC 1/4] UAPI: Introduce 128-bit types and byteswap operations
2025-11-12 1:58 [PATCH RFC 0/4] Introduce 128-bit IO access Chenghai Huang
@ 2025-11-12 1:58 ` Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 2/4] asm-generic/io.h: add io{read,write}128 accessors Chenghai Huang
` (2 subsequent siblings)
3 siblings, 0 replies; 9+ messages in thread
From: Chenghai Huang @ 2025-11-12 1:58 UTC (permalink / raw)
To: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
From: Weili Qian <qianweili@huawei.com>
Architectures like ARM64 support 128-bit integer types and
operations. This patch adds a generic byte order conversion
interface for 128-bit values.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
---
include/uapi/linux/byteorder/big_endian.h | 6 ++++++
include/uapi/linux/byteorder/little_endian.h | 6 ++++++
include/uapi/linux/swab.h | 10 ++++++++++
include/uapi/linux/types.h | 3 +++
4 files changed, 25 insertions(+)
diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h
index 80aa5c41a763..318d51a18f43 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -29,6 +29,12 @@
#define __constant_be32_to_cpu(x) ((__force __u32)(__be32)(x))
#define __constant_cpu_to_be16(x) ((__force __be16)(__u16)(x))
#define __constant_be16_to_cpu(x) ((__force __u16)(__be16)(x))
+
+#ifdef __SIZEOF_INT128__
+#define __cpu_to_le128(x) ((__force __le128)__swab128((x)))
+#define __le128_to_cpu(x) __swab128((__force __u128)(__le128)(x))
+#endif
+
#define __cpu_to_le64(x) ((__force __le64)__swab64((x)))
#define __le64_to_cpu(x) __swab64((__force __u64)(__le64)(x))
#define __cpu_to_le32(x) ((__force __le32)__swab32((x)))
diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h
index cd98982e7523..b2732452b825 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -29,6 +29,12 @@
#define __constant_be32_to_cpu(x) ___constant_swab32((__force __u32)(__be32)(x))
#define __constant_cpu_to_be16(x) ((__force __be16)___constant_swab16((x)))
#define __constant_be16_to_cpu(x) ___constant_swab16((__force __u16)(__be16)(x))
+
+#ifdef __SIZEOF_INT128__
+#define __cpu_to_le128(x) ((__force __le128)(__u128)(x))
+#define __le128_to_cpu(x) ((__force __u128)(__le128)(x))
+#endif
+
#define __cpu_to_le64(x) ((__force __le64)(__u64)(x))
#define __le64_to_cpu(x) ((__force __u64)(__le64)(x))
#define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 01717181339e..7381b9a785ce 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -133,6 +133,16 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
__fswab64(x))
#endif
+#ifdef __SIZEOF_INT128__
+static inline __attribute_const__ __u128 __swab128(__u128 val)
+{
+ __u64 h = val >> 64;
+ __u64 l = val;
+
+ return (((__u128)__swab64(l)) << 64) | ((__u128)(__swab64(h)));
+}
+#endif
+
static __always_inline unsigned long __swab(const unsigned long y)
{
#if __BITS_PER_LONG == 64
diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index 48b933938877..9624ea43cd8a 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -40,6 +40,9 @@ typedef __u32 __bitwise __be32;
typedef __u64 __bitwise __le64;
typedef __u64 __bitwise __be64;
+#ifdef __SIZEOF_INT128__
+typedef __u128 __bitwise __le128;
+#endif
typedef __u16 __bitwise __sum16;
typedef __u32 __bitwise __wsum;
--
2.33.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH RFC 2/4] asm-generic/io.h: add io{read,write}128 accessors
2025-11-12 1:58 [PATCH RFC 0/4] Introduce 128-bit IO access Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 1/4] UAPI: Introduce 128-bit types and byteswap operations Chenghai Huang
@ 2025-11-12 1:58 ` Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo} Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support Chenghai Huang
3 siblings, 0 replies; 9+ messages in thread
From: Chenghai Huang @ 2025-11-12 1:58 UTC (permalink / raw)
To: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
From: Weili Qian <qianweili@huawei.com>
Architectures like ARM64 already support 128-bit memory access. Currently,
device drivers implement atomic read and write operations for 128-bit
memory using assembly. This patch adds generic io{read,write}128 access
functions, which will enable device drivers to consistently use
io{read,write}128 for 128-bit access.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
---
include/asm-generic/io.h | 48 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index ca5a1ce6f0f8..c419021318e6 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -146,6 +146,16 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
#endif
#endif /* CONFIG_64BIT */
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#ifndef __raw_read128
+#define __raw_read128 __raw_read128
+static inline u128 __raw_read128(volatile void __iomem *addr)
+{
+ return *(const volatile u128 __force *)addr;
+}
+#endif
+#endif /* CONFIG_ARCH_SUPPORTS_INT128 */
+
#ifndef __raw_writeb
#define __raw_writeb __raw_writeb
static inline void __raw_writeb(u8 value, volatile void __iomem *addr)
@@ -180,6 +190,16 @@ static inline void __raw_writeq(u64 value, volatile void __iomem *addr)
#endif
#endif /* CONFIG_64BIT */
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#ifndef __raw_write128
+#define __raw_write128 __raw_write128
+static inline void __raw_write128(u128 value, volatile void __iomem *addr)
+{
+ *(volatile u128 __force *)addr = value;
+}
+#endif
+#endif /* CONFIG_ARCH_SUPPORTS_INT128 */
+
/*
* {read,write}{b,w,l,q}() access little endian memory and return result in
* native endianness.
@@ -917,6 +937,22 @@ static inline u64 ioread64(const volatile void __iomem *addr)
#endif
#endif /* CONFIG_64BIT */
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#ifndef ioread128
+#define ioread128 ioread128
+static inline u128 ioread128(const volatile void __iomem *addr)
+{
+ u128 val;
+
+ __io_br();
+ val = __le128_to_cpu((__le128 __force)__raw_read128(addr));
+ __io_ar(val);
+
+ return val;
+}
+#endif
+#endif /* CONFIG_ARCH_SUPPORTS_INT128 */
+
#ifndef iowrite8
#define iowrite8 iowrite8
static inline void iowrite8(u8 value, volatile void __iomem *addr)
@@ -951,6 +987,18 @@ static inline void iowrite64(u64 value, volatile void __iomem *addr)
#endif
#endif /* CONFIG_64BIT */
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#ifndef iowrite128
+#define iowrite128 iowrite128
+static inline void iowrite128(u128 value, volatile void __iomem *addr)
+{
+ __io_bw();
+ __raw_write128((u128 __force)__cpu_to_le128(value), addr);
+ __io_aw();
+}
+#endif
+#endif /* CONFIG_ARCH_SUPPORTS_INT128 */
+
#ifndef ioread16be
#define ioread16be ioread16be
static inline u16 ioread16be(const volatile void __iomem *addr)
--
2.33.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo}
2025-11-12 1:58 [PATCH RFC 0/4] Introduce 128-bit IO access Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 1/4] UAPI: Introduce 128-bit types and byteswap operations Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 2/4] asm-generic/io.h: add io{read,write}128 accessors Chenghai Huang
@ 2025-11-12 1:58 ` Chenghai Huang
2025-11-12 14:48 ` Ben Dooks
2025-11-12 1:58 ` [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support Chenghai Huang
3 siblings, 1 reply; 9+ messages in thread
From: Chenghai Huang @ 2025-11-12 1:58 UTC (permalink / raw)
To: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
From: Weili Qian <qianweili@huawei.com>
In order to provide non-atomic functions for io{read|write}128,
we define a number of variants of these functions in the generic
iomap that will do non-atomic operations.
These functions are only defined if io{read|write}128 are defined.
If they are not, then the wrappers that always use non-atomic operations
from include/linux/io-128-nonatomic*.h will be used.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
---
include/linux/io-128-nonatomic-hi-lo.h | 35 ++++++++++++++++++++++++++
include/linux/io-128-nonatomic-lo-hi.h | 34 +++++++++++++++++++++++++
2 files changed, 69 insertions(+)
create mode 100644 include/linux/io-128-nonatomic-hi-lo.h
create mode 100644 include/linux/io-128-nonatomic-lo-hi.h
diff --git a/include/linux/io-128-nonatomic-hi-lo.h b/include/linux/io-128-nonatomic-hi-lo.h
new file mode 100644
index 000000000000..b5b083a9e81b
--- /dev/null
+++ b/include/linux/io-128-nonatomic-hi-lo.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IO_128_NONATOMIC_HI_LO_H_
+#define _LINUX_IO_128_NONATOMIC_HI_LO_H_
+
+#include <linux/io.h>
+#include <asm-generic/int-ll64.h>
+
+static inline u128 ioread128_hi_lo(const void __iomem *addr)
+{
+ u32 low, high;
+
+ high = ioread64(addr + sizeof(u64));
+ low = ioread64(addr);
+
+ return low + ((u128)high << 64);
+}
+
+static inline void iowrite128_hi_lo(u128 val, void __iomem *addr)
+{
+ iowrite64(val >> 64, addr + sizeof(u64));
+ iowrite64(val, addr);
+}
+
+#ifndef ioread128
+#define ioread128_is_nonatomic
+#define ioread128 ioread128_hi_lo
+#endif
+
+#ifndef iowrite128
+#define iowrite128_is_nonatomic
+#define iowrite128 iowrite128_hi_lo
+#endif
+
+#endif /* _LINUX_IO_128_NONATOMIC_HI_LO_H_ */
+
diff --git a/include/linux/io-128-nonatomic-lo-hi.h b/include/linux/io-128-nonatomic-lo-hi.h
new file mode 100644
index 000000000000..0448ee5a13de
--- /dev/null
+++ b/include/linux/io-128-nonatomic-lo-hi.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IO_128_NONATOMIC_LO_HI_H_
+#define _LINUX_IO_128_NONATOMIC_LO_HI_H_
+
+#include <linux/io.h>
+#include <asm-generic/int-ll64.h>
+
+static inline u128 ioread128_lo_hi(const void __iomem *addr)
+{
+ u64 low, high;
+
+ low = ioread64(addr);
+ high = ioread64(addr + sizeof(u64));
+
+ return low + ((u128)high << 64);
+}
+
+static inline void iowrite128_lo_hi(u128 val, void __iomem *addr)
+{
+ iowrite64(val, addr);
+ iowrite64(val >> 64, addr + sizeof(u64));
+}
+
+#ifndef ioread128
+#define ioread128_is_nonatomic
+#define ioread128 ioread128_lo_hi
+#endif
+
+#ifndef iowrite128
+#define iowrite128_is_nonatomic
+#define iowrite128 iowrite128_lo_hi
+#endif
+
+#endif /* _LINUX_IO_128_NONATOMIC_LO_HI_H_ */
--
2.33.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support
2025-11-12 1:58 [PATCH RFC 0/4] Introduce 128-bit IO access Chenghai Huang
` (2 preceding siblings ...)
2025-11-12 1:58 ` [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo} Chenghai Huang
@ 2025-11-12 1:58 ` Chenghai Huang
2025-11-12 12:28 ` Mark Rutland
3 siblings, 1 reply; 9+ messages in thread
From: Chenghai Huang @ 2025-11-12 1:58 UTC (permalink / raw)
To: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
From: Weili Qian <qianweili@huawei.com>
Starting from ARMv8.4, stp and ldp instructions become atomic.
Currently, device drivers depend on 128-bit atomic memory IO access,
but these are implemented within the drivers. Therefore, this patch introduces
generic {__raw_read|__raw_write}128 functions for 128-bit memory access.
Signed-off-by: Weili Qian <qianweili@huawei.com>
Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
---
arch/arm64/include/asm/io.h | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 83e03abbb2ca..80430750a28c 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -50,6 +50,17 @@ static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
asm volatile("str %x0, %1" : : "rZ" (val), "Qo" (*ptr));
}
+#define __raw_write128 __raw_write128
+static __always_inline void __raw_write128(u128 val, volatile void __iomem *addr)
+{
+ u64 low, high;
+
+ low = val;
+ high = (u64)(val >> 64);
+
+ asm volatile ("stp %x0, %x1, [%2]\n" :: "rZ"(low), "rZ"(high), "r"(addr));
+}
+
#define __raw_readb __raw_readb
static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
{
@@ -95,6 +106,16 @@ static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
return val;
}
+#define __raw_read128 __raw_read128
+static __always_inline u128 __raw_read128(const volatile void __iomem *addr)
+{
+ u64 high, low;
+
+ asm volatile("ldp %0, %1, [%2]" : "=r" (low), "=r" (high) : "r" (addr));
+
+ return (((u128)high << 64) | (u128)low);
+}
+
/* IO barriers */
#define __io_ar(v) \
({ \
--
2.33.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support
2025-11-12 1:58 ` [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support Chenghai Huang
@ 2025-11-12 12:28 ` Mark Rutland
2025-11-12 14:01 ` David Laight
0 siblings, 1 reply; 9+ messages in thread
From: Mark Rutland @ 2025-11-12 12:28 UTC (permalink / raw)
To: Chenghai Huang
Cc: arnd, catalin.marinas, will, akpm, anshuman.khandual,
ryan.roberts, andriy.shevchenko, herbert, linux-kernel,
linux-arch, linux-arm-kernel, linux-crypto, linux-api, fanghao11,
shenyang39, liulongfang, qianweili
On Wed, Nov 12, 2025 at 09:58:46AM +0800, Chenghai Huang wrote:
> From: Weili Qian <qianweili@huawei.com>
>
> Starting from ARMv8.4, stp and ldp instructions become atomic.
That's not true for accesses to Device memory types.
Per ARM DDI 0487, L.b, section B2.2.1.1 ("Changes to single-copy atomicity in
Armv8.4"):
If FEAT_LSE2 is implemented, LDP, LDNP, and STP instructions that load
or store two 64-bit registers are single-copy atomic when all of the
following conditions are true:
• The overall memory access is aligned to 16 bytes.
• Accesses are to Inner Write-Back, Outer Write-Back Normal cacheable memory.
IIUC when used for Device memory types, those can be split, and a part
of the access could be replayed multiple times (e.g. due to an
interrupt).
I don't think we can add this generally. It is not atomic, and not
generally safe.
Mark.
> Currently, device drivers depend on 128-bit atomic memory IO access,
> but these are implemented within the drivers. Therefore, this introduces
> generic {__raw_read|__raw_write}128 function for 128-bit memory access.
>
> Signed-off-by: Weili Qian <qianweili@huawei.com>
> Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
> ---
> arch/arm64/include/asm/io.h | 21 +++++++++++++++++++++
> 1 file changed, 21 insertions(+)
>
> diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
> index 83e03abbb2ca..80430750a28c 100644
> --- a/arch/arm64/include/asm/io.h
> +++ b/arch/arm64/include/asm/io.h
> @@ -50,6 +50,17 @@ static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
> asm volatile("str %x0, %1" : : "rZ" (val), "Qo" (*ptr));
> }
>
> +#define __raw_write128 __raw_write128
> +static __always_inline void __raw_write128(u128 val, volatile void __iomem *addr)
> +{
> + u64 low, high;
> +
> + low = val;
> + high = (u64)(val >> 64);
> +
> + asm volatile ("stp %x0, %x1, [%2]\n" :: "rZ"(low), "rZ"(high), "r"(addr));
> +}
> +
> #define __raw_readb __raw_readb
> static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
> {
> @@ -95,6 +106,16 @@ static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
> return val;
> }
>
> +#define __raw_read128 __raw_read128
> +static __always_inline u128 __raw_read128(const volatile void __iomem *addr)
> +{
> + u64 high, low;
> +
> + asm volatile("ldp %0, %1, [%2]" : "=r" (low), "=r" (high) : "r" (addr));
> +
> + return (((u128)high << 64) | (u128)low);
> +}
> +
> /* IO barriers */
> #define __io_ar(v) \
> ({ \
> --
> 2.33.0
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support
2025-11-12 12:28 ` Mark Rutland
@ 2025-11-12 14:01 ` David Laight
2025-11-12 14:17 ` Mark Rutland
0 siblings, 1 reply; 9+ messages in thread
From: David Laight @ 2025-11-12 14:01 UTC (permalink / raw)
To: Mark Rutland
Cc: Chenghai Huang, arnd, catalin.marinas, will, akpm,
anshuman.khandual, ryan.roberts, andriy.shevchenko, herbert,
linux-kernel, linux-arch, linux-arm-kernel, linux-crypto,
linux-api, fanghao11, shenyang39, liulongfang, qianweili
On Wed, 12 Nov 2025 12:28:01 +0000
Mark Rutland <mark.rutland@arm.com> wrote:
> On Wed, Nov 12, 2025 at 09:58:46AM +0800, Chenghai Huang wrote:
> > From: Weili Qian <qianweili@huawei.com>
> >
> > Starting from ARMv8.4, stp and ldp instructions become atomic.
>
> That's not true for accesses to Device memory types.
>
> Per ARM DDI 0487, L.b, section B2.2.1.1 ("Changes to single-copy atomicity in
> Armv8.4"):
>
> If FEAT_LSE2 is implemented, LDP, LDNP, and STP instructions that load
> or store two 64-bit registers are single-copy atomic when all of the
> following conditions are true:
> • The overall memory access is aligned to 16 bytes.
> • Accesses are to Inner Write-Back, Outer Write-Back Normal cacheable memory.
>
> IIUC when used for Device memory types, those can be split, and a part
> of the access could be replayed multiple times (e.g. due to an
> intetrupt).
That can't be right.
IO accesses can reference hardware FIFO so must only happen once.
(Or is 'Device memory' something different from 'Device register'?)
I'm also not sure that the bus cycles could get split by an interrupt,
that would require a mid-instruction interrupt - very unlikely.
Interleaving is most likely to come from another cpu.
More interesting would be whether the instructions generate a single
PCIe TLP? (perhaps even only most of the time.)
PCIe reads are high latency, anything that can be done to increase the
size of the TLP improves PIO throughput massively.
David
>
> I don't think we can add this generally. It is not atomic, and not
> generally safe.
>
> Mark.
...
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support
2025-11-12 14:01 ` David Laight
@ 2025-11-12 14:17 ` Mark Rutland
0 siblings, 0 replies; 9+ messages in thread
From: Mark Rutland @ 2025-11-12 14:17 UTC (permalink / raw)
To: David Laight
Cc: Chenghai Huang, arnd, catalin.marinas, will, akpm,
anshuman.khandual, ryan.roberts, andriy.shevchenko, herbert,
linux-kernel, linux-arch, linux-arm-kernel, linux-crypto,
linux-api, fanghao11, shenyang39, liulongfang, qianweili
On Wed, Nov 12, 2025 at 02:01:57PM +0000, David Laight wrote:
> On Wed, 12 Nov 2025 12:28:01 +0000
> Mark Rutland <mark.rutland@arm.com> wrote:
>
> > On Wed, Nov 12, 2025 at 09:58:46AM +0800, Chenghai Huang wrote:
> > > From: Weili Qian <qianweili@huawei.com>
> > >
> > > Starting from ARMv8.4, stp and ldp instructions become atomic.
> >
> > That's not true for accesses to Device memory types.
> >
> > Per ARM DDI 0487, L.b, section B2.2.1.1 ("Changes to single-copy atomicity in
> > Armv8.4"):
> >
> > If FEAT_LSE2 is implemented, LDP, LDNP, and STP instructions that load
> > or store two 64-bit registers are single-copy atomic when all of the
> > following conditions are true:
> > • The overall memory access is aligned to 16 bytes.
> > • Accesses are to Inner Write-Back, Outer Write-Back Normal cacheable memory.
> >
> > IIUC when used for Device memory types, those can be split, and a part
> > of the access could be replayed multiple times (e.g. due to an
> > intetrupt).
>
> That can't be right.
For better or worse, the architecture permits this, and I understand
that there are implementations on which this can happen.
> IO accesses can reference hardware FIFO so must only happen once.
This has nothing to do with the endpoint, and so any FIFO in the
endpoint is immaterial.
I agree that we want to ensure that the accesses only happen once, which
is why I have raised that it is unsound to use LDP/LDNP/STP in this way.
> (Or is 'Device memory' something different from 'Device register'?
I specifically said "Device memory type", which is an attribute that the
MMU associates with a VA, and determines how the MMU (and memory system
as a whole) treats accesses to that VA.
You can find the architecture documentation I referenced at:
https://developer.arm.com/documentation/ddi0487/lb/
> I'm also not sure that the bus cycles could get split by an interrupt,
> that would require a mid-instruction interrupt - very unlikely.
There are various reasons why an implementation might split the accesses
made by a single instruction, and why an interrupt (or other event)
might occur between accesses and cause a replay of some of the
constituent accesses. This has nothing to do with splitting bus cycles.
Mark.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo}
2025-11-12 1:58 ` [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo} Chenghai Huang
@ 2025-11-12 14:48 ` Ben Dooks
0 siblings, 0 replies; 9+ messages in thread
From: Ben Dooks @ 2025-11-12 14:48 UTC (permalink / raw)
To: Chenghai Huang, arnd, catalin.marinas, will, akpm,
anshuman.khandual, ryan.roberts, andriy.shevchenko, herbert,
linux-kernel, linux-arch, linux-arm-kernel, linux-crypto,
linux-api
Cc: fanghao11, shenyang39, liulongfang, qianweili
On 12/11/2025 01:58, Chenghai Huang wrote:
> From: Weili Qian <qianweili@huawei.com>
>
> In order to provide non-atomic functions for io{read|write}128.
> We define a number of variants of these functions in the generic
> iomap that will do non-atomic operations.
>
> These functions are only defined if io{read|write}128 are defined.
> If they are not, then the wrappers that always use non-atomic operations
> from include/linux/io-128-nonatomic*.h will be used.
>
> Signed-off-by: Weili Qian <qianweili@huawei.com>
> Signed-off-by: Chenghai Huang <huangchenghai2@huawei.com>
> ---
> include/linux/io-128-nonatomic-hi-lo.h | 35 ++++++++++++++++++++++++++
> include/linux/io-128-nonatomic-lo-hi.h | 34 +++++++++++++++++++++++++
> 2 files changed, 69 insertions(+)
> create mode 100644 include/linux/io-128-nonatomic-hi-lo.h
> create mode 100644 include/linux/io-128-nonatomic-lo-hi.h
>
> diff --git a/include/linux/io-128-nonatomic-hi-lo.h b/include/linux/io-128-nonatomic-hi-lo.h
> new file mode 100644
> index 000000000000..b5b083a9e81b
> --- /dev/null
> +++ b/include/linux/io-128-nonatomic-hi-lo.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_IO_128_NONATOMIC_HI_LO_H_
> +#define _LINUX_IO_128_NONATOMIC_HI_LO_H_
> +
> +#include <linux/io.h>
> +#include <asm-generic/int-ll64.h>
> +
> +static inline u128 ioread128_hi_lo(const void __iomem *addr)
> +{
> + u32 low, high;
did you mean u64 here?
> + high = ioread64(addr + sizeof(u64));
> + low = ioread64(addr);
> +
> + return low + ((u128)high << 64);
> +}
> +
> +static inline void iowrite128_hi_lo(u128 val, void __iomem *addr)
> +{
> + iowrite64(val >> 64, addr + sizeof(u64));
> + iowrite64(val, addr);
> +}
> +
--
Ben Dooks http://www.codethink.co.uk/
Senior Engineer Codethink - Providing Genius
https://www.codethink.co.uk/privacy.html
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2025-11-12 14:48 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-12 1:58 [PATCH RFC 0/4] Introduce 128-bit IO access Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 1/4] UAPI: Introduce 128-bit types and byteswap operations Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 2/4] asm-generic/io.h: add io{read,write}128 accessors Chenghai Huang
2025-11-12 1:58 ` [PATCH RFC 3/4] io-128-nonatomic: introduce io{read|write}128_{lo_hi|hi_lo} Chenghai Huang
2025-11-12 14:48 ` Ben Dooks
2025-11-12 1:58 ` [PATCH RFC 4/4] arm64/io: Add {__raw_read|__raw_write}128 support Chenghai Huang
2025-11-12 12:28 ` Mark Rutland
2025-11-12 14:01 ` David Laight
2025-11-12 14:17 ` Mark Rutland
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).