public inbox for linux-arm-kernel@lists.infradead.org
 help / color / mirror / Atom feed
* [RFC 0/1] aarch64: Simplify __range_ok
@ 2020-03-21  5:13 Richard Henderson
  2020-03-21  5:13 ` [PATCH 1/1] arm64: " Richard Henderson
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Henderson @ 2020-03-21  5:13 UTC (permalink / raw)
  To: linux-arm-kernel; +Cc: mark.rutland, robin.murphy

Continuing the conversation from last week, in which I attempted
to improve __range_ok with gcc asm flag outputs.  Mark and Robin
suggested it might be time to move back to C.

The largest improvement that I can manage avoids 65-bit arithmetic
entirely.  I simply need to assume that limit has some minimum value.
This covers the vast majority of the uses within the kernel.


r~


Richard Henderson (1):
  arm64: Simplify __range_ok

 arch/arm64/include/asm/uaccess.h | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

---

In the meantime, I've also done some work on 128-bit comparisons
for gcc.  It improves the general case, but note that the constant
case is handled even by an older compiler.

https://gcc.gnu.org/pipermail/gcc-patches/2020-March/542447.html

------------------ Test

void doit(void);

/*
 * Reference form of the check: widen addr + size to 128 bits so the sum
 * cannot wrap, then call doit() when the sum is zero (addr and size both
 * zero) or when the last byte, sum - 1, is at or below limit.
 */
void robin(unsigned long addr, unsigned long size, unsigned long limit)
{
        const __uint128_t end = (__uint128_t)addr + size;

        /* Reject only a non-empty sum whose last byte lies above limit. */
        if (end != 0 && end - 1 > limit) {
                return;
        }
        doit();
}

/*
 * Return true when addr + size <= limit + 1.
 *
 * General path: both sides are widened to 128 bits, so neither the sum
 * nor limit + 1 can wrap.
 *
 * Shortcut: when size is a compile-time constant in [1, 0x100000), the
 * test is rearranged into a single 64-bit comparison.  That form only
 * agrees with the general path while limit >= size - 1; a smaller limit
 * makes the subtraction wrap, so callers are assumed never to pass a
 * tiny limit.
 */
static inline bool
range_ok(unsigned long addr, unsigned long size, unsigned long limit)
{
        if (!__builtin_constant_p(size) || size == 0 || size >= 0x100000) {
                const __uint128_t end = (__uint128_t)addr + size;
                const __uint128_t bound = (__uint128_t)limit + 1;

                return end <= bound;
        }

        /* Same value as limit + 1 - size in modulo-2^64 arithmetic. */
        return addr <= limit - (size - 1);
}

/* Variable-size case: size is an ordinary run-time argument. */
void test_v(unsigned long addr, unsigned long size, unsigned long limit)
{
        if (!range_ok(addr, size, limit)) {
                return;
        }
        doit();
}

/* Literal size 0: range_ok()'s `size > 0` test keeps it off the shortcut. */
void test_0(unsigned long addr, unsigned long limit)
{
        if (!range_ok(addr, 0, limit)) {
                return;
        }
        doit();
}

/* Literal size 1: the smallest size accepted by range_ok()'s shortcut. */
void test_1(unsigned long addr, unsigned long limit)
{
        if (!range_ok(addr, 1, limit)) {
                return;
        }
        doit();
}

/* Literal size 10: a small constant within range_ok()'s shortcut range. */
void test_10(unsigned long addr, unsigned long limit)
{
        if (!range_ok(addr, 10, limit)) {
                return;
        }
        doit();
}

------------------ GCC 10.0.1 patched

robin:
        adds    x1, x0, x1
        cset    x0, cs
        orr     x3, x0, x1
        cbz     x3, .L2
        subs    x1, x1, #1
        sbc     x0, x0, xzr
        cmp     x2, x1
        sbcs    xzr, xzr, x0
        bcc     .L1
.L2:	b       doit
.L1:	ret

test_v:
        adds    x0, x0, x1
        cset    x1, cs
        adds    x2, x2, #1
        cset    x3, cs
        cmp     x2, x0
        sbcs    xzr, x3, x1
        bcs     .L10
        ret
.L10:	b       doit

test_0:
        adds    x1, x1, #1
        cset    x2, cs
        cmp     x1, x0
        sbcs    xzr, x2, xzr
        bcs     .L13
        ret
.L13:	b       doit

test_1:
        cmp     x0, x1
        bls     .L16
        ret
.L16:	b       doit

test_10:
        sub     x1, x1, #9
        cmp     x0, x1
        bls     .L19
        ret
.L19:	b       doit

------------------ GCC 7.5-ubuntu~18.04

robin:
        adds    x0, x0, x1
        cset    x1, cs
        orr     x3, x0, x1
        cbz     x3, .L2
        subs    x0, x0, #1
        sbc     x1, x1, xzr
        cbnz    x1, .L1
        cmp     x0, x2
        bhi     .L1
.L2:	b       doit
.L1:	ret

test_v:
        adds    x0, x0, x1
        cset    x4, cs
        adds    x2, x2, 1
        cset    x3, cs
        cmp     x4, x3
        bls     .L14
.L10:	ret
.L14:	bne     .L13
        cmp     x0, x2
        bhi     .L10
.L13:	b       doit

test_0:
        adds    x1, x1, 1
        bcs     .L18
        cmp     x0, x1
        bls     .L18
        ret
.L18:	b       doit

test_1:
        cmp     x0, x1
        bls     .L21
        ret
.L21:	b       doit

test_10:
        sub     x1, x1, #9
        cmp     x0, x1
        bls     .L24
        ret
.L24:	b       doit

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/1] arm64: Simplify __range_ok
  2020-03-21  5:13 [RFC 0/1] aarch64: Simplify __range_ok Richard Henderson
@ 2020-03-21  5:13 ` Richard Henderson
  2020-03-23 11:28   ` Mark Rutland
  0 siblings, 1 reply; 3+ messages in thread
From: Richard Henderson @ 2020-03-21  5:13 UTC (permalink / raw)
  To: linux-arm-kernel; +Cc: mark.rutland, robin.murphy

The general case is not quite as compact as the inline assembly,
but with a sufficiently advanced compiler it is only 6 insns vs 5.

The real improvement comes from assuming that limit is never tiny,
and using __builtin_constant_p to make sure the constant folding
does not go awry.  This produces a 2 insn sequence even for older
compilers.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 arch/arm64/include/asm/uaccess.h | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 32fc8061aa76..683727696dc3 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -60,7 +60,8 @@ static inline void set_fs(mm_segment_t fs)
  */
 static inline unsigned long __range_ok(const void __user *addr, unsigned long size)
 {
-	unsigned long ret, limit = current_thread_info()->addr_limit;
+	unsigned long limit = current_thread_info()->addr_limit;
+	unsigned long iaddr;
 
 	/*
 	 * Asynchronous I/O running in a kernel thread does not have the
@@ -72,24 +73,18 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si
 		addr = untagged_addr(addr);
 
 	__chk_user_ptr(addr);
-	asm volatile(
-	// A + B <= C + 1 for all A,B,C, in four easy steps:
-	// 1: X = A + B; X' = X % 2^64
-	"	adds	%0, %3, %2\n"
-	// 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4
-	"	csel	%1, xzr, %1, hi\n"
-	// 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X'
-	//    to compensate for the carry flag being set in step 4. For
-	//    X > 2^64, X' merely has to remain nonzero, which it does.
-	"	csinv	%0, %0, xzr, cc\n"
-	// 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1
-	//    comes from the carry in being clear. Otherwise, we are
-	//    testing X' - C == 0, subject to the previous adjustments.
-	"	sbcs	xzr, %0, %1\n"
-	"	cset	%0, ls\n"
-	: "=&r" (ret), "+r" (limit) : "Ir" (size), "0" (addr) : "cc");
 
-	return ret;
+	/*
+	 * Quite a lot of range checks use sizeof(some_type), and are
+	 * therefore constant.  If we can assume that limit is never unusably
+	 * small, then we can rearrange the computation to avoid the need for
+	 * 65-bit arithmetic.  Arbitrary choice for size limit of 1MiB.
+	 */
+	iaddr = (unsigned long)addr;
+	if (__builtin_constant_p(size) && size > 0 && size < 0x100000)
+		return iaddr <= limit + 1 - size;
+
+	return (__uint128_t)iaddr + size <= (__uint128_t)limit + 1;
 }
 
 #define access_ok(addr, size)	__range_ok(addr, size)
-- 
2.20.1


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH 1/1] arm64: Simplify __range_ok
  2020-03-21  5:13 ` [PATCH 1/1] arm64: " Richard Henderson
@ 2020-03-23 11:28   ` Mark Rutland
  0 siblings, 0 replies; 3+ messages in thread
From: Mark Rutland @ 2020-03-23 11:28 UTC (permalink / raw)
  To: Richard Henderson; +Cc: robin.murphy, linux-arm-kernel

On Fri, Mar 20, 2020 at 10:13:52PM -0700, Richard Henderson wrote:
> The general case is not quite as compact as the inline assembly,
> but with a sufficiently advanced compiler it is only 6 insns vs 5.
> 
> The real improvement comes from assuming that limit is never tiny,
> and using __builtin_constant_p to make sure the constant folding
> does not go awry.  This produces a 2 insn sequence even for older
> compilers.

Neat; thanks for putting this together!

Do you happen to have numbers for the impact on a defconfig Image size
(or vmlinux .text size)?

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  arch/arm64/include/asm/uaccess.h | 31 +++++++++++++------------------
>  1 file changed, 13 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
> index 32fc8061aa76..683727696dc3 100644
> --- a/arch/arm64/include/asm/uaccess.h
> +++ b/arch/arm64/include/asm/uaccess.h
> @@ -60,7 +60,8 @@ static inline void set_fs(mm_segment_t fs)
>   */
>  static inline unsigned long __range_ok(const void __user *addr, unsigned long size)
>  {
> -	unsigned long ret, limit = current_thread_info()->addr_limit;
> +	unsigned long limit = current_thread_info()->addr_limit;
> +	unsigned long iaddr;

Trivial: could we move the initialisation here, please?

>  
>  	/*
>  	 * Asynchronous I/O running in a kernel thread does not have the
> @@ -72,24 +73,18 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si
>  		addr = untagged_addr(addr);
>  
>  	__chk_user_ptr(addr);
> -	asm volatile(
> -	// A + B <= C + 1 for all A,B,C, in four easy steps:
> -	// 1: X = A + B; X' = X % 2^64
> -	"	adds	%0, %3, %2\n"
> -	// 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4
> -	"	csel	%1, xzr, %1, hi\n"
> -	// 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X'
> -	//    to compensate for the carry flag being set in step 4. For
> -	//    X > 2^64, X' merely has to remain nonzero, which it does.
> -	"	csinv	%0, %0, xzr, cc\n"
> -	// 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1
> -	//    comes from the carry in being clear. Otherwise, we are
> -	//    testing X' - C == 0, subject to the previous adjustments.
> -	"	sbcs	xzr, %0, %1\n"
> -	"	cset	%0, ls\n"
> -	: "=&r" (ret), "+r" (limit) : "Ir" (size), "0" (addr) : "cc");
>  
> -	return ret;
> +	/*
> +	 * Quite a lot of range checks use sizeof(some_type), and are
> +	 * therefore constant.  If we can assume that limit is never unusably
> +	 * small, then we can rearrange the computation to avoid the need for
> +	 * 65-bit arithmetic.  Arbitrary choice for size limit of 1MiB.
> +	 */
> +	iaddr = (unsigned long)addr;
> +	if (__builtin_constant_p(size) && size > 0 && size < 0x100000)
> +		return iaddr <= limit + 1 - size;

The limit should be either USER_DS or KERNEL_DS, where USER_DS is
smaller than KERNEL_DS, so we could derive a less arbitrary bound from
USER_DS.

Thanks,
Mark.

> +
> +	return (__uint128_t)iaddr + size <= (__uint128_t)limit + 1;
>  }
>  
>  #define access_ok(addr, size)	__range_ok(addr, size)
> -- 
> 2.20.1
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-03-23 11:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-03-21  5:13 [RFC 0/1] aarch64: Simplify __range_ok Richard Henderson
2020-03-21  5:13 ` [PATCH 1/1] arm64: " Richard Henderson
2020-03-23 11:28   ` Mark Rutland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox