[PATCH bpf-next] bpf,x64: use shrx/sarx/shlx when available

public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH bpf-next] bpf,x64: use shrx/sarx/shlx when available
@ 2022-09-21  2:21 Jie Meng
  2022-09-22 15:07 ` Daniel Borkmann
  0 siblings, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-09-21  2:21 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
alternatives provided in BMI2

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c                | 53 ++++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/jit.c |  7 +--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ae89f4143eb4..81a3b34327ae 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 p)
+{
+	u8 *prog = *pprog;
+	u8 b0 = 0xc4, b1, b2;
+	u8 src2 = reg2hex[src_reg2];
+
+	if (is_ereg(src_reg2))
+		src2 |= 1 << 3;
+
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			if (boot_cpu_has(X86_FEATURE_BMI2)) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool r = is_ereg(dst_reg);
+				u8 m = 2; /* escape code 0f38 */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 p;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					p = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					p = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					p = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_3vex(&prog, r, false, r, m,
+					  w, src_reg, false, p);
+				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
 
+				break;
+			}
 			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
index 79021c30e51e..3323b93f0972 100644
--- a/tools/testing/selftests/bpf/verifier/jit.c
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -4,15 +4,16 @@
 	BPF_MOV64_IMM(BPF_REG_0, 1),
 	BPF_MOV64_IMM(BPF_REG_1, 0xff),
 	BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1),
-	BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1),
+	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
 	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
 	BPF_EXIT_INSN(),
-	BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1),
+	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_0),
 	BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1),
 	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1),
 	BPF_EXIT_INSN(),
 	BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1),
-	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1),
+	BPF_ALU32_REG(BPF_ARSH, BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3f, 1),
 	BPF_EXIT_INSN(),
 	BPF_MOV64_IMM(BPF_REG_0, 2),
 	BPF_EXIT_INSN(),
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next] bpf,x64: use shrx/sarx/shlx when available
  2022-09-21  2:21 [PATCH bpf-next] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-09-22 15:07 ` Daniel Borkmann
  2022-09-24  0:32   ` [PATCH bpf-next v2 0/2] bpf,x64: Use BMI2 for shifts Jie Meng
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Borkmann @ 2022-09-22 15:07 UTC (permalink / raw)
  To: Jie Meng, bpf, ast, andrii

On 9/21/22 4:21 AM, Jie Meng wrote:
> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> alternatives provided in BMI2
> 
> Signed-off-by: Jie Meng <jmeng@fb.com>
> ---
>   arch/x86/net/bpf_jit_comp.c                | 53 ++++++++++++++++++++++
>   tools/testing/selftests/bpf/verifier/jit.c |  7 +--
>   2 files changed, 57 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index ae89f4143eb4..81a3b34327ae 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
>   	*pprog = prog;
>   }
>   
> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> +		      bool w, u8 src_reg2, bool l, u8 p)
> +{
> +	u8 *prog = *pprog;
> +	u8 b0 = 0xc4, b1, b2;
> +	u8 src2 = reg2hex[src_reg2];
> +
> +	if (is_ereg(src_reg2))
> +		src2 |= 1 << 3;
> +
> +	/*
> +	 *    7                           0
> +	 *  +---+---+---+---+---+---+---+---+
> +	 *  |~R |~X |~B |         m         |
> +	 *  +---+---+---+---+---+---+---+---+
> +	 */
> +	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> +	/*
> +	 *    7                           0
> +	 *  +---+---+---+---+---+---+---+---+
> +	 *  | W |     ~vvvv     | L |   pp  |
> +	 *  +---+---+---+---+---+---+---+---+
> +	 */
> +	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> +
> +	EMIT3(b0, b1, b2);
> +	*pprog = prog;
> +}
> +
>   #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>   
>   static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> @@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>   		case BPF_ALU64 | BPF_LSH | BPF_X:
>   		case BPF_ALU64 | BPF_RSH | BPF_X:
>   		case BPF_ALU64 | BPF_ARSH | BPF_X:
> +			if (boot_cpu_has(X86_FEATURE_BMI2)) {
> +				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> +				bool r = is_ereg(dst_reg);
> +				u8 m = 2; /* escape code 0f38 */
> +				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> +				u8 p;
> +
> +				switch (BPF_OP(insn->code)) {
> +				case BPF_LSH:
> +					p = 1; /* prefix 0x66 */
> +					break;
> +				case BPF_RSH:
> +					p = 3; /* prefix 0xf2 */
> +					break;
> +				case BPF_ARSH:
> +					p = 2; /* prefix 0xf3 */
> +					break;
> +				}
> +
> +				emit_3vex(&prog, r, false, r, m,
> +					  w, src_reg, false, p);
> +				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
>   
> +				break;
> +			}
>   			/* Check for bad case when dst_reg == rcx */
>   			if (dst_reg == BPF_REG_4) {
>   				/* mov r11, dst_reg */

Nice!

> diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
> index 79021c30e51e..3323b93f0972 100644
> --- a/tools/testing/selftests/bpf/verifier/jit.c
> +++ b/tools/testing/selftests/bpf/verifier/jit.c
> @@ -4,15 +4,16 @@
>   	BPF_MOV64_IMM(BPF_REG_0, 1),
>   	BPF_MOV64_IMM(BPF_REG_1, 0xff),
>   	BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1),
> -	BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1),
> +	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
>   	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
>   	BPF_EXIT_INSN(),
> -	BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1),
> +	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_0),
>   	BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1),
>   	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1),
>   	BPF_EXIT_INSN(),
>   	BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1),
> -	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1),
> +	BPF_ALU32_REG(BPF_ARSH, BPF_REG_1, BPF_REG_0),
> +	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3f, 1),

Could you add this as separate commit with adding tests rather than changing
existing ones?

Would also be great to include lib/test_bpf.ko test results into above commit
message given it has more extensive JIT tests than test_verifier.

>   	BPF_EXIT_INSN(),
>   	BPF_MOV64_IMM(BPF_REG_0, 2),
>   	BPF_EXIT_INSN(),
> 


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v2 0/2] bpf,x64: Use BMI2 for shifts
  2022-09-22 15:07 ` Daniel Borkmann
@ 2022-09-24  0:32   ` Jie Meng
  2022-09-24  0:32     ` [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available Jie Meng
  2022-09-24  0:32     ` [PATCH bpf-next v2 2/2] bpf: Add " Jie Meng
  0 siblings, 2 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-24  0:32 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

With baseline x64 instruction set, shift count can only be an immediate
or in %cl. The implicit dependency on %cl makes it necessary to shuffle
registers around and/or add push/pop operations.

BMI2 provides shift instructions that can use any general register as
the shift count, saving us instructions and bytes in most cases.

test_progs: Summary: 267/1340 PASSED, 25 SKIPPED, 0 FAILED
test_progs-no_alu32: Summary: 267/1333 PASSED, 26 SKIPPED, 0 FAILED
test_verifier: Summary: 1367 PASSED, 636 SKIPPED, 0 FAILED
test_maps: OK, 0 SKIPPED
lib/test_bpf:
  test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed]
  test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed]
  test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED

Jie Meng (2):
  bpf,x64: use shrx/sarx/shlx when available
  bpf: Add selftests for lsh, rsh, arsh with reg operand

 arch/x86/net/bpf_jit_comp.c                | 53 ++++++++++++++++++++++
 tools/testing/selftests/bpf/verifier/jit.c | 22 +++++++++
 2 files changed, 75 insertions(+)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available
  2022-09-24  0:32   ` [PATCH bpf-next v2 0/2] bpf,x64: Use BMI2 for shifts Jie Meng
@ 2022-09-24  0:32     ` Jie Meng
  2022-09-26 19:16       ` Daniel Borkmann
  2022-09-24  0:32     ` [PATCH bpf-next v2 2/2] bpf: Add " Jie Meng
  1 sibling, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-09-24  0:32 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
alternatives provided in BMI2

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 53 +++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ae89f4143eb4..2227d81a5e44 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 p)
+{
+	u8 *prog = *pprog;
+	u8 b0 = 0xc4, b1, b2;
+	u8 src2 = reg2hex[src_reg2];
+
+	if (is_ereg(src_reg2))
+		src2 |= 1 << 3;
+
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool r = is_ereg(dst_reg);
+				u8 m = 2; /* escape code 0f38 */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 p;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					p = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					p = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					p = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_3vex(&prog, r, false, r, m,
+					  w, src_reg, false, p);
+				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
 
+				break;
+			}
 			/* Check for bad case when dst_reg == rcx */
 			if (dst_reg == BPF_REG_4) {
 				/* mov r11, dst_reg */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v2 2/2] bpf: Add selftests for lsh, rsh, arsh with reg operand
  2022-09-24  0:32   ` [PATCH bpf-next v2 0/2] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-09-24  0:32     ` [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-09-24  0:32     ` Jie Meng
  1 sibling, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-24  0:32 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Current tests cover only shifts with an immediate as the source
operand/shift counts; add a new test case to cover register operand.

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 tools/testing/selftests/bpf/verifier/jit.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
index 79021c30e51e..44f2e0c614b8 100644
--- a/tools/testing/selftests/bpf/verifier/jit.c
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -20,6 +20,28 @@
 	.result = ACCEPT,
 	.retval = 2,
 },
+{
+	"jit: lsh, rsh, arsh by reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 1),
+	BPF_MOV64_IMM(BPF_REG_1, 0xff),
+	BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
+	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_0),
+	BPF_ALU32_REG(BPF_RSH, BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_ARSH, BPF_REG_1, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 2),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.retval = 2,
+},
 {
 	"jit: mov32 for ldimm64, 1",
 	.insns = {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available
  2022-09-24  0:32     ` [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-09-26 19:16       ` Daniel Borkmann
  2022-09-27  0:38         ` Jie Meng
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Borkmann @ 2022-09-26 19:16 UTC (permalink / raw)
  To: Jie Meng, bpf, ast, andrii

On 9/24/22 2:32 AM, Jie Meng wrote:
> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> alternatives provided in BMI2
> 
> Signed-off-by: Jie Meng <jmeng@fb.com>
> ---
>   arch/x86/net/bpf_jit_comp.c | 53 +++++++++++++++++++++++++++++++++++++
>   1 file changed, 53 insertions(+)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index ae89f4143eb4..2227d81a5e44 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
>   	*pprog = prog;
>   }
>   
> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> +		      bool w, u8 src_reg2, bool l, u8 p)
> +{
> +	u8 *prog = *pprog;
> +	u8 b0 = 0xc4, b1, b2;
> +	u8 src2 = reg2hex[src_reg2];
> +
> +	if (is_ereg(src_reg2))
> +		src2 |= 1 << 3;
> +
> +	/*
> +	 *    7                           0
> +	 *  +---+---+---+---+---+---+---+---+
> +	 *  |~R |~X |~B |         m         |
> +	 *  +---+---+---+---+---+---+---+---+
> +	 */
> +	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> +	/*
> +	 *    7                           0
> +	 *  +---+---+---+---+---+---+---+---+
> +	 *  | W |     ~vvvv     | L |   pp  |
> +	 *  +---+---+---+---+---+---+---+---+
> +	 */
> +	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> +
> +	EMIT3(b0, b1, b2);
> +	*pprog = prog;
> +}
> +
>   #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>   
>   static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> @@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>   		case BPF_ALU64 | BPF_LSH | BPF_X:
>   		case BPF_ALU64 | BPF_RSH | BPF_X:
>   		case BPF_ALU64 | BPF_ARSH | BPF_X:
> +			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> +				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> +				bool r = is_ereg(dst_reg);
> +				u8 m = 2; /* escape code 0f38 */
> +				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);

Looks like you just pass all the above vars into emit_3vex(), so why not hide them
there directly? The only thing really needed is p (and should probably be called op?),
so you just pass emit_3vex(&prog, op, dst_reg, src_reg).. please also improve the
commit message a bit, e.g. before/after disasm + opcode hexdump example (e.g. extract
from bpftool dump) would be nice and also add a sentence about the BPF_REG_4 limitation
case.

> +				u8 p;
> +
> +				switch (BPF_OP(insn->code)) {
> +				case BPF_LSH:
> +					p = 1; /* prefix 0x66 */
> +					break;
> +				case BPF_RSH:
> +					p = 3; /* prefix 0xf2 */
> +					break;
> +				case BPF_ARSH:
> +					p = 2; /* prefix 0xf3 */
> +					break;
> +				}
> +
> +				emit_3vex(&prog, r, false, r, m,
> +					  w, src_reg, false, p);
> +				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
>   
> +				break;
> +			}
>   			/* Check for bad case when dst_reg == rcx */
>   			if (dst_reg == BPF_REG_4) {
>   				/* mov r11, dst_reg */
> 


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available
  2022-09-26 19:16       ` Daniel Borkmann
@ 2022-09-27  0:38         ` Jie Meng
  2022-09-27  9:45           ` Daniel Borkmann
  0 siblings, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-09-27  0:38 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: bpf, ast, andrii

On Mon, Sep 26, 2022 at 09:16:41PM +0200, Daniel Borkmann wrote:
> On 9/24/22 2:32 AM, Jie Meng wrote:
> > Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> > alternatives provided in BMI2
> > 
> > Signed-off-by: Jie Meng <jmeng@fb.com>
> > ---
> >   arch/x86/net/bpf_jit_comp.c | 53 +++++++++++++++++++++++++++++++++++++
> >   1 file changed, 53 insertions(+)
> > 
> > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > index ae89f4143eb4..2227d81a5e44 100644
> > --- a/arch/x86/net/bpf_jit_comp.c
> > +++ b/arch/x86/net/bpf_jit_comp.c
> > @@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
> >   	*pprog = prog;
> >   }
> > +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> > +		      bool w, u8 src_reg2, bool l, u8 p)
> > +{
> > +	u8 *prog = *pprog;
> > +	u8 b0 = 0xc4, b1, b2;
> > +	u8 src2 = reg2hex[src_reg2];
> > +
> > +	if (is_ereg(src_reg2))
> > +		src2 |= 1 << 3;
> > +
> > +	/*
> > +	 *    7                           0
> > +	 *  +---+---+---+---+---+---+---+---+
> > +	 *  |~R |~X |~B |         m         |
> > +	 *  +---+---+---+---+---+---+---+---+
> > +	 */
> > +	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> > +	/*
> > +	 *    7                           0
> > +	 *  +---+---+---+---+---+---+---+---+
> > +	 *  | W |     ~vvvv     | L |   pp  |
> > +	 *  +---+---+---+---+---+---+---+---+
> > +	 */
> > +	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> > +
> > +	EMIT3(b0, b1, b2);
> > +	*pprog = prog;
> > +}
> > +
> >   #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
> >   static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> > @@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> >   		case BPF_ALU64 | BPF_LSH | BPF_X:
> >   		case BPF_ALU64 | BPF_RSH | BPF_X:
> >   		case BPF_ALU64 | BPF_ARSH | BPF_X:
> > +			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> > +				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> > +				bool r = is_ereg(dst_reg);
> > +				u8 m = 2; /* escape code 0f38 */
> > +				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> 
> Looks like you just pass all the above vars into emit_3vex(), so why not hide them
> there directly? The only thing really needed is p (and should probably be called op?),
> so you just pass emit_3vex(&prog, op, dst_reg, src_reg).. 

emit_3vex() is to encode the 3 bytes VEX prefix and exposes all the
information that can be encoded. The wish is to make it reusable for future
instructions that may use VEX so I deliberately avoided hardcoding anything that is specific to a particular instruction.

> please also improve the
> commit message a bit, e.g. before/after disasm + opcode hexdump example (e.g. extract
> from bpftool dump) would be nice and also add a sentence about the BPF_REG_4 limitation
> case.
>

Sure I can do that but would like to know your opinion about emit_3vex()
first.
 
> > +				u8 p;
> > +
> > +				switch (BPF_OP(insn->code)) {
> > +				case BPF_LSH:
> > +					p = 1; /* prefix 0x66 */
> > +					break;
> > +				case BPF_RSH:
> > +					p = 3; /* prefix 0xf2 */
> > +					break;
> > +				case BPF_ARSH:
> > +					p = 2; /* prefix 0xf3 */
> > +					break;
> > +				}
> > +
> > +				emit_3vex(&prog, r, false, r, m,
> > +					  w, src_reg, false, p);
> > +				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> > +				break;
> > +			}
> >   			/* Check for bad case when dst_reg == rcx */
> >   			if (dst_reg == BPF_REG_4) {
> >   				/* mov r11, dst_reg */
> > 
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available
  2022-09-27  0:38         ` Jie Meng
@ 2022-09-27  9:45           ` Daniel Borkmann
  2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Borkmann @ 2022-09-27  9:45 UTC (permalink / raw)
  To: Jie Meng; +Cc: bpf, ast, andrii

On 9/27/22 2:38 AM, Jie Meng wrote:
> On Mon, Sep 26, 2022 at 09:16:41PM +0200, Daniel Borkmann wrote:
>> On 9/24/22 2:32 AM, Jie Meng wrote:
>>> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
>>> alternatives provided in BMI2
>>>
>>> Signed-off-by: Jie Meng <jmeng@fb.com>
>>> ---
>>>    arch/x86/net/bpf_jit_comp.c | 53 +++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 53 insertions(+)
>>>
>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>> index ae89f4143eb4..2227d81a5e44 100644
>>> --- a/arch/x86/net/bpf_jit_comp.c
>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>> @@ -889,6 +889,35 @@ static void emit_nops(u8 **pprog, int len)
>>>    	*pprog = prog;
>>>    }
>>> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
>>> +		      bool w, u8 src_reg2, bool l, u8 p)
>>> +{
>>> +	u8 *prog = *pprog;
>>> +	u8 b0 = 0xc4, b1, b2;
>>> +	u8 src2 = reg2hex[src_reg2];
>>> +
>>> +	if (is_ereg(src_reg2))
>>> +		src2 |= 1 << 3;
>>> +
>>> +	/*
>>> +	 *    7                           0
>>> +	 *  +---+---+---+---+---+---+---+---+
>>> +	 *  |~R |~X |~B |         m         |
>>> +	 *  +---+---+---+---+---+---+---+---+
>>> +	 */
>>> +	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
>>> +	/*
>>> +	 *    7                           0
>>> +	 *  +---+---+---+---+---+---+---+---+
>>> +	 *  | W |     ~vvvv     | L |   pp  |
>>> +	 *  +---+---+---+---+---+---+---+---+
>>> +	 */
>>> +	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
>>> +
>>> +	EMIT3(b0, b1, b2);
>>> +	*pprog = prog;
>>> +}
>>> +
>>>    #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>>>    static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
>>> @@ -1135,7 +1164,31 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>>>    		case BPF_ALU64 | BPF_LSH | BPF_X:
>>>    		case BPF_ALU64 | BPF_RSH | BPF_X:
>>>    		case BPF_ALU64 | BPF_ARSH | BPF_X:
>>> +			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
>>> +				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
>>> +				bool r = is_ereg(dst_reg);
>>> +				u8 m = 2; /* escape code 0f38 */
>>> +				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
>>
>> Looks like you just pass all the above vars into emit_3vex(), so why not hide them
>> there directly? The only thing really needed is p (and should probably be called op?),
>> so you just pass emit_3vex(&prog, op, dst_reg, src_reg)..
> 
> emit_3vex() is to encode the 3 bytes VEX prefix and exposes all the
> information that can be encoded. The wish is to make it reusable for future
> instructions that may use VEX so I deliberately avoided hardcoding anything that is specific to a particular instruction.

This bit of context was missing from your description, but I also think it's okay
to do the refactor when the time comes where this gets reused. (You could also just
hide these in an emit_shift which calls emit_3vex or such.. and explain your rationale
in the commit message.)

>> please also improve the
>> commit message a bit, e.g. before/after disasm + opcode hexdump example (e.g. extract
>> from bpftool dump) would be nice and also add a sentence about the BPF_REG_4 limitation
>> case.
> 
> Sure I can do that but would like to know your opinion about emit_3vex()
> first.
>   
>>> +				u8 p;
>>> +
>>> +				switch (BPF_OP(insn->code)) {
>>> +				case BPF_LSH:
>>> +					p = 1; /* prefix 0x66 */
>>> +					break;
>>> +				case BPF_RSH:
>>> +					p = 3; /* prefix 0xf2 */
>>> +					break;
>>> +				case BPF_ARSH:
>>> +					p = 2; /* prefix 0xf3 */
>>> +					break;
>>> +				}
>>> +
>>> +				emit_3vex(&prog, r, false, r, m,
>>> +					  w, src_reg, false, p);
>>> +				EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
>>> +				break;
>>> +			}
>>>    			/* Check for bad case when dst_reg == rcx */
>>>    			if (dst_reg == BPF_REG_4) {
>>>    				/* mov r11, dst_reg */
>>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts
  2022-09-27  9:45           ` Daniel Borkmann
@ 2022-09-27 18:57             ` Jie Meng
  2022-09-27 18:57               ` [PATCH bpf-next v3 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
                                 ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-27 18:57 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

With baseline x64 instruction set, shift count can only be an immediate
or in %cl. The implicit dependency on %cl makes it necessary to shuffle
registers around and/or add push/pop operations.

BMI2 provides shift instructions that can use any general register as
the shift count, saving us instructions and a few bytes in most cases.

Suboptimal codegen when %ecx is source and/or destination is also
addressed and unnecessary instructions are removed.

test_progs: Summary: 267/1340 PASSED, 25 SKIPPED, 0 FAILED
test_progs-no_alu32: Summary: 267/1333 PASSED, 26 SKIPPED, 0 FAILED
test_verifier: Summary: 1367 PASSED, 636 SKIPPED, 0 FAILED (same result
 with or without BMI2)
test_maps: OK, 0 SKIPPED
lib/test_bpf:
  test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed]
  test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed]
  test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED

Jie Meng (3):
  bpf,x64: avoid unnecessary instructions when shift dest is ecx
  bpf,x64: use shrx/sarx/shlx when available
  bpf: add selftests for lsh, rsh, arsh with reg operand

 arch/x86/net/bpf_jit_comp.c                | 94 ++++++++++++++++++----
 tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++
 2 files changed, 104 insertions(+), 14 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v3 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx
  2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
@ 2022-09-27 18:57               ` Jie Meng
  2022-09-27 18:58               ` [PATCH bpf-next v3 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-27 18:57 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

x64 JIT produces redundant instructions when a shift operation's
destination register is BPF_REG_4/ecx and this patch removes them.

Specifically, when dest reg is BPF_REG_4 but the src isn't, we
needn't push and pop ecx around shift only to get it overwritten
by r11 immediately afterwards.

In the rare case when both dest and src registers are BPF_REG_4,
a single shift instruction is sufficient and we don't need the
two MOV instructions around the shift.

To summarize using shift left as an example, without patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   mov r11, ecx   |    shl dst, cl
            |   shl r11, ecx   |
            |   mov ecx, r11   |
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   push ecx       |    mov ecx, src
            |   mov ecx, src   |    shl dst, cl
            |   shl r11, cl    |    pop ecx
            |   pop ecx        |
            |   mov ecx, r11   |
-------------------------------------------------

With patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   shl ecx, cl    |    shl dst, cl
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   mov ecx, src   |    mov ecx, src
            |   shl r11, cl    |    shl dst, cl
            |   mov ecx, r11   |    pop ecx
-------------------------------------------------

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 35796db58116..6a5c59f1e6f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1136,18 +1136,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* Check for bad case when dst_reg == rcx */
-			if (dst_reg == BPF_REG_4) {
-				/* mov r11, dst_reg */
-				EMIT_mov(AUX_REG, dst_reg);
-				dst_reg = AUX_REG;
-			}
-
 			if (src_reg != BPF_REG_4) { /* common case */
-				EMIT1(0x51); /* push rcx */
-
-				/* mov rcx, src_reg */
-				EMIT_mov(BPF_REG_4, src_reg);
+				/* Check for bad case when dst_reg == rcx */
+				if (dst_reg == BPF_REG_4) {
+					/* mov r11, dst_reg */
+					EMIT_mov(AUX_REG, dst_reg);
+					dst_reg = AUX_REG;
+				} else {
+					EMIT1(0x51); /* push rcx */
+
+					/* mov rcx, src_reg */
+					EMIT_mov(BPF_REG_4, src_reg);
+				}
 			}
 
 			/* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
@@ -1157,12 +1157,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			b3 = simple_alu_opcodes[BPF_OP(insn->code)];
 			EMIT2(0xD3, add_1reg(b3, dst_reg));
 
-			if (src_reg != BPF_REG_4)
-				EMIT1(0x59); /* pop rcx */
+			if (src_reg != BPF_REG_4) {
+				if (insn->dst_reg == BPF_REG_4)
+					/* mov dst_reg, r11 */
+					EMIT_mov(insn->dst_reg, AUX_REG);
+				else
+					EMIT1(0x59); /* pop rcx */
+			}
 
-			if (insn->dst_reg == BPF_REG_4)
-				/* mov dst_reg, r11 */
-				EMIT_mov(insn->dst_reg, AUX_REG);
 			break;
 
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v3 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-09-27 18:57               ` [PATCH bpf-next v3 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
@ 2022-09-27 18:58               ` Jie Meng
  2022-09-27 18:58               ` [PATCH bpf-next v3 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
  2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-27 18:58 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
alternatives provided in BMI2 when advantageous; keep using the non BMI2
instructions when shift count is already in BPF_REG_4/rcx as non BMI2
instructions are shorter.

To summarize, when BMI2 is available:
-------------------------------------------------
            |   arbitrary dst
=================================================
src == ecx  |   shl dst, cl
-------------------------------------------------
src != ecx  |   shlx dst, dst, src
-------------------------------------------------

A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
%rdi:

Without BMI2:

 ef3:   push   %rcx
        51
 ef4:   mov    %rdi,%rcx
        48 89 f9
 ef7:   shl    %cl,%rsi
        48 d3 e6
 efa:   pop    %rcx
        59

With BMI2:

 f0b:   shlx   %rdi,%rsi,%rsi
        c4 e2 c1 f7 f6

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 6a5c59f1e6f9..f91eac901c32 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+/* emit the 3-byte VEX prefix */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 p)
+{
+	u8 *prog = *pprog;
+	u8 b0 = 0xc4, b1, b2;
+	u8 src2 = reg2hex[src_reg2];
+
+	if (is_ereg(src_reg2))
+		src2 |= 1 << 3;
+
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+	u8 *prog = *pprog;
+	bool r = is_ereg(dst_reg);
+	u8 m = 2; /* escape code 0f38 */
+
+	emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+	EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			/* BMI2 shifts aren't better when shift count is already in rcx */
+			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 op;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					op = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					op = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					op = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_shiftx(&prog, dst_reg, src_reg, w, op);
+
+				break;
+			}
 
 			if (src_reg != BPF_REG_4) { /* common case */
 				/* Check for bad case when dst_reg == rcx */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v3 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand
  2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-09-27 18:57               ` [PATCH bpf-next v3 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
  2022-09-27 18:58               ` [PATCH bpf-next v3 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-09-27 18:58               ` Jie Meng
  2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-09-27 18:58 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Current tests cover only shifts with an immediate as the source
operand/shift counts; add a new test case to cover register operand.

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
index 79021c30e51e..8bf37e5207f1 100644
--- a/tools/testing/selftests/bpf/verifier/jit.c
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -20,6 +20,30 @@
 	.result = ACCEPT,
 	.retval = 2,
 },
+{
+	"jit: lsh, rsh, arsh by reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 1),
+	BPF_MOV64_IMM(BPF_REG_4, 1),
+	BPF_MOV64_IMM(BPF_REG_1, 0xff),
+	BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
+	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_4),
+	BPF_MOV64_REG(BPF_REG_4, BPF_REG_1),
+	BPF_ALU32_REG(BPF_RSH, BPF_REG_4, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xff, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_ARSH, BPF_REG_4, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 2),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.retval = 2,
+},
 {
 	"jit: mov32 for ldimm64, 1",
 	.insns = {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts
  2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
                                 ` (2 preceding siblings ...)
  2022-09-27 18:58               ` [PATCH bpf-next v3 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
@ 2022-10-02  5:11               ` Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
                                   ` (2 more replies)
  3 siblings, 3 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-02  5:11 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

With baseline x64 instruction set, shift count can only be an immediate
or in %cl. The implicit dependency on %cl makes it necessary to shuffle
registers around and/or add push/pop operations.

BMI2 provides shift instructions that can use any general register as
the shift count, saving us instructions and a few bytes in most cases.

Suboptimal codegen when %ecx is source and/or destination is also
addressed and unnecessary instructions are removed.

test_progs: Summary: 267/1340 PASSED, 25 SKIPPED, 0 FAILED
test_progs-no_alu32: Summary: 267/1333 PASSED, 26 SKIPPED, 0 FAILED
test_verifier: Summary: 1367 PASSED, 636 SKIPPED, 0 FAILED (same result
 with or without BMI2)
test_maps: OK, 0 SKIPPED
lib/test_bpf:
  test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed]
  test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed]
  test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED

---
v3 -> v4:
- Fixed a regression when BMI2 isn't available

Jie Meng (3):
  bpf,x64: avoid unnecessary instructions when shift dest is ecx
  bpf,x64: use shrx/sarx/shlx when available
  bpf: add selftests for lsh, rsh, arsh with reg operand

 arch/x86/net/bpf_jit_comp.c                | 89 +++++++++++++++++++---
 tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++
 2 files changed, 101 insertions(+), 12 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v4 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx
  2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
@ 2022-10-02  5:11                 ` Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 " Jie Meng
  2 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-02  5:11 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

x64 JIT produces redundant instructions when a shift operation's
destination register is BPF_REG_4/ecx and this patch removes them.

Specifically, when dest reg is BPF_REG_4 but the src isn't, we
needn't push and pop ecx around shift only to get it overwritten
by r11 immediately afterwards.

In the rare case when both dest and src registers are BPF_REG_4,
a single shift instruction is sufficient and we don't need the
two MOV instructions around the shift.

To summarize using shift left as an example, without patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   mov r11, ecx   |    shl dst, cl
            |   shl r11, ecx   |
            |   mov ecx, r11   |
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   push ecx       |    mov ecx, src
            |   mov ecx, src   |    shl dst, cl
            |   shl r11, cl    |    pop ecx
            |   pop ecx        |
            |   mov ecx, r11   |
-------------------------------------------------

With patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   shl ecx, cl    |    shl dst, cl
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   mov ecx, src   |    mov ecx, src
            |   shl r11, cl    |    shl dst, cl
            |   mov ecx, r11   |    pop ecx
-------------------------------------------------

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5b6230779cf3..d9ba997c5891 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1136,16 +1136,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* Check for bad case when dst_reg == rcx */
-			if (dst_reg == BPF_REG_4) {
-				/* mov r11, dst_reg */
-				EMIT_mov(AUX_REG, dst_reg);
-				dst_reg = AUX_REG;
-			}
-
 			if (src_reg != BPF_REG_4) { /* common case */
-				EMIT1(0x51); /* push rcx */
-
+				/* Check for bad case when dst_reg == rcx */
+				if (dst_reg == BPF_REG_4) {
+					/* mov r11, dst_reg */
+					EMIT_mov(AUX_REG, dst_reg);
+					dst_reg = AUX_REG;
+				} else {
+					EMIT1(0x51); /* push rcx */
+				}
 				/* mov rcx, src_reg */
 				EMIT_mov(BPF_REG_4, src_reg);
 			}
@@ -1157,12 +1156,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			b3 = simple_alu_opcodes[BPF_OP(insn->code)];
 			EMIT2(0xD3, add_1reg(b3, dst_reg));
 
-			if (src_reg != BPF_REG_4)
-				EMIT1(0x59); /* pop rcx */
+			if (src_reg != BPF_REG_4) {
+				if (insn->dst_reg == BPF_REG_4)
+					/* mov dst_reg, r11 */
+					EMIT_mov(insn->dst_reg, AUX_REG);
+				else
+					EMIT1(0x59); /* pop rcx */
+			}
 
-			if (insn->dst_reg == BPF_REG_4)
-				/* mov dst_reg, r11 */
-				EMIT_mov(insn->dst_reg, AUX_REG);
 			break;
 
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
@ 2022-10-02  5:11                 ` Jie Meng
  2022-10-06  4:11                   ` KP Singh
  2022-10-02  5:11                 ` [PATCH bpf-next v4 " Jie Meng
  2 siblings, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-10-02  5:11 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
alternatives provided in BMI2 when advantageous; keep using the non BMI2
instructions when shift count is already in BPF_REG_4/rcx as non BMI2
instructions are shorter.

To summarize, when BMI2 is available:
-------------------------------------------------
            |   arbitrary dst
=================================================
src == ecx  |   shl dst, cl
-------------------------------------------------
src != ecx  |   shlx dst, dst, src
-------------------------------------------------

A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
%rdi:

Without BMI2:

 ef3:   push   %rcx
        51
 ef4:   mov    %rdi,%rcx
        48 89 f9
 ef7:   shl    %cl,%rsi
        48 d3 e6
 efa:   pop    %rcx
        59

With BMI2:

 f0b:   shlx   %rdi,%rsi,%rsi
        c4 e2 c1 f7 f6

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d9ba997c5891..d09c54f3d2e0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+/* emit the 3-byte VEX prefix */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 p)
+{
+	u8 *prog = *pprog;
+	u8 b0 = 0xc4, b1, b2;
+	u8 src2 = reg2hex[src_reg2];
+
+	if (is_ereg(src_reg2))
+		src2 |= 1 << 3;
+
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+	u8 *prog = *pprog;
+	bool r = is_ereg(dst_reg);
+	u8 m = 2; /* escape code 0f38 */
+
+	emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+	EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			/* BMI2 shifts aren't better when shift count is already in rcx */
+			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 op;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					op = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					op = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					op = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_shiftx(&prog, dst_reg, src_reg, w, op);
+
+				break;
+			}
 
 			if (src_reg != BPF_REG_4) { /* common case */
 				/* Check for bad case when dst_reg == rcx */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v4 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand
  2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
  2022-10-02  5:11                 ` [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-10-02  5:11                 ` Jie Meng
  2 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-02  5:11 UTC (permalink / raw)
  To: bpf, ast, andrii, daniel; +Cc: Jie Meng

Current tests cover only shifts with an immediate as the source
operand/shift counts; add a new test case to cover register operand.

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
index 79021c30e51e..8bf37e5207f1 100644
--- a/tools/testing/selftests/bpf/verifier/jit.c
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -20,6 +20,30 @@
 	.result = ACCEPT,
 	.retval = 2,
 },
+{
+	"jit: lsh, rsh, arsh by reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 1),
+	BPF_MOV64_IMM(BPF_REG_4, 1),
+	BPF_MOV64_IMM(BPF_REG_1, 0xff),
+	BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
+	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_4),
+	BPF_MOV64_REG(BPF_REG_4, BPF_REG_1),
+	BPF_ALU32_REG(BPF_RSH, BPF_REG_4, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xff, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_ARSH, BPF_REG_4, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 2),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.retval = 2,
+},
 {
 	"jit: mov32 for ldimm64, 1",
 	.insns = {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-10-02  5:11                 ` [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-10-06  4:11                   ` KP Singh
  2022-10-07  3:14                     ` Jie Meng
  0 siblings, 1 reply; 24+ messages in thread
From: KP Singh @ 2022-10-06  4:11 UTC (permalink / raw)
  To: Jie Meng; +Cc: bpf, ast, andrii, daniel

On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
>
> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> alternatives provided in BMI2 when advantageous; keep using the non BMI2
> instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> instructions are shorter.

This is confusing, you mention %CL in the first sentence and then RCX in the
second sentence. Can you clarify this more here?

Also, It would be good to have some explanations about the
performance benefits here as well.

i.e. a load + store + non vector instruction v/s a single vector instruction
omitting the load. How many cycles do we expect in each case, I do expect the
latter to be lesser, but mentioning it in the commit removes any ambiguity.

>
> To summarize, when BMI2 is available:
> -------------------------------------------------
>             |   arbitrary dst
> =================================================
> src == ecx  |   shl dst, cl
> -------------------------------------------------
> src != ecx  |   shlx dst, dst, src
> -------------------------------------------------
>
> A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> %rdi:
>
> Without BMI2:
>
>  ef3:   push   %rcx
>         51
>  ef4:   mov    %rdi,%rcx
>         48 89 f9
>  ef7:   shl    %cl,%rsi
>         48 d3 e6
>  efa:   pop    %rcx
>         59
>
> With BMI2:
>
>  f0b:   shlx   %rdi,%rsi,%rsi
>         c4 e2 c1 f7 f6
>
> Signed-off-by: Jie Meng <jmeng@fb.com>
> ---
>  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 64 insertions(+)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index d9ba997c5891..d09c54f3d2e0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
>         *pprog = prog;
>  }
>
> +/* emit the 3-byte VEX prefix */
> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> +                     bool w, u8 src_reg2, bool l, u8 p)

Can you please use somewhat more descriptive variable names here?

or add more information about what x, b, m, w, l and p mean?

> +{
> +       u8 *prog = *pprog;
> +       u8 b0 = 0xc4, b1, b2;
> +       u8 src2 = reg2hex[src_reg2];
> +
> +       if (is_ereg(src_reg2))
> +               src2 |= 1 << 3;
> +
> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  |~R |~X |~B |         m         |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);

Some explanation here would help, not everyone is aware of x86 vex encoding :)

> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  | W |     ~vvvv     | L |   pp  |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> +
> +       EMIT3(b0, b1, b2);
> +       *pprog = prog;
> +}
> +
> +/* emit BMI2 shift instruction */
> +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> +{
> +       u8 *prog = *pprog;
> +       bool r = is_ereg(dst_reg);
> +       u8 m = 2; /* escape code 0f38 */
> +
> +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> +       *pprog = prog;
> +}
> +
>  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>
>  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                 case BPF_ALU64 | BPF_LSH | BPF_X:
>                 case BPF_ALU64 | BPF_RSH | BPF_X:
>                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> +                               u8 op;
> +
> +                               switch (BPF_OP(insn->code)) {
> +                               case BPF_LSH:
> +                                       op = 1; /* prefix 0x66 */
> +                                       break;
> +                               case BPF_RSH:
> +                                       op = 3; /* prefix 0xf2 */
> +                                       break;
> +                               case BPF_ARSH:
> +                                       op = 2; /* prefix 0xf3 */
> +                                       break;
> +                               }
> +
> +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> +
> +                               break;
> +                       }
>
>                         if (src_reg != BPF_REG_4) { /* common case */
>                                 /* Check for bad case when dst_reg == rcx */
> --
> 2.30.2
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-10-06  4:11                   ` KP Singh
@ 2022-10-07  3:14                     ` Jie Meng
  2022-10-07 18:11                       ` KP Singh
  0 siblings, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-10-07  3:14 UTC (permalink / raw)
  To: KP Singh; +Cc: bpf, ast, andrii, daniel

On Wed, Oct 05, 2022 at 09:11:01PM -0700, KP Singh wrote:
> On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
> >
> > Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> > alternatives provided in BMI2 when advantageous; keep using the non BMI2
> > instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> > instructions are shorter.
> 
> This is confusing, you mention %CL in the first sentence and then RCX in the
> second sentence. Can you clarify this more here?

%cl is the lowest 8 bit of %rcx. In assembly, non BMI2 shifts with shift
count in register is written as

 SHR eax, cl

Although the use of CL is mandatory and if the shift count is in another
register it has to be moved into RCX first, unless of course when the
shift count is already in BPF_REG_4, which is mapped to RCX in x86-64.

It is indeed awkward but exactly what one would see in assembly: a MOV
to RCX and a shift that uses CL as the source register.

> 
> Also, It would be good to have some explanations about the
> performance benefits here as well.
> 
> i.e. a load + store + non vector instruction v/s a single vector instruction
> omitting the load. How many cycles do we expect in each case, I do expect the
> latter to be lesser, but mentioning it in the commit removes any ambiguity.

Although it uses similar encoding as AVX instructions BMI2 actually
operates on general purpose registers and no vector register is ever
involved [1]. Inside a CPU all shifts instructions (both baseline and BMI2
flavors) are almost always handled by the same units and have the same
latency and throughput [2].

[1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
[2] https://www.agner.org/optimize/instruction_tables.pdf
> 
> >
> > To summarize, when BMI2 is available:
> > -------------------------------------------------
> >             |   arbitrary dst
> > =================================================
> > src == ecx  |   shl dst, cl
> > -------------------------------------------------
> > src != ecx  |   shlx dst, dst, src
> > -------------------------------------------------
> >
> > A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> > %rdi:
> >
> > Without BMI2:
> >
> >  ef3:   push   %rcx
> >         51
> >  ef4:   mov    %rdi,%rcx
> >         48 89 f9
> >  ef7:   shl    %cl,%rsi
> >         48 d3 e6
> >  efa:   pop    %rcx
> >         59
> >
> > With BMI2:
> >
> >  f0b:   shlx   %rdi,%rsi,%rsi
> >         c4 e2 c1 f7 f6
> >
> > Signed-off-by: Jie Meng <jmeng@fb.com>
> > ---
> >  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
> >  1 file changed, 64 insertions(+)
> >
> > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > index d9ba997c5891..d09c54f3d2e0 100644
> > --- a/arch/x86/net/bpf_jit_comp.c
> > +++ b/arch/x86/net/bpf_jit_comp.c
> > @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
> >         *pprog = prog;
> >  }
> >
> > +/* emit the 3-byte VEX prefix */
> > +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> > +                     bool w, u8 src_reg2, bool l, u8 p)
> 
> Can you please use somewhat more descriptive variable names here?
> 
> or add more information about what x, b, m, w, l and p mean?

Apart from src_reg2, the rest is the same as what Intel has chosen to
name the various fields in the VEX prefix. Would rather keep them 
consistent so that people won't get confused when comparing with other
documents across the Internet.
> 
> > +{
> > +       u8 *prog = *pprog;
> > +       u8 b0 = 0xc4, b1, b2;
> > +       u8 src2 = reg2hex[src_reg2];
> > +
> > +       if (is_ereg(src_reg2))
> > +               src2 |= 1 << 3;
> > +
> > +       /*
> > +        *    7                           0
> > +        *  +---+---+---+---+---+---+---+---+
> > +        *  |~R |~X |~B |         m         |
> > +        *  +---+---+---+---+---+---+---+---+
> > +        */
> > +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> 
> Some explanation here would help, not everyone is aware of x86 vex encoding :)

The comment is the exact rule how different pieces of information is
encoded into the 3-byte VEX prefix i.e. their position and length, and
whether a field needs to be bit inverted. Combined with code the comment
should give one clear idea what the intent is here.
> 
> > +       /*
> > +        *    7                           0
> > +        *  +---+---+---+---+---+---+---+---+
> > +        *  | W |     ~vvvv     | L |   pp  |
> > +        *  +---+---+---+---+---+---+---+---+
> > +        */
> > +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> > +
> > +       EMIT3(b0, b1, b2);
> > +       *pprog = prog;
> > +}
> > +
> > +/* emit BMI2 shift instruction */
> > +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> > +{
> > +       u8 *prog = *pprog;
> > +       bool r = is_ereg(dst_reg);
> > +       u8 m = 2; /* escape code 0f38 */
> > +
> > +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> > +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> > +       *pprog = prog;
> > +}
> > +
> >  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
> >
> >  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> > @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> >                 case BPF_ALU64 | BPF_LSH | BPF_X:
> >                 case BPF_ALU64 | BPF_RSH | BPF_X:
> >                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> > +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> > +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> > +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> > +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> > +                               u8 op;
> > +
> > +                               switch (BPF_OP(insn->code)) {
> > +                               case BPF_LSH:
> > +                                       op = 1; /* prefix 0x66 */
> > +                                       break;
> > +                               case BPF_RSH:
> > +                                       op = 3; /* prefix 0xf2 */
> > +                                       break;
> > +                               case BPF_ARSH:
> > +                                       op = 2; /* prefix 0xf3 */
> > +                                       break;
> > +                               }
> > +
> > +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> > +
> > +                               break;
> > +                       }
> >
> >                         if (src_reg != BPF_REG_4) { /* common case */
> >                                 /* Check for bad case when dst_reg == rcx */
> > --
> > 2.30.2
> >

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-10-07  3:14                     ` Jie Meng
@ 2022-10-07 18:11                       ` KP Singh
  2022-10-07 20:23                         ` [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
                                           ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: KP Singh @ 2022-10-07 18:11 UTC (permalink / raw)
  To: Jie Meng; +Cc: bpf, ast, andrii, daniel

On Thu, Oct 6, 2022 at 8:14 PM Jie Meng <jmeng@fb.com> wrote:
>
> On Wed, Oct 05, 2022 at 09:11:01PM -0700, KP Singh wrote:
> > On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@fb.com> wrote:
> > >
> > > Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> > > alternatives provided in BMI2 when advantageous; keep using the non BMI2
> > > instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> > > instructions are shorter.
> >
> > This is confusing, you mention %CL in the first sentence and then RCX in the
> > second sentence. Can you clarify this more here?
>
> %cl is the lowest 8 bit of %rcx. In assembly, non BMI2 shifts with shift
> count in register is written as
>
>  SHR eax, cl
>
> Although the use of CL is mandatory and if the shift count is in another
> register it has to be moved into RCX first, unless of course when the
> shift count is already in BPF_REG_4, which is mapped to RCX in x86-64.
>
> It is indeed awkward but exactly what one would see in assembly: a MOV
> to RCX and a shift that uses CL as the source register.
>
> >
> > Also, It would be good to have some explanations about the
> > performance benefits here as well.
> >
> > i.e. a load + store + non vector instruction v/s a single vector instruction
> > omitting the load. How many cycles do we expect in each case, I do expect the
> > latter to be lesser, but mentioning it in the commit removes any ambiguity.
>
> Although it uses similar encoding as AVX instructions BMI2 actually
> operates on general purpose registers and no vector register is ever
> involved [1]. Inside a CPU all shifts instructions (both baseline and BMI2
> flavors) are almost always handled by the same units and have the same
> latency and throughput [2].
>
> [1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> [2] https://www.agner.org/optimize/instruction_tables.pdf

Cool, please add this to the commit description.

> >
> > >
> > > To summarize, when BMI2 is available:
> > > -------------------------------------------------
> > >             |   arbitrary dst
> > > =================================================
> > > src == ecx  |   shl dst, cl
> > > -------------------------------------------------
> > > src != ecx  |   shlx dst, dst, src
> > > -------------------------------------------------
> > >
> > > A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> > > %rdi:
> > >
> > > Without BMI2:
> > >
> > >  ef3:   push   %rcx
> > >         51
> > >  ef4:   mov    %rdi,%rcx
> > >         48 89 f9
> > >  ef7:   shl    %cl,%rsi
> > >         48 d3 e6
> > >  efa:   pop    %rcx
> > >         59
> > >
> > > With BMI2:
> > >
> > >  f0b:   shlx   %rdi,%rsi,%rsi
> > >         c4 e2 c1 f7 f6
> > >
> > > Signed-off-by: Jie Meng <jmeng@fb.com>
> > > ---
> > >  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 64 insertions(+)
> > >
> > > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > > index d9ba997c5891..d09c54f3d2e0 100644
> > > --- a/arch/x86/net/bpf_jit_comp.c
> > > +++ b/arch/x86/net/bpf_jit_comp.c
> > > @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
> > >         *pprog = prog;
> > >  }
> > >
> > > +/* emit the 3-byte VEX prefix */
> > > +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> > > +                     bool w, u8 src_reg2, bool l, u8 p)
> >
> > Can you please use somewhat more descriptive variable names here?
> >
> > or add more information about what x, b, m, w, l and p mean?
>
> Apart from src_reg2, the rest is the same as what Intel has chosen to
> name the various fields in the VEX prefix. Would rather keep them
> consistent so that people won't get confused when comparing with other
> documents across the Internet.

Sure, but it would be nice to have a comment about what they mean.

These bits allow indexing various kinds of registers. e.g.

"VEX.~R allows is an extra bit for indexing the ModRM register" or something
similar, if it's preferred not to change the variable names.


> >
> > > +{
> > > +       u8 *prog = *pprog;
> > > +       u8 b0 = 0xc4, b1, b2;
> > > +       u8 src2 = reg2hex[src_reg2];
> > > +
> > > +       if (is_ereg(src_reg2))
> > > +               src2 |= 1 << 3;
> > > +
> > > +       /*
> > > +        *    7                           0
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        *  |~R |~X |~B |         m         |
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        */
> > > +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
> >
> > Some explanation here would help, not everyone is aware of x86 vex encoding :)
>
> The comment is the exact rule how different pieces of information is
> encoded into the 3-byte VEX prefix i.e. their position and length, and
> whether a field needs to be bit inverted. Combined with code the comment
> should give one clear idea what the intent is here.

I am not sure this gives a clear picture, It assumes that the reader knows
about VEX encoding, which not everyone does. Now they can pull up a
manual and start reading but that doesn't help so the comments need to
explain what's going on here.


> >
> > > +       /*
> > > +        *    7                           0
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        *  | W |     ~vvvv     | L |   pp  |
> > > +        *  +---+---+---+---+---+---+---+---+
> > > +        */
> > > +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);

By reading the code one should be able to understand what
b0, b1 and b2 are.


> > > +
> > > +       EMIT3(b0, b1, b2);
> > > +       *pprog = prog;
> > > +}
> > > +
> > > +/* emit BMI2 shift instruction */
> > > +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> > > +{
> > > +       u8 *prog = *pprog;
> > > +       bool r = is_ereg(dst_reg);
> > > +       u8 m = 2; /* escape code 0f38 */
> > > +
> > > +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> > > +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> > > +       *pprog = prog;
> > > +}
> > > +
> > >  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
> > >
> > >  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> > > @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
> > >                 case BPF_ALU64 | BPF_LSH | BPF_X:
> > >                 case BPF_ALU64 | BPF_RSH | BPF_X:
> > >                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> > > +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> > > +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> > > +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> > > +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> > > +                               u8 op;
> > > +
> > > +                               switch (BPF_OP(insn->code)) {
> > > +                               case BPF_LSH:
> > > +                                       op = 1; /* prefix 0x66 */
> > > +                                       break;
> > > +                               case BPF_RSH:
> > > +                                       op = 3; /* prefix 0xf2 */
> > > +                                       break;
> > > +                               case BPF_ARSH:
> > > +                                       op = 2; /* prefix 0xf3 */
> > > +                                       break;
> > > +                               }
> > > +
> > > +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> > > +
> > > +                               break;
> > > +                       }
> > >
> > >                         if (src_reg != BPF_REG_4) { /* common case */
> > >                                 /* Check for bad case when dst_reg == rcx */
> > > --
> > > 2.30.2
> > >

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts
  2022-10-07 18:11                       ` KP Singh
@ 2022-10-07 20:23                         ` Jie Meng
  2022-10-20  0:00                           ` patchwork-bot+netdevbpf
  2022-10-07 20:23                         ` [PATCH bpf-next v5 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
                                           ` (2 subsequent siblings)
  3 siblings, 1 reply; 24+ messages in thread
From: Jie Meng @ 2022-10-07 20:23 UTC (permalink / raw)
  To: kpsingh, bpf, ast, andrii, daniel; +Cc: Jie Meng

With baseline x64 instruction set, shift count can only be an immediate
or in %cl. The implicit dependency on %cl makes it necessary to shuffle
registers around and/or add push/pop operations.

BMI2 provides shift instructions that can use any general register as
the shift count, saving us instructions and a few bytes in most cases.

Suboptimal codegen when %ecx is source and/or destination is also
addressed and unnecessary instructions are removed.

test_progs: Summary: 267/1340 PASSED, 25 SKIPPED, 0 FAILED
test_progs-no_alu32: Summary: 267/1333 PASSED, 26 SKIPPED, 0 FAILED
test_verifier: Summary: 1367 PASSED, 636 SKIPPED, 0 FAILED (same result
 with or without BMI2)
test_maps: OK, 0 SKIPPED
lib/test_bpf:
  test_bpf: Summary: 1026 PASSED, 0 FAILED, [1014/1014 JIT'ed]
  test_bpf: test_tail_calls: Summary: 10 PASSED, 0 FAILED, [10/10 JIT'ed]
  test_bpf: test_skb_segment: Summary: 2 PASSED, 0 FAILED

---
v4 -> v5:
- More comments regarding instruction encoding
v3 -> v4:
- Fixed a regression when BMI2 isn't available

Jie Meng (3):
  bpf,x64: avoid unnecessary instructions when shift dest is ecx
  bpf,x64: use shrx/sarx/shlx when available
  bpf: add selftests for lsh, rsh, arsh with reg operand

 arch/x86/net/bpf_jit_comp.c                | 106 ++++++++++++++++++---
 tools/testing/selftests/bpf/verifier/jit.c |  24 +++++
 2 files changed, 118 insertions(+), 12 deletions(-)

-- 
2.30.2


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v5 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx
  2022-10-07 18:11                       ` KP Singh
  2022-10-07 20:23                         ` [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
@ 2022-10-07 20:23                         ` Jie Meng
  2022-10-07 20:23                         ` [PATCH bpf-next v5 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
  2022-10-07 20:23                         ` [PATCH bpf-next v5 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-07 20:23 UTC (permalink / raw)
  To: kpsingh, bpf, ast, andrii, daniel; +Cc: Jie Meng

x64 JIT produces redundant instructions when a shift operation's
destination register is BPF_REG_4/ecx and this patch removes them.

Specifically, when dest reg is BPF_REG_4 but the src isn't, we
needn't push and pop ecx around shift only to get it overwritten
by r11 immediately afterwards.

In the rare case when both dest and src registers are BPF_REG_4,
a single shift instruction is sufficient and we don't need the
two MOV instructions around the shift.

To summarize using shift left as an example, without patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   mov r11, ecx   |    shl dst, cl
            |   shl r11, ecx   |
            |   mov ecx, r11   |
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   push ecx       |    mov ecx, src
            |   mov ecx, src   |    shl dst, cl
            |   shl r11, cl    |    pop ecx
            |   pop ecx        |
            |   mov ecx, r11   |
-------------------------------------------------

With patch:
-------------------------------------------------
            |   dst == ecx     |    dst != ecx
=================================================
src == ecx  |   shl ecx, cl    |    shl dst, cl
-------------------------------------------------
src != ecx  |   mov r11, ecx   |    push ecx
            |   mov ecx, src   |    mov ecx, src
            |   shl r11, cl    |    shl dst, cl
            |   mov ecx, r11   |    pop ecx
-------------------------------------------------

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 0abd082786e7..d926ca637d8d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1138,16 +1138,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
 
-			/* Check for bad case when dst_reg == rcx */
-			if (dst_reg == BPF_REG_4) {
-				/* mov r11, dst_reg */
-				EMIT_mov(AUX_REG, dst_reg);
-				dst_reg = AUX_REG;
-			}
-
 			if (src_reg != BPF_REG_4) { /* common case */
-				EMIT1(0x51); /* push rcx */
-
+				/* Check for bad case when dst_reg == rcx */
+				if (dst_reg == BPF_REG_4) {
+					/* mov r11, dst_reg */
+					EMIT_mov(AUX_REG, dst_reg);
+					dst_reg = AUX_REG;
+				} else {
+					EMIT1(0x51); /* push rcx */
+				}
 				/* mov rcx, src_reg */
 				EMIT_mov(BPF_REG_4, src_reg);
 			}
@@ -1159,12 +1158,14 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			b3 = simple_alu_opcodes[BPF_OP(insn->code)];
 			EMIT2(0xD3, add_1reg(b3, dst_reg));
 
-			if (src_reg != BPF_REG_4)
-				EMIT1(0x59); /* pop rcx */
+			if (src_reg != BPF_REG_4) {
+				if (insn->dst_reg == BPF_REG_4)
+					/* mov dst_reg, r11 */
+					EMIT_mov(insn->dst_reg, AUX_REG);
+				else
+					EMIT1(0x59); /* pop rcx */
+			}
 
-			if (insn->dst_reg == BPF_REG_4)
-				/* mov dst_reg, r11 */
-				EMIT_mov(insn->dst_reg, AUX_REG);
 			break;
 
 		case BPF_ALU | BPF_END | BPF_FROM_BE:
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v5 2/3] bpf,x64: use shrx/sarx/shlx when available
  2022-10-07 18:11                       ` KP Singh
  2022-10-07 20:23                         ` [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
  2022-10-07 20:23                         ` [PATCH bpf-next v5 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
@ 2022-10-07 20:23                         ` Jie Meng
  2022-10-07 20:23                         ` [PATCH bpf-next v5 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-07 20:23 UTC (permalink / raw)
  To: kpsingh, bpf, ast, andrii, daniel; +Cc: Jie Meng

BMI2 provides 3 shift instructions (shrx, sarx and shlx) that use VEX
encoding but target general purpose registers [1]. They allow the shift
count in any general purpose register and have the same performance as
non BMI2 shift instructions [2].

Instead of shr/sar/shl that implicitly use %cl (lowest 8 bit of %rcx),
emit their more flexible alternatives provided in BMI2 when advantageous;
keep using the non BMI2 instructions when shift count is already in
BPF_REG_4/%rcx as non BMI2 instructions are shorter.

To summarize, when BMI2 is available:
-------------------------------------------------
            |   arbitrary dst
=================================================
src == ecx  |   shl dst, cl
-------------------------------------------------
src != ecx  |   shlx dst, dst, src
-------------------------------------------------

And no additional register shuffling is needed.

A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
%rdi:

Without BMI2:

 ef3:   push   %rcx
        51
 ef4:   mov    %rdi,%rcx
        48 89 f9
 ef7:   shl    %cl,%rsi
        48 d3 e6
 efa:   pop    %rcx
        59

With BMI2:

 f0b:   shlx   %rdi,%rsi,%rsi
        c4 e2 c1 f7 f6

[1] https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
[2] https://www.agner.org/optimize/instruction_tables.pdf

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 arch/x86/net/bpf_jit_comp.c | 81 +++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d926ca637d8d..d7dd8e0db8da 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -891,6 +891,65 @@ static void emit_nops(u8 **pprog, int len)
 	*pprog = prog;
 }
 
+/* emit the 3-byte VEX prefix
+ *
+ * r: same as rex.r, extra bit for ModRM reg field
+ * x: same as rex.x, extra bit for SIB index field
+ * b: same as rex.b, extra bit for ModRM r/m, or SIB base
+ * m: opcode map select, encoding escape bytes e.g. 0x0f38
+ * w: same as rex.w (32 bit or 64 bit) or opcode specific
+ * src_reg2: additional source reg (encoded as BPF reg)
+ * l: vector length (128 bit or 256 bit) or reserved
+ * pp: opcode prefix (none, 0x66, 0xf2 or 0xf3)
+ */
+static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
+		      bool w, u8 src_reg2, bool l, u8 pp)
+{
+	u8 *prog = *pprog;
+	const u8 b0 = 0xc4; /* first byte of 3-byte VEX prefix */
+	u8 b1, b2;
+	u8 vvvv = reg2hex[src_reg2];
+
+	/* reg2hex gives only the lower 3 bit of vvvv */
+	if (is_ereg(src_reg2))
+		vvvv |= 1 << 3;
+
+	/*
+	 * 2nd byte of 3-byte VEX prefix
+	 * ~ means bit inverted encoding
+	 *
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  |~R |~X |~B |         m         |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);
+	/*
+	 * 3rd byte of 3-byte VEX prefix
+	 *
+	 *    7                           0
+	 *  +---+---+---+---+---+---+---+---+
+	 *  | W |     ~vvvv     | L |   pp  |
+	 *  +---+---+---+---+---+---+---+---+
+	 */
+	b2 = (w << 7) | ((~vvvv & 0xf) << 3) | (l << 2) | (pp & 3);
+
+	EMIT3(b0, b1, b2);
+	*pprog = prog;
+}
+
+/* emit BMI2 shift instruction */
+static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
+{
+	u8 *prog = *pprog;
+	bool r = is_ereg(dst_reg);
+	u8 m = 2; /* escape code 0f38 */
+
+	emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
+	EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
@@ -1137,6 +1196,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		case BPF_ALU64 | BPF_LSH | BPF_X:
 		case BPF_ALU64 | BPF_RSH | BPF_X:
 		case BPF_ALU64 | BPF_ARSH | BPF_X:
+			/* BMI2 shifts aren't better when shift count is already in rcx */
+			if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
+				/* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
+				bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
+				u8 op;
+
+				switch (BPF_OP(insn->code)) {
+				case BPF_LSH:
+					op = 1; /* prefix 0x66 */
+					break;
+				case BPF_RSH:
+					op = 3; /* prefix 0xf2 */
+					break;
+				case BPF_ARSH:
+					op = 2; /* prefix 0xf3 */
+					break;
+				}
+
+				emit_shiftx(&prog, dst_reg, src_reg, w, op);
+
+				break;
+			}
 
 			if (src_reg != BPF_REG_4) { /* common case */
 				/* Check for bad case when dst_reg == rcx */
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH bpf-next v5 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand
  2022-10-07 18:11                       ` KP Singh
                                           ` (2 preceding siblings ...)
  2022-10-07 20:23                         ` [PATCH bpf-next v5 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
@ 2022-10-07 20:23                         ` Jie Meng
  3 siblings, 0 replies; 24+ messages in thread
From: Jie Meng @ 2022-10-07 20:23 UTC (permalink / raw)
  To: kpsingh, bpf, ast, andrii, daniel; +Cc: Jie Meng

Current tests cover only shifts with an immediate as the source
operand/shift counts; add a new test case to cover register operand.

Signed-off-by: Jie Meng <jmeng@fb.com>
---
 tools/testing/selftests/bpf/verifier/jit.c | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
index 79021c30e51e..8bf37e5207f1 100644
--- a/tools/testing/selftests/bpf/verifier/jit.c
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -20,6 +20,30 @@
 	.result = ACCEPT,
 	.retval = 2,
 },
+{
+	"jit: lsh, rsh, arsh by reg",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_0, 1),
+	BPF_MOV64_IMM(BPF_REG_4, 1),
+	BPF_MOV64_IMM(BPF_REG_1, 0xff),
+	BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_0),
+	BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_4),
+	BPF_MOV64_REG(BPF_REG_4, BPF_REG_1),
+	BPF_ALU32_REG(BPF_RSH, BPF_REG_4, BPF_REG_0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0xff, 1),
+	BPF_EXIT_INSN(),
+	BPF_ALU64_REG(BPF_ARSH, BPF_REG_4, BPF_REG_4),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_4, 0, 1),
+	BPF_EXIT_INSN(),
+	BPF_MOV64_IMM(BPF_REG_0, 2),
+	BPF_EXIT_INSN(),
+	},
+	.result = ACCEPT,
+	.retval = 2,
+},
 {
 	"jit: mov32 for ldimm64, 1",
 	.insns = {
-- 
2.30.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts
  2022-10-07 20:23                         ` [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
@ 2022-10-20  0:00                           ` patchwork-bot+netdevbpf
  0 siblings, 0 replies; 24+ messages in thread
From: patchwork-bot+netdevbpf @ 2022-10-20  0:00 UTC (permalink / raw)
  To: Jie Meng; +Cc: kpsingh, bpf, ast, andrii, daniel

Hello:

This series was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Fri, 7 Oct 2022 13:23:46 -0700 you wrote:
> With baseline x64 instruction set, shift count can only be an immediate
> or in %cl. The implicit dependency on %cl makes it necessary to shuffle
> registers around and/or add push/pop operations.
> 
> BMI2 provides shift instructions that can use any general register as
> the shift count, saving us instructions and a few bytes in most cases.
> 
> [...]

Here is the summary with links:
  - [bpf-next,v5,1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx
    https://git.kernel.org/bpf/bpf-next/c/81b35e7cad79
  - [bpf-next,v5,2/3] bpf,x64: use shrx/sarx/shlx when available
    https://git.kernel.org/bpf/bpf-next/c/77d8f5d47bfb
  - [bpf-next,v5,3/3] bpf: add selftests for lsh, rsh, arsh with reg operand
    https://git.kernel.org/bpf/bpf-next/c/8662de232149

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2022-10-20  0:00 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-09-21  2:21 [PATCH bpf-next] bpf,x64: use shrx/sarx/shlx when available Jie Meng
2022-09-22 15:07 ` Daniel Borkmann
2022-09-24  0:32   ` [PATCH bpf-next v2 0/2] bpf,x64: Use BMI2 for shifts Jie Meng
2022-09-24  0:32     ` [PATCH bpf-next v2 1/2] bpf,x64: use shrx/sarx/shlx when available Jie Meng
2022-09-26 19:16       ` Daniel Borkmann
2022-09-27  0:38         ` Jie Meng
2022-09-27  9:45           ` Daniel Borkmann
2022-09-27 18:57             ` [PATCH bpf-next v3 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
2022-09-27 18:57               ` [PATCH bpf-next v3 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
2022-09-27 18:58               ` [PATCH bpf-next v3 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
2022-09-27 18:58               ` [PATCH bpf-next v3 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
2022-10-02  5:11               ` [PATCH bpf-next v4 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
2022-10-02  5:11                 ` [PATCH bpf-next v4 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
2022-10-02  5:11                 ` [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
2022-10-06  4:11                   ` KP Singh
2022-10-07  3:14                     ` Jie Meng
2022-10-07 18:11                       ` KP Singh
2022-10-07 20:23                         ` [PATCH bpf-next v5 0/3] bpf,x64: Use BMI2 for shifts Jie Meng
2022-10-20  0:00                           ` patchwork-bot+netdevbpf
2022-10-07 20:23                         ` [PATCH bpf-next v5 1/3] bpf,x64: avoid unnecessary instructions when shift dest is ecx Jie Meng
2022-10-07 20:23                         ` [PATCH bpf-next v5 2/3] bpf,x64: use shrx/sarx/shlx when available Jie Meng
2022-10-07 20:23                         ` [PATCH bpf-next v5 3/3] bpf: add selftests for lsh, rsh, arsh with reg operand Jie Meng
2022-10-02  5:11                 ` [PATCH bpf-next v4 " Jie Meng
2022-09-24  0:32     ` [PATCH bpf-next v2 2/2] bpf: Add " Jie Meng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox