Re: [PATCH bpf-next v2 1/2] bpf,riscv: Implement PROBE_MEM32 pseudo instructions

From: Puranjay Mohan <puranjay12@gmail.com>
To: "Björn Töpel" <bjorn@kernel.org>,
	"Alexei Starovoitov" <ast@kernel.org>,
	"Daniel Borkmann" <daniel@iogearbox.net>,
	"Andrii Nakryiko" <andrii@kernel.org>,
	"Martin KaFai Lau" <martin.lau@linux.dev>,
	"Eduard Zingerman" <eddyz87@gmail.com>,
	"Song Liu" <song@kernel.org>,
	"Yonghong Song" <yonghong.song@linux.dev>,
	"John Fastabend" <john.fastabend@gmail.com>,
	"KP Singh" <kpsingh@kernel.org>,
	"Stanislav Fomichev" <sdf@google.com>,
	"Hao Luo" <haoluo@google.com>, "Jiri Olsa" <jolsa@kernel.org>,
	"Luke Nelson" <luke.r.nels@gmail.com>,
	"Xi Wang" <xi.wang@gmail.com>,
	"Paul Walmsley" <paul.walmsley@sifive.com>,
	"Palmer Dabbelt" <palmer@dabbelt.com>,
	"Albert Ou" <aou@eecs.berkeley.edu>,
	bpf@vger.kernel.org, linux-riscv@lists.infradead.org,
	linux-kernel@vger.kernel.org,
	"Pu Lehui" <pulehui@huaweicloud.com>
Subject: Re: [PATCH bpf-next v2 1/2] bpf,riscv: Implement PROBE_MEM32 pseudo instructions
Date: Mon, 25 Mar 2024 17:15:51 +0000	[thread overview]
Message-ID: <mb61ple66mdvc.fsf@gmail.com> (raw)
In-Reply-To: <875xxafe33.fsf@all.your.base.are.belong.to.us>

Björn Töpel <bjorn@kernel.org> writes:

> Puranjay Mohan <puranjay12@gmail.com> writes:
>
>> Add support for [LDX | STX | ST], PROBE_MEM32, [B | H | W | DW]
>> instructions.  They are similar to PROBE_MEM instructions with the
>> following differences:
>> - PROBE_MEM32 supports store.
>> - PROBE_MEM32 relies on the verifier to clear upper 32-bit of the
>>   src/dst register
>> - PROBE_MEM32 adds 64-bit kern_vm_start address (which is stored in S7
>>   in the prologue). Due to bpf_arena constructions such S7 + reg +
>>   off16 access is guaranteed to be within arena virtual range, so no
>>   address check at run-time.
>> - S7 is a free callee-saved register, so it is used to store kern_vm_start
>> - PROBE_MEM32 allows STX and ST. If they fault the store is a nop. When
>>   LDX faults the destination register is zeroed.
>>
>> To support these on riscv, we do tmp = S7 + src/dst reg and then use
>> tmp as the new src/dst register. This allows us to reuse most of the
>> code for normal [LDX | STX | ST].
>
> Cool to see the RV BPF JIT keeping up with x86 features! ;-) Nice work!

It is my self-proclaimed duty to make sure that all 64-bit JITs have
feature parity. :D

>
> A couple of minor comments below.
>
>> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
>> ---
>>  arch/riscv/net/bpf_jit.h        |   1 +
>>  arch/riscv/net/bpf_jit_comp64.c | 193 +++++++++++++++++++++++++++++++-
>>  arch/riscv/net/bpf_jit_core.c   |   1 +
>>  3 files changed, 192 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
>> index f4b6b3b9edda..8a47da08dd9c 100644
>> --- a/arch/riscv/net/bpf_jit.h
>> +++ b/arch/riscv/net/bpf_jit.h
>> @@ -81,6 +81,7 @@ struct rv_jit_context {
>>  	int nexentries;
>>  	unsigned long flags;
>>  	int stack_size;
>> +	u64 arena_vm_start;
>>  };
>>  
>>  /* Convert from ninsns to bytes. */
>> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
>> index 1adf2f39ce59..0c0588e327af 100644
>> --- a/arch/riscv/net/bpf_jit_comp64.c
>> +++ b/arch/riscv/net/bpf_jit_comp64.c
>> @@ -255,6 +255,10 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
>>  		emit_ld(RV_REG_S6, store_offset, RV_REG_SP, ctx);
>>  		store_offset -= 8;
>>  	}
>> +	if (ctx->arena_vm_start) {
>> +		emit_ld(RV_REG_S7, store_offset, RV_REG_SP, ctx);
>> +		store_offset -= 8;
>> +	}
>>  
>>  	emit_addi(RV_REG_SP, RV_REG_SP, stack_adjust, ctx);
>>  	/* Set return value. */
>> @@ -548,6 +552,7 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64,
>>  
>>  #define BPF_FIXUP_OFFSET_MASK   GENMASK(26, 0)
>>  #define BPF_FIXUP_REG_MASK      GENMASK(31, 27)
>> +#define DONT_CLEAR		17	/* RV_REG_A7 unused in pt_regmap */
>
> Hmm, so this is just a sentinel node, right? Isn't it more robust to
> use, say REG_ZERO which will never be used? Maybe REG_DONT_CLEAR_MARKER
> or something, so it's obvious how it's used?

Yes, I agree, RV_REG_ZERO would be the best thing to use here.

>
>
>>  bool ex_handler_bpf(const struct exception_table_entry *ex,
>>  		    struct pt_regs *regs)
>> @@ -555,7 +560,8 @@ bool ex_handler_bpf(const struct exception_table_entry *ex,
>>  	off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup);
>>  	int regs_offset = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup);
>>  
>> -	*(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>> +	if (regs_offset != DONT_CLEAR)
>> +		*(unsigned long *)((void *)regs + pt_regmap[regs_offset]) = 0;
>>  	regs->epc = (unsigned long)&ex->fixup - offset;
>>  
>>  	return true;
>> @@ -572,7 +578,8 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>  	off_t fixup_offset;
>>  
>>  	if (!ctx->insns || !ctx->ro_insns || !ctx->prog->aux->extable ||
>> -	    (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX))
>> +	    (BPF_MODE(insn->code) != BPF_PROBE_MEM && BPF_MODE(insn->code) != BPF_PROBE_MEMSX &&
>> +	     BPF_MODE(insn->code) != BPF_PROBE_MEM32))
>>  		return 0;
>>  
>>  	if (WARN_ON_ONCE(ctx->nexentries >= ctx->prog->aux->num_exentries))
>> @@ -622,6 +629,9 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>  
>>  	ex->insn = ins_offset;
>>  
>> +	if (BPF_CLASS(insn->code) != BPF_LDX)
>> +		dst_reg = DONT_CLEAR;
>> +
>
> Instead of having a side-effect, and passing a dummy dst_reg for the
> probe_mem32, just explicitly add DONT_CLEAR when calling
> add_exception_handler(). It's more obvious to me at least.

Sure, will do that in the next version.

>
>>  	ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, fixup_offset) |
>>  		FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg);
>>  	ex->type = EX_TYPE_BPF;
>> @@ -1063,7 +1073,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>  		    BPF_CLASS(insn->code) == BPF_JMP;
>>  	int s, e, rvoff, ret, i = insn - ctx->prog->insnsi;
>>  	struct bpf_prog_aux *aux = ctx->prog->aux;
>> -	u8 rd = -1, rs = -1, code = insn->code;
>> +	u8 rd = -1, rs = -1, code = insn->code, reg_arena_vm_start = RV_REG_S7;
>>  	s16 off = insn->off;
>>  	s32 imm = insn->imm;
>>  
>> @@ -1539,6 +1549,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>  	case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
>>  	case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
>>  	case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
>> +	/* LDX | PROBE_MEM32: dst = *(unsigned size *)(src + S7 + off)*/
>> +	case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
>> +	case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
>> +	case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
>> +	case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
>>  	{
>>  		int insn_len, insns_start;
>>  		bool sign_ext;
>> @@ -1546,6 +1561,11 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>  		sign_ext = BPF_MODE(insn->code) == BPF_MEMSX ||
>>  			   BPF_MODE(insn->code) == BPF_PROBE_MEMSX;
>>  
>> +		if (BPF_MODE(insn->code) == BPF_PROBE_MEM32) {
>> +			emit_add(RV_REG_T2, rs, reg_arena_vm_start, ctx);
>> +			rs = RV_REG_T2;
>> +		}
>> +
>>  		switch (BPF_SIZE(code)) {
>>  		case BPF_B:
>>  			if (is_12b_int(off)) {
>> @@ -1682,6 +1702,87 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
>>  		emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>>  		break;
>>  
>> +	case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
>> +	case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
>> +	case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
>> +	case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
>> +	{
>> +		int insn_len, insns_start;
>> +
>> +		emit_add(RV_REG_T3, rd, reg_arena_vm_start, ctx);
>> +		rd = RV_REG_T3;
>> +
>> +		/* Load imm to a register then store it */
>> +		emit_imm(RV_REG_T1, imm, ctx);
>> +
>> +		switch (BPF_SIZE(code)) {
>> +		case BPF_B:
>> +			if (is_12b_int(off)) {
>> +				insns_start = ctx->ninsns;
>> +				emit(rv_sb(rd, off, RV_REG_T1), ctx);
>> +				insn_len = ctx->ninsns - insns_start;
>> +				break;
>> +			}
>> +
>> +			emit_imm(RV_REG_T2, off, ctx);
>> +			emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> +			insns_start = ctx->ninsns;
>> +			emit(rv_sb(RV_REG_T2, 0, RV_REG_T1), ctx);
>> +			insn_len = ctx->ninsns - insns_start;
>> +
>> +			break;
>> +
>> +		case BPF_H:
>> +			if (is_12b_int(off)) {
>> +				insns_start = ctx->ninsns;
>> +				emit(rv_sh(rd, off, RV_REG_T1), ctx);
>> +				insn_len = ctx->ninsns - insns_start;
>> +				break;
>> +			}
>> +
>> +			emit_imm(RV_REG_T2, off, ctx);
>> +			emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> +			insns_start = ctx->ninsns;
>> +			emit(rv_sh(RV_REG_T2, 0, RV_REG_T1), ctx);
>> +			insn_len = ctx->ninsns - insns_start;
>> +			break;
>> +		case BPF_W:
>> +			if (is_12b_int(off)) {
>> +				insns_start = ctx->ninsns;
>> +				emit_sw(rd, off, RV_REG_T1, ctx);
>> +				insn_len = ctx->ninsns - insns_start;
>> +				break;
>> +			}
>> +
>> +			emit_imm(RV_REG_T2, off, ctx);
>> +			emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> +			insns_start = ctx->ninsns;
>> +			emit_sw(RV_REG_T2, 0, RV_REG_T1, ctx);
>> +			insn_len = ctx->ninsns - insns_start;
>> +			break;
>> +		case BPF_DW:
>> +			if (is_12b_int(off)) {
>> +				insns_start = ctx->ninsns;
>> +				emit_sd(rd, off, RV_REG_T1, ctx);
>> +				insn_len = ctx->ninsns - insns_start;
>> +				break;
>> +			}
>> +
>> +			emit_imm(RV_REG_T2, off, ctx);
>> +			emit_add(RV_REG_T2, RV_REG_T2, rd, ctx);
>> +			insns_start = ctx->ninsns;
>> +			emit_sd(RV_REG_T2, 0, RV_REG_T1, ctx);
>> +			insn_len = ctx->ninsns - insns_start;
>> +			break;
>> +		}
>
> A lot of similar code, with emit of different sizes. Is it possible to
> move it out to a function, and wrap the emits? The main loop is hard to
> read already!

I thought about this as well. My plan is to refactor the whole thing in a
separate patch. I did not do it with this feature as it will cause a lot
of unrelated code churn.

Thanks,
Puranjay