From: Stephen Hemminger <stephen@networkplumber.org>
To: dev@dpdk.org
Cc: Stephen Hemminger <stephen@networkplumber.org>,
Wathsala Vithanage <wathsala.vithanage@arm.com>,
Konstantin Ananyev <konstantin.ananyev@huawei.com>,
Marat Khalili <marat.khalili@huawei.com>
Subject: [PATCH v6 7/9] bpf/arm64: add BPF_ABS/BPF_IND packet load support
Date: Thu, 25 Jun 2026 10:30:17 -0700 [thread overview]
Message-ID: <20260625173231.216074-8-stephen@networkplumber.org> (raw)
In-Reply-To: <20260625173231.216074-1-stephen@networkplumber.org>
The arm64 JIT rejected BPF_LD | BPF_ABS and BPF_LD | BPF_IND with
"invalid opcode", so cBPF programs converted by rte_bpf_convert() could
not be JITed. Add these opcodes, mirroring the x86 JIT: a fast path for
data held in the first mbuf segment, and a __rte_pktmbuf_read() slow
path for everything else.
The forward branches over the call cannot use fixed distances:
emit_call() materializes the helper address with a variable number of
mov/movk instructions, so the block sizes are not known up front. Size
the three blocks (fast path, slow path, common tail) in a dry run, then
emit for real with the branches resolved from the measured offsets.
The effective offset is validated before use: src is a runtime value for
BPF_IND, so a negative offset is routed to the slow path rather than
read from the first segment, and the offset is bounded to UINT32_MAX
before __rte_pktmbuf_read(), whose off argument is uint32_t.
Programs using these opcodes use the call register layout, since the
slow path makes a function call.
For example, BPF_LD | BPF_IND | BPF_W (4-byte indirect load, mbuf in
R6/x19, effective offset kept in x9) emits:
mov x9, #imm // off = imm
add x9, x9, src // off += src (BPF_IND)
cmp x9, xzr // reject negative
b.mi slow // effective offset
mov x10, #data_len_ofs
ldrh w10, [x19, x10] // mbuf->data_len
sub x10, x10, x9 // data_len - off
mov x11, #sz
cmp x10, x11
b.lt slow // not in first segment
mov x10, #data_off_ofs
ldrh w10, [x19, x10] // mbuf->data_off
mov x7, #buf_addr_ofs
ldr x7, [x19, x7] // mbuf->buf_addr
add x7, x7, x10
add x7, x7, x9 // ptr = buf_addr + data_off + off
b load
slow:
mov x10, #UINT32_MAX
cmp x9, x10
b.ls 1f // off fits uint32_t ...
mov x7, #0 // else return 0
b epilogue
1: mov x1, x9 // __rte_pktmbuf_read(mbuf, off, sz, buf)
mov x0, x19
mov w2, #sz
sub x3, x25, #stack_ofs
mov x9, #<helper lo>
movk x9, #<helper hi>
blr x9
mov x7, x0 // ptr = return value
cbnz x7, load // non-NULL -> common tail
mov x7, #0 // else return 0
b epilogue
load:
ldr w7, [x7, xzr] // *(uint32_t *)ptr (size varies)
rev32 x7, x7 // ntoh (size varies; omitted for BPF_B)
For BPF_ABS the "add x9, x9, src" is omitted; the final load/byte-swap
vary with the access size.
Bugzilla ID: 1427
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
lib/bpf/bpf_jit_arm64.c | 169 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 168 insertions(+), 1 deletion(-)
diff --git a/lib/bpf/bpf_jit_arm64.c b/lib/bpf/bpf_jit_arm64.c
index 51906c7f0d..6d531dc83d 100644
--- a/lib/bpf/bpf_jit_arm64.c
+++ b/lib/bpf/bpf_jit_arm64.c
@@ -1133,6 +1133,155 @@ emit_branch(struct a64_jit_ctx *ctx, uint8_t op, uint32_t i, int16_t off)
emit_b_cond(ctx, ebpf_to_a64_cond(op), jump_offset_get(ctx, i, off));
}
+/* Indices into ofs[]: start offset of each LD_ABS/LD_IND code block (in arm64 instructions) */
+enum {
+ LDMB_FAST_OFS, /* fast path: data held in the first segment */
+ LDMB_SLOW_OFS, /* slow path: __rte_pktmbuf_read() call */
+ LDMB_FIN_OFS, /* common tail: load and byte swap */
+ LDMB_OFS_NUM /* number of entries in ofs[] */
+};
+
+/*
+ * Helper for emit_ld_mbuf(): fast path. Leaves off = imm (+ src for BPF_IND)
+ * in TMP_REG_1; if off..off+sz-1 lies inside the first segment, points R0 at
+ * the data and branches to the common tail, otherwise branches to the slow path.
+ */
+static void
+emit_ldmb_fast_path(struct a64_jit_ctx *ctx, uint8_t src, uint8_t mode,
+ uint32_t sz, int32_t imm, const uint32_t ofs[LDMB_OFS_NUM])
+{
+ uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0); /* receives the data pointer */
+ uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6); /* base register for the rte_mbuf field loads */
+ uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1); /* effective offset; read again by the slow path */
+ uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2); /* scratch: data_len, then data_off */
+ uint8_t tmp3 = ebpf_to_a64_reg(ctx, TMP_REG_3); /* holds sz for the compare */
+
+ /* off = imm (+ src for BPF_IND) */
+ emit_mov_imm(ctx, 1, tmp1, imm);
+ if (mode == BPF_IND)
+ emit_add(ctx, 1, tmp1, src);
+
+ /*
+ * A negative effective offset (src can be < 0 for BPF_IND) would pass
+ * the signed check below and read before the segment, so route it to
+ * the slow path, which rejects it via the uint32_t bound on off.
+ */
+ emit_cmp(ctx, 1, tmp1, A64_ZR);
+ emit_b_cond(ctx, A64_MI, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx)); /* taken when off < 0 */
+
+ /* if ((int64_t)(mbuf->data_len - off) < sz) goto slow_path */
+ emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_len));
+ emit_ldr(ctx, BPF_H, tmp2, r6, tmp2); /* tmp2 = mbuf->data_len (16-bit load) */
+ emit_sub(ctx, 1, tmp2, tmp1); /* tmp2 = data_len - off */
+ emit_mov_imm(ctx, 1, tmp3, sz);
+ emit_cmp(ctx, 1, tmp2, tmp3);
+ emit_b_cond(ctx, A64_LT, (int32_t)(ofs[LDMB_SLOW_OFS] - ctx->idx)); /* signed less-than */
+
+ /* R0 = mbuf->buf_addr + mbuf->data_off + off */
+ emit_mov_imm(ctx, 1, tmp2, offsetof(struct rte_mbuf, data_off));
+ emit_ldr(ctx, BPF_H, tmp2, r6, tmp2); /* tmp2 = mbuf->data_off (16-bit load) */
+ emit_mov_imm(ctx, 1, r0, offsetof(struct rte_mbuf, buf_addr));
+ emit_ldr(ctx, EBPF_DW, r0, r6, r0); /* r0 = mbuf->buf_addr (64-bit load) */
+ emit_add(ctx, 1, r0, tmp2);
+ emit_add(ctx, 1, r0, tmp1);
+
+ emit_b(ctx, (int32_t)(ofs[LDMB_FIN_OFS] - ctx->idx)); /* jump over the slow path */
+}
+
+/*
+ * Helper for emit_ld_mbuf(): slow path.
+ * R0 = __rte_pktmbuf_read(mbuf, off, sz, buf); return 0 if NULL.
+ * The scratch buffer is the space reserved by __rte_bpf_validate() at the
+ * bottom of the eBPF stack frame, i.e. (frame_pointer - stack_ofs).
+ */
+static void
+emit_ldmb_slow_path(struct a64_jit_ctx *ctx, uint32_t sz, uint32_t stack_ofs)
+{
+ uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0); /* result pointer, or 0 on early return */
+ uint8_t r6 = ebpf_to_a64_reg(ctx, EBPF_REG_6); /* passed as the mbuf argument */
+ uint8_t fp = ebpf_to_a64_reg(ctx, EBPF_FP);
+ uint8_t tmp1 = ebpf_to_a64_reg(ctx, TMP_REG_1); /* effective offset left by the fast path */
+ uint8_t tmp2 = ebpf_to_a64_reg(ctx, TMP_REG_2); /* holds the UINT32_MAX bound */
+
+ /*
+ * __rte_pktmbuf_read() takes a uint32_t off, so a 64-bit off that does
+ * not fit would be silently truncated. Return 0 if it is out of range;
+ * this also catches the negative off routed here by the fast path.
+ */
+ emit_mov_imm(ctx, 1, tmp2, UINT32_MAX); /* NOTE(review): confirm this yields 0x00000000ffffffff, not a sign-extended value */
+ emit_cmp(ctx, 1, tmp1, tmp2);
+ emit_b_cond(ctx, A64_LS, 3); /* off <= UINT32_MAX: do the call; 3 assumes the mov and b below are one instruction each */
+ emit_mov_imm(ctx, 1, r0, 0); /* return value 0 */
+ emit_b(ctx, (ctx->program_start + ctx->program_sz) - ctx->idx); /* branch to the epilogue */
+
+ /* arguments of __rte_pktmbuf_read(mbuf, off, len, buf) */
+ emit_mov_64(ctx, A64_R(1), tmp1); /* off (held in tmp1) */
+ emit_mov_64(ctx, A64_R(0), r6); /* mbuf */
+ emit_mov_imm(ctx, 0, A64_R(2), sz); /* len */
+ emit_sub_imm_64(ctx, A64_R(3), fp, stack_ofs); /* buf */
+
+ emit_call(ctx, tmp1, (void *)(uintptr_t)__rte_pktmbuf_read); /* tmp1 is handed to emit_call; off was already copied to x1 */
+ emit_return_zero_if_src_zero(ctx, 1, r0); /* NULL from the helper: return 0 */
+}
+
+/*
+ * Helper for emit_ld_mbuf(): common tail, reached from both the fast and slow paths.
+ * Load the value pointed to by R0 and convert from network byte order.
+ */
+static void
+emit_ldmb_fin(struct a64_jit_ctx *ctx, uint8_t opsz, uint32_t sz)
+{
+ uint8_t r0 = ebpf_to_a64_reg(ctx, EBPF_REG_0); /* pointer in, loaded value out */
+
+ emit_ldr(ctx, opsz, r0, r0, A64_ZR); /* r0 = *(opsz *)r0, zero register as index */
+ if (opsz != BPF_B) /* a single byte needs no swap */
+ emit_be(ctx, r0, sz * 8); /* width passed as sz * 8 */
+}
+
+/*
+ * Emit code for BPF_LD | BPF_ABS and BPF_LD | BPF_IND packet loads:
+ *
+ * off = imm (+ src for BPF_IND)
+ * if (off >= 0 && mbuf->data_len - off >= sz) -- fast path
+ * ptr = mbuf->buf_addr + mbuf->data_off + off;
+ * else -- slow path
+ * if ((uint64_t)off > UINT32_MAX)
+ * return 0;
+ * ptr = __rte_pktmbuf_read(mbuf, off, sz, buf);
+ * if (ptr == NULL)
+ * return 0;
+ * R0 = ntoh(*(size *)ptr); -- common tail
+ *
+ * The three blocks are sized in a dry run so the forward branches can be
+ * resolved, then emitted for real (arm64 instructions are fixed width, so
+ * the dry run reproduces the real instruction count exactly).
+ */
+static void
+emit_ld_mbuf(struct a64_jit_ctx *ctx, uint8_t op, uint8_t src, int32_t imm,
+ uint32_t stack_ofs)
+{
+ uint8_t mode = BPF_MODE(op); /* BPF_ABS or BPF_IND */
+ uint8_t opsz = BPF_SIZE(op); /* BPF_B, BPF_H or BPF_W */
+ uint32_t sz = bpf_size(opsz); /* access size derived from opsz */
+ uint32_t ofs[LDMB_OFS_NUM]; /* start index of each block */
+
+ /* seed offsets so the dry-run branches stay in range */
+ ofs[LDMB_FAST_OFS] = ofs[LDMB_SLOW_OFS] = ofs[LDMB_FIN_OFS] = ctx->idx;
+
+ /* dry run to record block offsets */
+ emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+ ofs[LDMB_SLOW_OFS] = ctx->idx; /* slow path starts right after the fast path */
+ emit_ldmb_slow_path(ctx, sz, stack_ofs);
+ ofs[LDMB_FIN_OFS] = ctx->idx; /* common tail starts right after the slow path */
+ emit_ldmb_fin(ctx, opsz, sz);
+
+ /* rewind and emit for real with resolved offsets */
+ ctx->idx = ofs[LDMB_FAST_OFS]; /* back to where the dry run started */
+ emit_ldmb_fast_path(ctx, src, mode, sz, imm, ofs);
+ emit_ldmb_slow_path(ctx, sz, stack_ofs);
+ emit_ldmb_fin(ctx, opsz, sz);
+}
+
static void
check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
{
@@ -1145,8 +1294,17 @@ check_program_has_call(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
op = ins->code;
switch (op) {
- /* Call imm */
+ /*
+ * BPF_ABS/BPF_IND can fall through to __rte_pktmbuf_read(),
+ * so they need the call-clobbered register layout as well.
+ */
case (BPF_JMP | EBPF_CALL):
+ case (BPF_LD | BPF_ABS | BPF_B):
+ case (BPF_LD | BPF_ABS | BPF_H):
+ case (BPF_LD | BPF_ABS | BPF_W):
+ case (BPF_LD | BPF_IND | BPF_B):
+ case (BPF_LD | BPF_IND | BPF_H):
+ case (BPF_LD | BPF_IND | BPF_W):
ctx->foundcall = 1;
return;
}
@@ -1348,6 +1506,15 @@ emit(struct a64_jit_ctx *ctx, struct rte_bpf *bpf)
emit_mov_imm(ctx, 1, dst, u64);
i++;
break;
+ /* R0 = ntoh(*(size *)(mbuf data + (src) + imm)) */
+ case (BPF_LD | BPF_ABS | BPF_B):
+ case (BPF_LD | BPF_ABS | BPF_H):
+ case (BPF_LD | BPF_ABS | BPF_W):
+ case (BPF_LD | BPF_IND | BPF_B):
+ case (BPF_LD | BPF_IND | BPF_H):
+ case (BPF_LD | BPF_IND | BPF_W):
+ emit_ld_mbuf(ctx, op, src, imm, bpf->stack_sz);
+ break;
/* *(size *)(dst + off) = src */
case (BPF_STX | BPF_MEM | BPF_B):
case (BPF_STX | BPF_MEM | BPF_H):
--
2.53.0
next prev parent reply other threads:[~2026-06-25 17:33 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-08 20:28 [PATCH 0/4] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-08 20:28 ` [PATCH 1/4] bpf/arm64: fix zero-return branch in multi-exit programs Stephen Hemminger
2026-06-17 18:03 ` Marat Khalili
2026-06-08 20:28 ` [PATCH 2/4] test: bpf check that JIT was generated Stephen Hemminger
2026-06-17 18:09 ` Marat Khalili
2026-06-08 20:28 ` [PATCH 3/4] test: bpf check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-17 18:14 ` Marat Khalili
2026-06-08 20:28 ` [PATCH 4/4] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-17 19:35 ` Marat Khalili
2026-06-17 17:37 ` [PATCH 0/4] " Marat Khalili
2026-06-17 21:17 ` Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 0/6] bpf: JIT related bug fixes Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 1/6] bpf/x86: fix JIT encoding of BPF_JSET with immediate Stephen Hemminger
2026-06-19 2:09 ` Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 2/6] test/bpf: add JSET test with small immediate Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 3/6] bpf/arm64: fix offset type to allow a negative jump Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 4/6] test/bpf: check that JIT was generated Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 5/6] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-18 20:47 ` [PATCH v2 6/6] test/bpf: check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-21 16:23 ` [PATCH v3 0/6] bpf: JIT related bug fixes Stephen Hemminger
2026-06-21 16:23 ` [PATCH v3 1/6] bpf/x86: fix JIT encoding of BPF_JSET with immediate Stephen Hemminger
2026-06-23 10:11 ` Marat Khalili
2026-06-21 16:23 ` [PATCH v3 2/6] test/bpf: add JSET test with small immediate Stephen Hemminger
2026-06-23 10:16 ` Marat Khalili
2026-06-21 16:23 ` [PATCH v3 3/6] bpf/arm64: fix offset type to allow a negative jump Stephen Hemminger
2026-06-22 16:26 ` Marat Khalili
2026-06-21 16:23 ` [PATCH v3 4/6] test/bpf: check that JIT was generated Stephen Hemminger
2026-06-21 16:23 ` [PATCH v3 5/6] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-21 16:23 ` [PATCH v3 6/6] test/bpf: check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-23 13:57 ` Marat Khalili
2026-06-23 15:51 ` Stephen Hemminger
2026-06-23 20:58 ` Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 0/7] bpf: JIT related bug fixes Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 1/7] bpf/x86: fix JIT encoding of fixed-width immediates Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 2/7] test/bpf: add JSET test with small immediate Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 3/7] test/bpf: add test for large shift Stephen Hemminger
2026-06-24 7:59 ` Marat Khalili
2026-06-24 13:44 ` Marat Khalili
2026-06-23 23:23 ` [PATCH v4 4/7] bpf/arm64: fix offset type to allow a negative jump Stephen Hemminger
2026-06-24 8:43 ` Marat Khalili
2026-06-23 23:23 ` [PATCH v4 5/7] test/bpf: check that JIT was generated Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 6/7] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-23 23:23 ` [PATCH v4 7/7] test/bpf: check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-24 8:39 ` Marat Khalili
2026-06-24 17:54 ` [PATCH v5 0/9] bpf: JIT related bug fixes Stephen Hemminger
2026-06-24 17:55 ` [PATCH v5 1/9] bpf/x86: fix JIT encoding of fixed-width immediates Stephen Hemminger
2026-06-24 17:55 ` [PATCH v5 2/9] test/bpf: add JSET test with small immediate Stephen Hemminger
2026-06-24 17:55 ` [PATCH v5 3/9] bpf: mask shift count in interpreter per RFC 9669 Stephen Hemminger
2026-06-25 15:35 ` Marat Khalili
2026-06-24 17:55 ` [PATCH v5 4/9] bpf/arm64: mask shift count " Stephen Hemminger
2026-06-25 15:40 ` Marat Khalili
2026-06-24 17:55 ` [PATCH v5 5/9] test/bpf: add test for large shift Stephen Hemminger
2026-06-25 15:38 ` Marat Khalili
2026-06-24 17:55 ` [PATCH v5 6/9] bpf/arm64: fix offset type to allow a negative jump Stephen Hemminger
2026-06-24 17:55 ` [PATCH v5 7/9] bpf/arm64: add BPF_ABS/BPF_IND packet load support Stephen Hemminger
2026-06-25 13:59 ` Marat Khalili
2026-06-24 17:55 ` [PATCH v5 8/9] test/bpf: check that JIT was generated Stephen Hemminger
2026-06-24 17:55 ` [PATCH v5 9/9] test/bpf: check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 0/9] bpf: JIT related bug fixes Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 1/9] bpf/x86: fix JIT encoding of fixed-width immediates Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 2/9] test/bpf: add JSET test with small immediate Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 3/9] bpf: mask shift count in interpreter per RFC 9669 Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 4/9] bpf/arm64: mask shift count " Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 5/9] test/bpf: add test for large shift Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 6/9] bpf/arm64: fix offset type to allow a negative jump Stephen Hemminger
2026-06-25 17:30 ` Stephen Hemminger [this message]
2026-06-26 10:34 ` [PATCH v6 7/9] bpf/arm64: add BPF_ABS/BPF_IND packet load support Marat Khalili
2026-06-25 17:30 ` [PATCH v6 8/9] test/bpf: check that JIT was generated Stephen Hemminger
2026-06-25 17:30 ` [PATCH v6 9/9] test/bpf: check that bpf_convert can be JIT'd Stephen Hemminger
2026-06-25 23:12 ` Stephen Hemminger
2026-06-26 10:35 ` [PATCH v6 0/9] bpf: JIT related bug fixes Marat Khalili
2026-06-26 15:01 ` Stephen Hemminger
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260625173231.216074-8-stephen@networkplumber.org \
--to=stephen@networkplumber.org \
--cc=dev@dpdk.org \
--cc=konstantin.ananyev@huawei.com \
--cc=marat.khalili@huawei.com \
--cc=wathsala.vithanage@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.